Diffstat (limited to 'lib/model.rb')
-rw-r--r--  lib/model.rb  123
1 file changed, 80 insertions(+), 43 deletions(-)
diff --git a/lib/model.rb b/lib/model.rb
index dce53a9..d7b2df6 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -32,20 +32,20 @@ module OpenTox
# @param [OpenTox::Feature, nil] prediction_feature
# By default the first feature of the training dataset will be predicted, specify a prediction_feature if you want to predict another feature
# @param [Hash, nil] algorithms
- # Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and threshold), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys.
+ # Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and thresholds for predictions with high and low confidence), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys.
#
# @return [OpenTox::Model::Lazar]
def self.create prediction_feature:nil, training_dataset:, algorithms:{}
- bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset
- prediction_feature = training_dataset.features.first unless prediction_feature
- # TODO: prediction_feature without training_dataset: use all available data
+ raise ArgumentError, "Please provide a training_dataset and a optional prediction_feature." unless prediction_feature or training_dataset
+ prediction_feature ||= training_dataset.features.select{|f| f.is_a? NumericBioActivity or f.is_a? NominalBioActivity}.first unless prediction_feature
# guess model type
- prediction_feature.numeric? ? model = LazarRegression.new : model = LazarClassification.new
+ model = prediction_feature.is_a?(NumericBioActivity) ? LazarRegression.new : LazarClassification.new
model.prediction_feature_id = prediction_feature.id
model.training_dataset_id = training_dataset.id
- model.name = "#{prediction_feature.name} (#{training_dataset.name})"
+ model.name = training_dataset.name
+
# git or gem versioning
dir = File.dirname(__FILE__)
path = File.expand_path("../", File.expand_path(dir))
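For orientation, a minimal usage sketch of this constructor, assuming a training CSV that Dataset.from_csv_file can read and a query compound built with Compound.from_smiles (the file name and SMILES below are made up):

    training_dataset = OpenTox::Dataset.from_csv_file "hamster_carcinogenicity.csv" # hypothetical training file
    model = OpenTox::Model::Lazar.create training_dataset: training_dataset         # default descriptors, similarity and prediction algorithms
    prediction = model.predict OpenTox::Compound.from_smiles("c1ccccc1NN")           # hypothetical query structure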
@@ -62,7 +62,7 @@ module OpenTox
# set defaults
substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
- bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1
+ raise ArgumentError, "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1
if substance_classes.first == "OpenTox::Compound"
@@ -80,7 +80,7 @@ module OpenTox
}
model.algorithms[:similarity] = {
:method => "Algorithm::Similarity.tanimoto",
- :min => 0.1,
+ :min => [0.5,0.2],
}
elsif model.class == LazarRegression
model.algorithms[:prediction] = {
@@ -88,7 +88,7 @@ module OpenTox
}
model.algorithms[:similarity] = {
:method => "Algorithm::Similarity.tanimoto",
- :min => 0.5,
+ :min => [0.5,0.2],
}
end
@@ -100,7 +100,7 @@ module OpenTox
},
:similarity => {
:method => "Algorithm::Similarity.weighted_cosine",
- :min => 0.5,
+ :min => [0.5,0.2],
},
:prediction => {
:method => "Algorithm::Caret.rf",
@@ -110,7 +110,7 @@ module OpenTox
},
}
else
- bad_request_error "Cannot create models for #{substance_classes.first}."
+ raise ArgumentError, "Cannot create models for #{substance_classes.first}."
end
# overwrite defaults with explicit parameters
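The :min entries above are two-element arrays: the first value is the similarity threshold for the initial high-confidence pass, the second is the fallback threshold used when the first pass fails (see predict_substance below). A sketch of overriding these defaults through the algorithms parameter, assuming the create call from above (the numeric values are illustrative, not recommendations):

    model = OpenTox::Model::Lazar.create(
      training_dataset: training_dataset,
      algorithms: {
        :similarity => {
          :method => "Algorithm::Similarity.tanimoto",
          :min => [0.6, 0.3] # [high-confidence threshold, low-confidence fallback], illustrative values
        }
      }
    )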
@@ -175,7 +175,7 @@ module OpenTox
model.descriptor_ids = feature_ids & property_ids
model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
else
- bad_request_error "Descriptor method '#{descriptor_method}' not implemented."
+ raise ArgumentError, "Descriptor method '#{descriptor_method}' not implemented."
end
if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method]
@@ -197,7 +197,7 @@ module OpenTox
# Predict a substance (compound or nanoparticle)
# @param [OpenTox::Substance]
# @return [Hash]
- def predict_substance substance, threshold = self.algorithms[:similarity][:min]
+ def predict_substance substance, threshold = self.algorithms[:similarity][:min].first, prediction = nil
@independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
case algorithms[:similarity][:method]
@@ -224,11 +224,11 @@ module OpenTox
end
end
else
- bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
+ raise ArgumentError, "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
end
- prediction = {:warnings => [], :measurements => []}
- prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min]}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min]
+ prediction ||= {:warnings => [], :measurements => []}
+ prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min].first}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min].first
neighbor_ids = []
neighbor_similarities = []
neighbor_dependent_variables = []
@@ -238,7 +238,7 @@ module OpenTox
substance_ids.each_with_index do |s,i|
# handle query substance
if substance.id.to_s == s
- prediction[:measurements] << dependent_variables[i]
+ prediction[:measurements] << dependent_variables[i] unless threshold < algorithms[:similarity][:min].first # add measurements only once at first pass
prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
else
if fingerprints?
@@ -264,25 +264,37 @@ module OpenTox
if neighbor_similarities.empty?
prediction[:value] = nil
- prediction[:warnings] << "Could not find similar substances with experimental data in the training dataset."
+ prediction[:warnings] << "Could not find similar substances for threshold #{threshold} with experimental data in the training dataset."
+ if threshold == algorithms[:similarity][:min].last
+ prediction[:confidence] = "Out of applicability domain: Could not find similar substances with experimental data in the training dataset (Threshold: #{algorithms[:similarity][:min].last})."
+ return prediction
+ end
elsif neighbor_similarities.size == 1
prediction[:value] = nil
- prediction[:warnings] << "Cannot create prediction: Only one similar compound in the training set."
- prediction[:neighbors] = [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]
+ prediction[:warnings] << "Cannot create prediction: Only one similar compound for threshold #{threshold} in the training set (Threshold: #{algorithms[:similarity][:min].last})."
+ prediction[:neighbors] = [{:id => neighbor_ids.first, :measurement => neighbor_dependent_variables[0], :similarity => neighbor_similarities.first}]
+ if threshold == algorithms[:similarity][:min].last
+ prediction[:confidence] = "Out of applicability domain: Only one similar compound in the training set."
+ return prediction
+ end
else
query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint"
# call prediction algorithm
result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors
prediction.merge! result
prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
- #if neighbor_similarities.max < algorithms[:similarity][:warn_min]
- #prediction[:warnings] << "Closest neighbor has similarity < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain."
- #end
end
- if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2
- prediction
- else # try again with a lower threshold
- predict_substance substance, 0.2
+ if threshold == algorithms[:similarity][:min].first
+ if prediction[:warnings].empty?
+ prediction[:confidence] = "Similar to bioassay results"
+ return prediction
+ else # try again with a lower threshold
+ prediction[:warnings] << "Lowering similarity threshold to #{algorithms[:similarity][:min].last}."
+ predict_substance substance, algorithms[:similarity][:min].last, prediction
+ end
+ elsif threshold < algorithms[:similarity][:min].first
+ prediction[:confidence] = "Lower than bioassay results"
+ return prediction
end
end
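Taken together, the two-pass logic above produces prediction hashes of roughly the following shape; the key names come from this diff, all concrete values are hypothetical:

    prediction = {
      :value => 0.83,                                # nil if no prediction could be made
      :confidence => "Similar to bioassay results",  # or "Lower than bioassay results" / "Out of applicability domain: ..."
      :warnings => [],                               # e.g. the "Lowering similarity threshold ..." message after a failed first pass
      :measurements => [],                           # experimental values of the query substance, if it is part of the training data
      :neighbors => [
        {:id => BSON::ObjectId.new, :measurement => 0.9, :similarity => 0.71} # hypothetical neighbor
      ]
    }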
@@ -302,13 +314,18 @@ module OpenTox
elsif object.is_a? Dataset
substances = object.substances
else
- bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter."
+ raise ArgumentError, "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter."
end
# make predictions
predictions = {}
substances.each do |c|
predictions[c.id.to_s] = predict_substance c
+ if prediction_feature.is_a? NominalBioActivity and predictions[c.id.to_s][:value]
+ prediction_feature.accept_values.each do |v|
+ predictions[c.id.to_s][:probabilities][v] ||= 0.0 # default missing probabilities to 0.0 (happens when all neighbors have the same activity)
+ end
+ end
predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id
end
@@ -320,17 +337,35 @@ module OpenTox
elsif object.is_a? Array
return predictions
elsif object.is_a? Dataset
- # prepare prediction dataset
- measurement_feature = Feature.find prediction_feature_id
-
- prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
- prediction_dataset = LazarPrediction.create(
- :name => "Lazar prediction for #{prediction_feature.name}",
- :creator => __FILE__,
- :prediction_feature_id => prediction_feature.id,
- :predictions => predictions
- )
- return prediction_dataset
+ d = object.copy
+ #warning_feature = Warnings.find_or_create_by(:dataset_id => d.id)
+ confidence_feature = Confidence.find_or_create_by(:dataset_id => d.id)
+ if prediction_feature.is_a? NominalBioActivity
+ f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id)
+ probability_features = {}
+ prediction_feature.accept_values.each do |v|
+ probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
+ end
+ elsif prediction_feature.is_a? NumericBioActivity
+ f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id)
+ prediction_interval = []
+ ["lower","upper"].each do |v|
+ prediction_interval << LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
+ end
+ end
+
+ # add predictions to dataset
+ predictions.each do |substance_id,p|
+ substance_id = BSON::ObjectId.from_string(substance_id)
+ d.add substance_id,confidence_feature,p[:confidence]
+ unless p[:value].nil?
+ d.add substance_id,f,p[:value]
+ p[:probabilities].each {|name,prob| d.add substance_id,probability_features[name],prob} if p[:probabilities]
+ p[:prediction_interval].each_with_index {|v,i| d.add substance_id, prediction_interval[i], v } if p[:prediction_interval]
+ end
+ end
+ d.save
+ return d
end
end
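A short sketch of the Dataset branch above, assuming a test dataset loaded the same way as the training data (the returned copy carries the confidence, prediction and probability/interval features added via Dataset#add; the file name is made up):

    test_dataset = OpenTox::Dataset.from_csv_file "test_compounds.csv" # hypothetical file
    prediction_dataset = model.predict test_dataset                    # copy of test_dataset with prediction, probability/interval and confidence features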
@@ -402,6 +437,7 @@ module OpenTox
field :species, type: String
field :source, type: String
field :unit, type: String
+ field :warnings, type: Array
field :model_id, type: BSON::ObjectId
field :repeated_crossvalidation_id, type: BSON::ObjectId
@@ -461,11 +497,11 @@ module OpenTox
end
# Create and validate a lazar model from a csv file with training data and a json file with metadata
- # @param [File] CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at https://github.com/opentox/lazar-public-data.
- # @return [OpenTox::Model::Validation] lazar model with three independent 10-fold crossvalidations
+ # @param [File] CSV file with two or three columns. The first column is optional and may contain an arbitrary substance ID. The next column should contain either the SMILES or InChI of the training compounds, followed by their toxic activities (qualitative or quantitative) in the last column. Use -log10 transformed values for regression datasets. The header line should contain "ID" (optional), either "SMILES" or "InChI", and the endpoint name (last column). Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source", "qmrf" (optional) and "unit" (regression only). You can find example training data in the data folder of lazar.
+ # @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations
def self.from_csv_file file
metadata_file = file.sub(/csv$/,"json")
- bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
+ raise ArgumentError, "No metadata file #{metadata_file}" unless File.exist? metadata_file
model_validation = self.new JSON.parse(File.read(metadata_file))
training_dataset = Dataset.from_csv_file file
model = Lazar.create training_dataset: training_dataset
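To illustrate the format described above, a hypothetical classification training set and its metadata file (all names, structures and activities are made up):

    # blood_brain_barrier.csv (hypothetical):
    #   ID,SMILES,Blood-Brain-Barrier Penetration
    #   1,CC(=O)Oc1ccccc1C(=O)O,non-penetrating
    #   2,CN1CCC[C@H]1c1cccnc1,penetrating
    #
    # blood_brain_barrier.json (hypothetical; "unit" omitted for a classification endpoint):
    #   {"species": "Human", "endpoint": "Blood-Brain-Barrier Penetration", "source": "hypothetical reference"}
    model_validation = OpenTox::Model::Validation.from_csv_file "blood_brain_barrier.csv"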
@@ -477,6 +513,7 @@ module OpenTox
# Create and validate a nano-lazar model, import data from eNanoMapper if necessary
# nano-lazar methods are described in detail in https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf
# *eNanoMapper import is currently broken, because APIs and data formats are constantly changing and we do not have the resources to track these changes permanently!*
# @param [OpenTox::Dataset, nil] training_dataset
# @param [OpenTox::Feature, nil] prediction_feature
# @param [Hash, nil] algorithms
@@ -488,7 +525,7 @@ module OpenTox
unless training_dataset # try to import
Import::Enanomapper.import
training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
- bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset
+ raise ArgumentError, "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset
end
prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first