From 13e7865f386603fb784e62feef2ee2a56c015b45 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Tue, 14 Aug 2018 12:02:03 +0000 Subject: classification min_sim=0.5, 5 repeated cvs --- lib/model.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index dce53a9..0ed70f2 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -80,7 +80,7 @@ module OpenTox } model.algorithms[:similarity] = { :method => "Algorithm::Similarity.tanimoto", - :min => 0.1, + :min => 0.5, } elsif model.class == LazarRegression model.algorithms[:prediction] = { -- cgit v1.2.3 From bdc6b5b40437896384561d74a510560e9e592364 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Tue, 9 Oct 2018 18:20:27 +0200 Subject: tentative random forest classification: hangs unpredictably during caret model generation/optimization for some (inorganic?) compounds. --- lib/model.rb | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 0ed70f2..8901a2c 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -37,7 +37,7 @@ module OpenTox # @return [OpenTox::Model::Lazar] def self.create prediction_feature:nil, training_dataset:, algorithms:{} bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset - prediction_feature = training_dataset.features.first unless prediction_feature + prediction_feature = training_dataset.features.select{|f| f.measured}.first unless prediction_feature # TODO: prediction_feature without training_dataset: use all available data # guess model type @@ -199,6 +199,8 @@ module OpenTox # @return [Hash] def predict_substance substance, threshold = self.algorithms[:similarity][:min] + p substance.smiles + t = Time.now @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data case algorithms[:similarity][:method] when /tanimoto/ # binary features @@ -284,6 +286,9 @@ module OpenTox else # try again with a lower threshold predict_substance substance, 0.2 end + p prediction + p Time.now - t + prediction end # Predict a substance (compound or nanoparticle), an array of substances or a dataset -- cgit v1.2.3 From de763211bd2b6451e3a8dc20eb95a3ecf72bef17 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Thu, 11 Oct 2018 12:13:40 +0200 Subject: initial dataset batch prediction --- lib/model.rb | 50 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 17 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 8901a2c..7ee50fe 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -36,12 +36,12 @@ module OpenTox # # @return [OpenTox::Model::Lazar] def self.create prediction_feature:nil, training_dataset:, algorithms:{} - bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset - prediction_feature = training_dataset.features.select{|f| f.measured}.first unless prediction_feature + bad_request_error "Please provide a training_dataset and a optional prediction_feature." unless prediction_feature or training_dataset + prediction_feature ||= training_dataset.features.select{|f| f.is_a? NumericBioActivity or f.is_a? NominalBioActivity}.first unless prediction_feature # TODO: prediction_feature without training_dataset: use all available data # guess model type - prediction_feature.numeric? ? model = LazarRegression.new : model = LazarClassification.new + prediction_feature.is_a?(NumericBioActivity) ? model = LazarRegression.new : model = LazarClassification.new model.prediction_feature_id = prediction_feature.id model.training_dataset_id = training_dataset.id @@ -199,7 +199,7 @@ module OpenTox # @return [Hash] def predict_substance substance, threshold = self.algorithms[:similarity][:min] - p substance.smiles + #p substance.smiles t = Time.now @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data case algorithms[:similarity][:method] @@ -286,8 +286,8 @@ module OpenTox else # try again with a lower threshold predict_substance substance, 0.2 end - p prediction - p Time.now - t + #p prediction + #p Time.now - t prediction end @@ -314,6 +314,11 @@ module OpenTox predictions = {} substances.each do |c| predictions[c.id.to_s] = predict_substance c + if prediction_feature.is_a? NominalBioActivity and predictions[c.id.to_s][:value] + prediction_feature.accept_values.each do |v| + predictions[c.id.to_s][:probabilities][v] ||= 0.0 # use 0 instead of empty probabilities (happens if all neighbors have the same activity) + end + end predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id end @@ -325,17 +330,28 @@ module OpenTox elsif object.is_a? Array return predictions elsif object.is_a? Dataset - # prepare prediction dataset - measurement_feature = Feature.find prediction_feature_id - - prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" ) - prediction_dataset = LazarPrediction.create( - :name => "Lazar prediction for #{prediction_feature.name}", - :creator => __FILE__, - :prediction_feature_id => prediction_feature.id, - :predictions => predictions - ) - return prediction_dataset + if prediction_feature.is_a? NominalBioActivity + f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id) + probability_features = {} + prediction_feature.accept_values.each do |v| + probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => "probability(#{v})", :accept_values => prediction_feature.accept_values, :value => v, :model_id => self.id, :training_feature_id => prediction_feature.id) + end + elsif prediction_feature.is_a? NumericBioActivity + f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id) + # TODO prediction interval + end + + d = Dataset.new(:name => object.name) + # add predictions to dataset + predictions.each do |substance_id,p| + d.warnings += p[:warnings] + unless p[:value].nil? + d.add substance_id,f,p[:value] + p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} + end + end + d.save + return d end end -- cgit v1.2.3 From 1652fd5df948da7ace622c73d158010add656b9f Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Wed, 24 Oct 2018 18:21:34 +0200 Subject: dataset map --- lib/model.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 7ee50fe..9858949 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -199,7 +199,6 @@ module OpenTox # @return [Hash] def predict_substance substance, threshold = self.algorithms[:similarity][:min] - #p substance.smiles t = Time.now @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data case algorithms[:similarity][:method] @@ -286,7 +285,6 @@ module OpenTox else # try again with a lower threshold predict_substance substance, 0.2 end - #p prediction #p Time.now - t prediction end @@ -330,11 +328,12 @@ module OpenTox elsif object.is_a? Array return predictions elsif object.is_a? Dataset + warning_feature = InfoFeature.find_or_create_by(:name => "Warnings") if prediction_feature.is_a? NominalBioActivity f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id) probability_features = {} prediction_feature.accept_values.each do |v| - probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => "probability(#{v})", :accept_values => prediction_feature.accept_values, :value => v, :model_id => self.id, :training_feature_id => prediction_feature.id) + probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id) end elsif prediction_feature.is_a? NumericBioActivity f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id) @@ -344,10 +343,11 @@ module OpenTox d = Dataset.new(:name => object.name) # add predictions to dataset predictions.each do |substance_id,p| - d.warnings += p[:warnings] + d.add substance_id,warning_feature,p[:warnings].join(" ") if p[:warnings] unless p[:value].nil? d.add substance_id,f,p[:value] p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} + # TODO prediction interval end end d.save -- cgit v1.2.3 From 5e9a08c0b534fa96179fb5c81a9b4193e7b0aad8 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Mon, 29 Oct 2018 17:58:09 +0100 Subject: dataset folds fixed --- lib/model.rb | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 9858949..fc98e09 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -277,7 +277,7 @@ module OpenTox prediction.merge! result prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}} #if neighbor_similarities.max < algorithms[:similarity][:warn_min] - #prediction[:warnings] << "Closest neighbor has similarity < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain." + #prediction[:warnings] << "Closest neighbor has similarity #{neighbor_similarities.max} < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain." #end end if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2 @@ -328,7 +328,8 @@ module OpenTox elsif object.is_a? Array return predictions elsif object.is_a? Dataset - warning_feature = InfoFeature.find_or_create_by(:name => "Warnings") + d = object.copy + warning_feature = Warnings.find_or_create_by(:dataset_id => d.id) if prediction_feature.is_a? NominalBioActivity f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id) probability_features = {} @@ -337,17 +338,19 @@ module OpenTox end elsif prediction_feature.is_a? NumericBioActivity f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id) - # TODO prediction interval + prediction_interval = {} + ["lower","upper"].each do |v| + prediction_interval[v] = LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id) + end end - d = Dataset.new(:name => object.name) # add predictions to dataset predictions.each do |substance_id,p| d.add substance_id,warning_feature,p[:warnings].join(" ") if p[:warnings] unless p[:value].nil? d.add substance_id,f,p[:value] - p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} - # TODO prediction interval + p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} if p[:probabilities] + p[:prediction_interval].each_with_index {|v,i| d.add substance_id, prediction_interval[i], v } if p[:prediction_interval] end end d.save -- cgit v1.2.3 From d9c9d78e49d886ea91386adbbd2b523347df226e Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Mon, 29 Oct 2018 20:34:39 +0100 Subject: dataset predictions fixed --- lib/model.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index fc98e09..7eaa469 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -346,7 +346,8 @@ module OpenTox # add predictions to dataset predictions.each do |substance_id,p| - d.add substance_id,warning_feature,p[:warnings].join(" ") if p[:warnings] + substance_id = BSON::ObjectId.from_string(substance_id) + d.add substance_id,warning_feature,p[:warnings].join(" ") unless p[:warnings].empty? unless p[:value].nil? d.add substance_id,f,p[:value] p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} if p[:probabilities] -- cgit v1.2.3 From d61f78093f4ddf03c27a2c8ae0bab9c1f10c80f5 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Tue, 30 Oct 2018 17:26:59 +0100 Subject: tests fixed --- lib/model.rb | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 7eaa469..6d5cf7b 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -46,6 +46,7 @@ module OpenTox model.prediction_feature_id = prediction_feature.id model.training_dataset_id = training_dataset.id model.name = "#{prediction_feature.name} (#{training_dataset.name})" + # git or gem versioning dir = File.dirname(__FILE__) path = File.expand_path("../", File.expand_path(dir)) @@ -485,6 +486,8 @@ module OpenTox model.is_a? LazarClassification end + # TODO from_pubchem_aid + # Create and validate a lazar model from a csv file with training data and a json file with metadata # @param [File] CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at https://github.com/opentox/lazar-public-data. # @return [OpenTox::Model::Validation] lazar model with three independent 10-fold crossvalidations @@ -533,6 +536,14 @@ module OpenTox end + # TODO + def to_json + "{\n metadata:#{super},\n model:#{model.to_json}, repeated_crossvalidations:#{repeated_crossvalidations.to_json}\n}" + end + + def from_json_file + end + end end -- cgit v1.2.3 From 2d4ce39cb1b489e26b0d6d96026054566a4f77b9 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Tue, 30 Oct 2018 21:11:04 +0100 Subject: dataset merge --- lib/model.rb | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 6d5cf7b..f50fcd7 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -487,6 +487,16 @@ module OpenTox end # TODO from_pubchem_aid + def self.from_dataset training_dataset: , prediction_feature: , species: , endpoint: + model_validation = Model::Validation.create species: species, endpoint: endpoint + p "create model" + model = Lazar.create training_dataset: training_dataset, prediction_feature: prediction_feature + model_validation[:model_id] = model.id + p "create_crossvalidations" + model_validation[:repeated_crossvalidation_id] = OpenTox::Validation::RepeatedCrossValidation.create(model).id # full class name required + model_validation.save + model_validation + end # Create and validate a lazar model from a csv file with training data and a json file with metadata # @param [File] CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at https://github.com/opentox/lazar-public-data. -- cgit v1.2.3 From 3a9c9332b660d35720ad4fa1f55ee0883e53aecd Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Fri, 2 Nov 2018 20:34:44 +0100 Subject: warnings fixed, cleanup --- lib/model.rb | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index f50fcd7..a0c60f0 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -38,7 +38,6 @@ module OpenTox def self.create prediction_feature:nil, training_dataset:, algorithms:{} bad_request_error "Please provide a training_dataset and a optional prediction_feature." unless prediction_feature or training_dataset prediction_feature ||= training_dataset.features.select{|f| f.is_a? NumericBioActivity or f.is_a? NominalBioActivity}.first unless prediction_feature - # TODO: prediction_feature without training_dataset: use all available data # guess model type prediction_feature.is_a?(NumericBioActivity) ? model = LazarRegression.new : model = LazarClassification.new @@ -198,9 +197,8 @@ module OpenTox # Predict a substance (compound or nanoparticle) # @param [OpenTox::Substance] # @return [Hash] - def predict_substance substance, threshold = self.algorithms[:similarity][:min] + def predict_substance substance, threshold = self.algorithms[:similarity][:min], prediction = nil - t = Time.now @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data case algorithms[:similarity][:method] when /tanimoto/ # binary features @@ -229,7 +227,7 @@ module OpenTox bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'." end - prediction = {:warnings => [], :measurements => []} + prediction ||= {:warnings => [], :measurements => []} prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min]}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min] neighbor_ids = [] neighbor_similarities = [] @@ -240,7 +238,7 @@ module OpenTox substance_ids.each_with_index do |s,i| # handle query substance if substance.id.to_s == s - prediction[:measurements] << dependent_variables[i] + prediction[:measurements] << dependent_variables[i] unless threshold < algorithms[:similarity][:min] # add measurements only once at first pass prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance." else if fingerprints? @@ -277,17 +275,13 @@ module OpenTox result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors prediction.merge! result prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}} - #if neighbor_similarities.max < algorithms[:similarity][:warn_min] - #prediction[:warnings] << "Closest neighbor has similarity #{neighbor_similarities.max} < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain." - #end end if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2 prediction else # try again with a lower threshold - predict_substance substance, 0.2 + prediction[:warnings] << "Lowering similarity threshold to 0.2." + predict_substance substance, 0.2, prediction end - #p Time.now - t - prediction end # Predict a substance (compound or nanoparticle), an array of substances or a dataset -- cgit v1.2.3 From cf80ed17102a0368df8d65037d113b521cdf6f0c Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Tue, 6 Nov 2018 19:01:58 +0100 Subject: sdf export fixed --- lib/model.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index a0c60f0..966460b 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -481,13 +481,13 @@ module OpenTox end # TODO from_pubchem_aid - def self.from_dataset training_dataset: , prediction_feature: , species: , endpoint: + def self.from_dataset training_dataset: , prediction_feature: , species: , endpoint: , folds: 10, repeats: 5 model_validation = Model::Validation.create species: species, endpoint: endpoint - p "create model" + #p "create model" model = Lazar.create training_dataset: training_dataset, prediction_feature: prediction_feature model_validation[:model_id] = model.id - p "create_crossvalidations" - model_validation[:repeated_crossvalidation_id] = OpenTox::Validation::RepeatedCrossValidation.create(model).id # full class name required + #p "create_crossvalidations" + model_validation[:repeated_crossvalidation_id] = OpenTox::Validation::RepeatedCrossValidation.create(model,folds,repeats).id # full class name required model_validation.save model_validation end -- cgit v1.2.3 From 6e23be652ad90c747aaccf15258bdaa4458185a4 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Tue, 13 Nov 2018 14:32:09 +0100 Subject: public dataset download --- lib/model.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 966460b..70ae43c 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -422,6 +422,7 @@ module OpenTox field :species, type: String field :source, type: String field :unit, type: String + field :warnings, type: Array field :model_id, type: BSON::ObjectId field :repeated_crossvalidation_id, type: BSON::ObjectId @@ -494,7 +495,7 @@ module OpenTox # Create and validate a lazar model from a csv file with training data and a json file with metadata # @param [File] CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at https://github.com/opentox/lazar-public-data. - # @return [OpenTox::Model::Validation] lazar model with three independent 10-fold crossvalidations + # @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations def self.from_csv_file file metadata_file = file.sub(/csv$/,"json") bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file -- cgit v1.2.3 From ae78e8216909ebfa708b8da3c55248a68abc291c Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Wed, 14 Nov 2018 13:35:17 +0100 Subject: public model validation, updated documentation --- lib/model.rb | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 70ae43c..db69120 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -44,7 +44,7 @@ module OpenTox model.prediction_feature_id = prediction_feature.id model.training_dataset_id = training_dataset.id - model.name = "#{prediction_feature.name} (#{training_dataset.name})" + model.name = training_dataset.name # git or gem versioning dir = File.dirname(__FILE__) @@ -481,20 +481,8 @@ module OpenTox model.is_a? LazarClassification end - # TODO from_pubchem_aid - def self.from_dataset training_dataset: , prediction_feature: , species: , endpoint: , folds: 10, repeats: 5 - model_validation = Model::Validation.create species: species, endpoint: endpoint - #p "create model" - model = Lazar.create training_dataset: training_dataset, prediction_feature: prediction_feature - model_validation[:model_id] = model.id - #p "create_crossvalidations" - model_validation[:repeated_crossvalidation_id] = OpenTox::Validation::RepeatedCrossValidation.create(model,folds,repeats).id # full class name required - model_validation.save - model_validation - end - # Create and validate a lazar model from a csv file with training data and a json file with metadata - # @param [File] CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at https://github.com/opentox/lazar-public-data. + # @param [File] CSV file with two or three columns. The first column is optional and may contain an arbitrary substance ID. The next column should contain either SMILES or InChIs of the training compounds, followed by toxic activities (qualitative or quantitative) in the last column. Use -log10 transformed values for regression datasets. The first line should contain "ID" (optional), either SMILES or InChI and the endpoint name (last column). Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source", "qmrf" (optional) and "unit" (regression only). You can find example training data in the data folder of lazar. # @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations def self.from_csv_file file metadata_file = file.sub(/csv$/,"json") @@ -510,6 +498,7 @@ module OpenTox # Create and validate a nano-lazar model, import data from eNanoMapper if necessary # nano-lazar methods are described in detail in https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf + # *eNanoMapper import is currently broken, because APIs and data formats are constantly changing and we have no resources to track this changes permanently!* # @param [OpenTox::Dataset, nil] training_dataset # @param [OpenTox::Feature, nil] prediction_feature # @param [Hash, nil] algorithms @@ -541,14 +530,6 @@ module OpenTox end - # TODO - def to_json - "{\n metadata:#{super},\n model:#{model.to_json}, repeated_crossvalidations:#{repeated_crossvalidations.to_json}\n}" - end - - def from_json_file - end - end end -- cgit v1.2.3 From 7e547fd4a296f497615a7805d565b378cb1bd7cd Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Wed, 14 Nov 2018 17:33:44 +0100 Subject: bad_request_error substituted with ArgumentError --- lib/model.rb | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index db69120..caf8a6e 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -36,7 +36,7 @@ module OpenTox # # @return [OpenTox::Model::Lazar] def self.create prediction_feature:nil, training_dataset:, algorithms:{} - bad_request_error "Please provide a training_dataset and a optional prediction_feature." unless prediction_feature or training_dataset + raise ArgumentError, "Please provide a training_dataset and a optional prediction_feature." unless prediction_feature or training_dataset prediction_feature ||= training_dataset.features.select{|f| f.is_a? NumericBioActivity or f.is_a? NominalBioActivity}.first unless prediction_feature # guess model type @@ -62,7 +62,7 @@ module OpenTox # set defaults# substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq - bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1 + raise ArgumentError, "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1 if substance_classes.first == "OpenTox::Compound" @@ -110,7 +110,7 @@ module OpenTox }, } else - bad_request_error "Cannot create models for #{substance_classes.first}." + raise ArgumentError, "Cannot create models for #{substance_classes.first}." end # overwrite defaults with explicit parameters @@ -175,7 +175,7 @@ module OpenTox model.descriptor_ids = feature_ids & property_ids model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} else - bad_request_error "Descriptor method '#{descriptor_method}' not implemented." + raise ArgumentError, "Descriptor method '#{descriptor_method}' not implemented." end if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method] @@ -224,7 +224,7 @@ module OpenTox end end else - bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'." + raise ArgumentError, "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'." end prediction ||= {:warnings => [], :measurements => []} @@ -300,7 +300,7 @@ module OpenTox elsif object.is_a? Dataset substances = object.substances else - bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter." + raise ArgumentError, "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter." end # make predictions @@ -486,7 +486,7 @@ module OpenTox # @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations def self.from_csv_file file metadata_file = file.sub(/csv$/,"json") - bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file + raise ArgumentError, "No metadata file #{metadata_file}" unless File.exist? metadata_file model_validation = self.new JSON.parse(File.read(metadata_file)) training_dataset = Dataset.from_csv_file file model = Lazar.create training_dataset: training_dataset @@ -510,7 +510,7 @@ module OpenTox unless training_dataset # try to import Import::Enanomapper.import training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset + raise ArgumentError, "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset end prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first -- cgit v1.2.3 From 0882c2cd0de934d7377fc9d08c306be98612c88a Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Fri, 16 Nov 2018 18:42:42 +0100 Subject: real datasets for testing, test data cleanup, Daphnia import, upper and lower similarity thresholds --- lib/model.rb | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index caf8a6e..08ca07e 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -32,7 +32,7 @@ module OpenTox # @param [OpenTox::Feature, nil] prediction_feature # By default the first feature of the training dataset will be predicted, specify a prediction_feature if you want to predict another feature # @param [Hash, nil] algorithms - # Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and threshold), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys. + # Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and thresholds for predictions with high and low confidence), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys. # # @return [OpenTox::Model::Lazar] def self.create prediction_feature:nil, training_dataset:, algorithms:{} @@ -80,7 +80,7 @@ module OpenTox } model.algorithms[:similarity] = { :method => "Algorithm::Similarity.tanimoto", - :min => 0.5, + :min => [0.5,0.2], } elsif model.class == LazarRegression model.algorithms[:prediction] = { @@ -88,7 +88,7 @@ module OpenTox } model.algorithms[:similarity] = { :method => "Algorithm::Similarity.tanimoto", - :min => 0.5, + :min => [0.5,0.2], } end @@ -100,7 +100,7 @@ module OpenTox }, :similarity => { :method => "Algorithm::Similarity.weighted_cosine", - :min => 0.5, + :min => [0.5,0.2], }, :prediction => { :method => "Algorithm::Caret.rf", @@ -197,7 +197,7 @@ module OpenTox # Predict a substance (compound or nanoparticle) # @param [OpenTox::Substance] # @return [Hash] - def predict_substance substance, threshold = self.algorithms[:similarity][:min], prediction = nil + def predict_substance substance, threshold = self.algorithms[:similarity][:min].first, prediction = nil @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data case algorithms[:similarity][:method] @@ -228,7 +228,7 @@ module OpenTox end prediction ||= {:warnings => [], :measurements => []} - prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min]}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min] + prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min].first}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min].first neighbor_ids = [] neighbor_similarities = [] neighbor_dependent_variables = [] @@ -238,7 +238,7 @@ module OpenTox substance_ids.each_with_index do |s,i| # handle query substance if substance.id.to_s == s - prediction[:measurements] << dependent_variables[i] unless threshold < algorithms[:similarity][:min] # add measurements only once at first pass + prediction[:measurements] << dependent_variables[i] unless threshold < algorithms[:similarity][:min].first # add measurements only once at first pass prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance." else if fingerprints? @@ -264,11 +264,19 @@ module OpenTox if neighbor_similarities.empty? prediction[:value] = nil - prediction[:warnings] << "Could not find similar substances with experimental data in the training dataset." + prediction[:warnings] << "Could not find similar substances for threshold #{threshold} with experimental data in the training dataset." + if threshold == algorithms[:similarity][:min].last + prediction[:confidence] = "Out of applicability domain: Could not find similar substances with experimental data in the training dataset (Threshold: #{algorithms[:similarity][:min].last})." + return prediction + end elsif neighbor_similarities.size == 1 prediction[:value] = nil - prediction[:warnings] << "Cannot create prediction: Only one similar compound in the training set." + prediction[:warnings] << "Cannot create prediction: Only one similar compound for threshold #{threshold} in the training set (Threshold: #{algorithms[:similarity][:min].last})." prediction[:neighbors] = [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}] + if threshold == algorithms[:similarity][:min].last + prediction[:confidence] = "Out of applicability domain: Only one similar compound in the training set." + return prediction + end else query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint" # call prediction algorithm @@ -276,11 +284,17 @@ module OpenTox prediction.merge! result prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}} end - if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2 - prediction - else # try again with a lower threshold - prediction[:warnings] << "Lowering similarity threshold to 0.2." - predict_substance substance, 0.2, prediction + if threshold == algorithms[:similarity][:min].first + if prediction[:warnings].empty? + prediction[:confidence] = "High (close to bioassay results)" + return prediction + else # try again with a lower threshold + prediction[:warnings] << "Lowering similarity threshold to #{algorithms[:similarity][:min].last}." + predict_substance substance, algorithms[:similarity][:min].last, prediction + end + elsif threshold < algorithms[:similarity][:min].first + prediction[:confidence] = "Low (lower than bioassay results)" + return prediction end end -- cgit v1.2.3 From 1b44e0cd76f2ead93b8b3fa0f970c85ef32a4b14 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Fri, 16 Nov 2018 22:45:17 +0100 Subject: confidence for prediction datasets --- lib/model.rb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 08ca07e..cbfefe3 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -338,7 +338,8 @@ module OpenTox return predictions elsif object.is_a? Dataset d = object.copy - warning_feature = Warnings.find_or_create_by(:dataset_id => d.id) + #warning_feature = Warnings.find_or_create_by(:dataset_id => d.id) + confidence_feature = Confidence.find_or_create_by(:dataset_id => d.id) if prediction_feature.is_a? NominalBioActivity f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id) probability_features = {} @@ -356,7 +357,7 @@ module OpenTox # add predictions to dataset predictions.each do |substance_id,p| substance_id = BSON::ObjectId.from_string(substance_id) - d.add substance_id,warning_feature,p[:warnings].join(" ") unless p[:warnings].empty? + d.add substance_id,confidence_feature,p[:confidence] unless p[:value].nil? d.add substance_id,f,p[:value] p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} if p[:probabilities] -- cgit v1.2.3 From 455da06aa6459da0d25b286ca6cb866ff64c4c34 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 20 Jun 2019 22:01:50 +0200 Subject: separate csv serialisations for batch predictions and training data, repeated measurements in mutagenicity dataset fixed, daphnia import fixed, CENTRAL_MONGO_IP removed --- lib/model.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index cbfefe3..05cd113 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -286,14 +286,14 @@ module OpenTox end if threshold == algorithms[:similarity][:min].first if prediction[:warnings].empty? - prediction[:confidence] = "High (close to bioassay results)" + prediction[:confidence] = "Similar to bioassay results" return prediction else # try again with a lower threshold prediction[:warnings] << "Lowering similarity threshold to #{algorithms[:similarity][:min].last}." predict_substance substance, algorithms[:similarity][:min].last, prediction end elsif threshold < algorithms[:similarity][:min].first - prediction[:confidence] = "Low (lower than bioassay results)" + prediction[:confidence] = "Lower than bioassay results" return prediction end end @@ -348,9 +348,9 @@ module OpenTox end elsif prediction_feature.is_a? NumericBioActivity f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id) - prediction_interval = {} + prediction_interval = [] ["lower","upper"].each do |v| - prediction_interval[v] = LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id) + prediction_interval << LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id) end end -- cgit v1.2.3 From bea6f89f2a54a0612a30ce90c56c4ddecc4f7ed1 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 21 Jun 2019 12:55:21 +0200 Subject: measurement added for single neighbors --- lib/model.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/model.rb') diff --git a/lib/model.rb b/lib/model.rb index 05cd113..d7b2df6 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -272,7 +272,7 @@ module OpenTox elsif neighbor_similarities.size == 1 prediction[:value] = nil prediction[:warnings] << "Cannot create prediction: Only one similar compound for threshold #{threshold} in the training set (Threshold: #{algorithms[:similarity][:min].last})." - prediction[:neighbors] = [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}] + prediction[:neighbors] = [{:id => neighbor_ids.first, :measurement => neighbor_dependent_variables[0], :similarity => neighbor_similarities.first}] if threshold == algorithms[:similarity][:min].last prediction[:confidence] = "Out of applicability domain: Only one similar compound in the training set." return prediction -- cgit v1.2.3