From 3e7d98147ef55e4e6e55a518a05c6eecee6f769b Mon Sep 17 00:00:00 2001 From: gebele Date: Thu, 19 Jan 2017 12:41:55 +0000 Subject: fixed feature test and unique_descriptors --- lib/unique_descriptors.rb | 2 +- test/feature.rb | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/unique_descriptors.rb b/lib/unique_descriptors.rb index 8341a67..1b19d6a 100644 --- a/lib/unique_descriptors.rb +++ b/lib/unique_descriptors.rb @@ -48,7 +48,7 @@ UNIQUEDESCRIPTORS = [ #"Cdk.HBondAcceptorCount", #Descriptor that calculates the number of hydrogen bond acceptors. #"Cdk.HBondDonorCount", #Descriptor that calculates the number of hydrogen bond donors. "Cdk.HybridizationRatio", #Characterizes molecular complexity in terms of carbon hybridization states. - "Cdk.IPMolecularLearning", #Descriptor that evaluates the ionization potential. + #"Cdk.IPMolecularLearning", #Descriptor that evaluates the ionization potential. "Cdk.KappaShapeIndices", #Descriptor that calculates Kier and Hall kappa molecular shape indices. "Cdk.KierHallSmarts", #Counts the number of occurrences of the E-state fragments "Cdk.LargestChain", #Returns the number of atoms in the largest chain diff --git a/test/feature.rb b/test/feature.rb index 40edb9f..85ce588 100644 --- a/test/feature.rb +++ b/test/feature.rb @@ -57,20 +57,20 @@ class FeatureTest < MiniTest::Test def test_physchem_description assert_equal 346, PhysChem.descriptors.size assert_equal 15, PhysChem.openbabel_descriptors.size - assert_equal 295, PhysChem.cdk_descriptors.size + assert_equal 286, PhysChem.cdk_descriptors.size assert_equal 45, PhysChem.joelib_descriptors.size - assert_equal 310, PhysChem.unique_descriptors.size + assert_equal 309, PhysChem.unique_descriptors.size end def test_physchem assert_equal 346, PhysChem.descriptors.size c = Compound.from_smiles "CC(=O)CC(C)C" logP = PhysChem.find_or_create_by :name => "Openbabel.logP" - assert_equal 1.6215, logP.calculate(c) + assert_equal 1.6215, c.calculate_properties([logP]).first jlogP = PhysChem.find_or_create_by :name => "Joelib.LogP" - assert_equal 3.5951, jlogP.calculate(c) + assert_equal 3.5951, c.calculate_properties([jlogP]).first alogP = PhysChem.find_or_create_by :name => "Cdk.ALOGP.ALogP" - assert_equal 0.35380000000000034, alogP.calculate(c) + assert_equal 0.35380000000000034, c.calculate_properties([alogP]).first end end -- cgit v1.2.3 From 4c791043366275f6748a706dd5f73260d0560d41 Mon Sep 17 00:00:00 2001 From: gebele Date: Thu, 19 Jan 2017 16:39:46 +0000 Subject: added TODO for missing descriptor --- lib/unique_descriptors.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/unique_descriptors.rb b/lib/unique_descriptors.rb index 1b19d6a..fc10cd4 100644 --- a/lib/unique_descriptors.rb +++ b/lib/unique_descriptors.rb @@ -48,6 +48,7 @@ UNIQUEDESCRIPTORS = [ #"Cdk.HBondAcceptorCount", #Descriptor that calculates the number of hydrogen bond acceptors. #"Cdk.HBondDonorCount", #Descriptor that calculates the number of hydrogen bond donors. "Cdk.HybridizationRatio", #Characterizes molecular complexity in terms of carbon hybridization states. + # TODO check why the next descriptor is not present in the CDK_DESCRIPTIONS variable. #"Cdk.IPMolecularLearning", #Descriptor that evaluates the ionization potential. "Cdk.KappaShapeIndices", #Descriptor that calculates Kier and Hall kappa molecular shape indices. "Cdk.KierHallSmarts", #Counts the number of occurrences of the E-state fragments -- cgit v1.2.3 From 4e0088110a6a92fc2787c33c1b09ea76dae70dae Mon Sep 17 00:00:00 2001 From: gebele Date: Thu, 16 Mar 2017 12:53:23 +0000 Subject: loael edit --- lib/caret.rb | 11 +++++------ lib/dataset.rb | 2 +- lib/model.rb | 35 ++++++++++++++++++++++------------- lib/regression.rb | 2 +- 4 files changed, 29 insertions(+), 21 deletions(-) diff --git a/lib/caret.rb b/lib/caret.rb index f5c2bde..8bccf74 100644 --- a/lib/caret.rb +++ b/lib/caret.rb @@ -22,12 +22,11 @@ module OpenTox end if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == [] prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights - prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." + prediction[:warnings] << "No variables for regression model. Using weighted average of similar substances." elsif dependent_variables.size < 3 prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights - prediction[:warning] = "Insufficient number of neighbors (#{dependent_variables.size}) for regression model. Using weighted average of similar substances." - + prediction[:warnings] << "Insufficient number of neighbors (#{dependent_variables.size}) for regression model. Using weighted average of similar substances." else dependent_variables.each_with_index do |v,i| dependent_variables[i] = to_r(v) @@ -52,7 +51,7 @@ module OpenTox $logger.debug dependent_variables $logger.debug independent_variables prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights - prediction[:warning] = "R caret model creation error. Using weighted average of similar substances." + prediction[:warnings] << "R caret model creation error. Using weighted average of similar substances." return prediction end begin @@ -73,12 +72,12 @@ module OpenTox $logger.debug "R caret prediction error for:" $logger.debug self.inspect prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights - prediction[:warning] = "R caret prediction error. Using weighted average of similar substances" + prediction[:warnings] << "R caret prediction error. Using weighted average of similar substances" return prediction end if prediction.nil? or prediction[:value].nil? prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights - prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances." + prediction[:warnings] << "Empty R caret prediction. Using weighted average of similar substances." end end prediction diff --git a/lib/dataset.rb b/lib/dataset.rb index 44690e1..6e7d67f 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -46,7 +46,7 @@ module OpenTox if data_entries[substance.to_s] and data_entries[substance.to_s][feature.to_s] data_entries[substance.to_s][feature.to_s] else - nil + [nil] end end diff --git a/lib/model.rb b/lib/model.rb index b18610d..56d8665 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -57,7 +57,7 @@ module OpenTox model.version = {:warning => "git is not installed"} end - # set defaults + # set defaults# substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1 @@ -70,7 +70,7 @@ module OpenTox }, :similarity => { :method => "Algorithm::Similarity.tanimoto", - :min => 0.1 + :min => 0.5, }, :feature_selection => nil } @@ -81,7 +81,7 @@ module OpenTox } elsif model.class == LazarRegression model.algorithms[:prediction] = { - :method => "Algorithm::Caret.pls", + :method => "Algorithm::Caret.rf", } end @@ -93,7 +93,7 @@ module OpenTox }, :similarity => { :method => "Algorithm::Similarity.weighted_cosine", - :min => 0.5 + :min => 0.5, }, :prediction => { :method => "Algorithm::Caret.rf", @@ -191,7 +191,7 @@ module OpenTox # Predict a substance (compound or nanoparticle) # @param [OpenTox::Substance] # @return [Hash] - def predict_substance substance + def predict_substance substance, threshold = self.algorithms[:similarity][:min] @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data case algorithms[:similarity][:method] @@ -221,20 +221,19 @@ module OpenTox bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'." end - prediction = {} + prediction = {:warnings => [], :measurements => []} + prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min]}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min] neighbor_ids = [] neighbor_similarities = [] neighbor_dependent_variables = [] neighbor_independent_variables = [] - prediction = {} # find neighbors substance_ids.each_with_index do |s,i| # handle query substance if substance.id.to_s == s - prediction[:measurements] ||= [] prediction[:measurements] << dependent_variables[i] - prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance." + prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance." else if fingerprints? neighbor_descriptors = fingerprints[i] @@ -243,7 +242,7 @@ module OpenTox neighbor_descriptors = scaled_variables.collect{|v| v[i]} end sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights] - if sim >= algorithms[:similarity][:min] + if sim >= threshold neighbor_ids << s neighbor_similarities << sim neighbor_dependent_variables << dependent_variables[i] @@ -258,17 +257,27 @@ module OpenTox measurements = nil if neighbor_similarities.empty? - prediction.merge!({:value => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) + prediction[:value] = nil + prediction[:warnings] << "Could not find similar substances with experimental data in the training dataset." elsif neighbor_similarities.size == 1 - prediction.merge!({:value => dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]}) + prediction[:value] = nil + prediction[:warnings] << "Cannot create prediction: Only one similar compound in the training set." + prediction[:neighbors] = [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}] else query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint" # call prediction algorithm result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors prediction.merge! result prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}} + #if neighbor_similarities.max < algorithms[:similarity][:warn_min] + #prediction[:warnings] << "Closest neighbor has similarity < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain." + #end + end + if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] + prediction + else # try again with a lower threshold + predict_substance substance, 0.2 end - prediction end # Predict a substance (compound or nanoparticle), an array of substances or a dataset diff --git a/lib/regression.rb b/lib/regression.rb index fd2855f..25c0732 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -17,7 +17,7 @@ module OpenTox sim_sum += weights[i] end if dependent_variables sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum - {:value => prediction} + {:value => prediction, :warnings => ["Weighted average prediction, no prediction interval available."]} end end -- cgit v1.2.3 From a8e6ea94d037f576be699b43751702ef5bc37496 Mon Sep 17 00:00:00 2001 From: gebele Date: Tue, 21 Mar 2017 09:18:49 +0000 Subject: fixed test_to_csv --- test/dataset.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/dataset.rb b/test/dataset.rb index e91e65a..055a029 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -160,7 +160,7 @@ class DatasetTest < MiniTest::Test if v.numeric? assert_equal v.to_f, serialized[inchi][i].to_f else - assert_equal v, serialized[inchi][i] + assert_equal v.to_s, serialized[inchi][i].to_s end end -- cgit v1.2.3 From 5d7aec4c09709f2179bbbac1e1140255156c0cda Mon Sep 17 00:00:00 2001 From: gebele Date: Tue, 21 Mar 2017 15:08:38 +0000 Subject: fixed endless loop; rescue for missing neighbors --- lib/model.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/model.rb b/lib/model.rb index 56d8665..7cc6765 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -273,7 +273,7 @@ module OpenTox #prediction[:warnings] << "Closest neighbor has similarity < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain." #end end - if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] + if prediction[:warnings].empty? or threshold <= algorithms[:similarity][:min] prediction else # try again with a lower threshold predict_substance substance, 0.2 @@ -309,7 +309,7 @@ module OpenTox # serialize result if object.is_a? Substance prediction = predictions[substances.first.id.to_s] - prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity + prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} if prediction[:neighbors]# sort according to similarity return prediction elsif object.is_a? Array return predictions -- cgit v1.2.3 From 658cee5e1df2e1fcb0c7f9259955f1e3199deb5a Mon Sep 17 00:00:00 2001 From: gebele Date: Tue, 21 Mar 2017 15:11:44 +0000 Subject: fixed regression test for rf algorithm see: 'loael edit' commit;along with larger treining datasets --- test/model-regression.rb | 28 ++++++++++++++-------------- test/setup.rb | 2 ++ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/test/model-regression.rb b/test/model-regression.rb index 86b927c..5903e88 100644 --- a/test/model-regression.rb +++ b/test/model-regression.rb @@ -10,21 +10,21 @@ class LazarRegressionTest < MiniTest::Test }, :similarity => { :method => "Algorithm::Similarity.tanimoto", - :min => 0.1 + :min => 0.5 }, :prediction => { - :method => "Algorithm::Caret.pls", + :method => "Algorithm::Caret.rf", }, :feature_selection => nil, } - training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv") + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM_log10.csv") model = Model::Lazar.create training_dataset: training_dataset assert_kind_of Model::LazarRegression, model assert_equal algorithms, model.algorithms - substance = training_dataset.substances[10] + substance = training_dataset.substances[145] prediction = model.predict substance assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." - substance = Compound.from_smiles "NC(=O)OCCC" + substance = Compound.from_smiles "c1ccc(cc1)Oc1ccccc1" prediction = model.predict substance refute_nil prediction[:value] refute_nil prediction[:prediction_interval] @@ -59,8 +59,8 @@ class LazarRegressionTest < MiniTest::Test model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms compound = Compound.from_smiles "CCCSCCSCC" prediction = model.predict compound - assert_equal 4, prediction[:neighbors].size - assert_equal 1.37, prediction[:value].round(2) + assert_equal 3, prediction[:neighbors].size + assert prediction[:value].round(2) > 1.37, "Prediction value (#{prediction[:value].round(2)}) should be larger than 1.37." end def test_local_physchem_regression @@ -112,12 +112,12 @@ class LazarRegressionTest < MiniTest::Test :method => "Algorithm::Similarity.cosine", } } - training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv") + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv") model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms assert_kind_of Model::LazarRegression, model - assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method] + assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method] assert_equal "Algorithm::Similarity.cosine", model.algorithms[:similarity][:method] - assert_equal 0.1, model.algorithms[:similarity][:min] + assert_equal 0.5, model.algorithms[:similarity][:min] algorithms[:descriptors].delete :features assert_equal algorithms[:descriptors], model.algorithms[:descriptors] prediction = model.predict training_dataset.substances[10] @@ -130,14 +130,14 @@ class LazarRegressionTest < MiniTest::Test :method => "Algorithm::FeatureSelection.correlation_filter", }, } - training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv") + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM_log10.csv") model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms assert_kind_of Model::LazarRegression, model - assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method] + assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method] assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method] - assert_equal 0.1, model.algorithms[:similarity][:min] + assert_equal 0.5, model.algorithms[:similarity][:min] assert_equal algorithms[:feature_selection][:method], model.algorithms[:feature_selection][:method] - prediction = model.predict training_dataset.substances[10] + prediction = model.predict training_dataset.substances[145] refute_nil prediction[:value] end diff --git a/test/setup.rb b/test/setup.rb index 40c8ebf..c1cddfb 100644 --- a/test/setup.rb +++ b/test/setup.rb @@ -3,6 +3,8 @@ require 'minitest/autorun' require_relative '../lib/lazar.rb' #require 'lazar' include OpenTox +#$mongo.database.drop +#$gridfs = $mongo.database.fs # recreate GridFS indexes TEST_DIR ||= File.expand_path(File.dirname(__FILE__)) DATA_DIR ||= File.join(TEST_DIR,"data") training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first -- cgit v1.2.3 From 2cebe2c0dc60bda0c94d0d9c38e7931efadd7155 Mon Sep 17 00:00:00 2001 From: gebele Date: Fri, 24 Mar 2017 12:51:12 +0000 Subject: adjusted for higher similarity min value --- test/model-classification.rb | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/test/model-classification.rb b/test/model-classification.rb index 1424f6a..0bb3e83 100644 --- a/test/model-classification.rb +++ b/test/model-classification.rb @@ -10,35 +10,35 @@ class LazarClassificationTest < MiniTest::Test }, :similarity => { :method => "Algorithm::Similarity.tanimoto", - :min => 0.1 + :min => 0.5 }, + :feature_selection => nil, :prediction => { :method => "Algorithm::Classification.weighted_majority_vote", }, - :feature_selection => nil, } training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") model = Model::Lazar.create training_dataset: training_dataset assert_kind_of Model::LazarClassification, model assert_equal algorithms, model.algorithms - substance = training_dataset.substances[10] + substance = training_dataset.substances[49] prediction = model.predict substance assert_equal "false", prediction[:value] [ { - :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"), + :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H14N2O4/c1-5(10)2-8(7-12)3-6(11)4-9/h5-6,9-11H,2-4H2,1H3"), :prediction => "false", },{ - :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"), + :compound => OpenTox::Compound.from_smiles("OCC(CN(CC(O)C)N=O)O"), :prediction => "false", } ].each do |example| prediction = model.predict example[:compound] assert_equal example[:prediction], prediction[:value] end - compound = Compound.from_smiles "CCO" + compound = Compound.from_smiles "O=NN1CCC1" prediction = model.predict compound assert_equal "true", prediction[:value] - assert_equal ["false"], prediction[:measurements] + #assert_equal ["false"], prediction[:measurements] # make a dataset prediction compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv") @@ -46,12 +46,12 @@ class LazarClassificationTest < MiniTest::Test assert_equal compound_dataset.compounds, prediction_dataset.compounds cid = prediction_dataset.compounds[7].id.to_s - assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warning] + assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warnings][0] prediction_dataset.predictions.each do |cid,pred| - assert_equal "Could not find similar substances with experimental data in the training dataset.", pred[:warning] if pred[:value].nil? + assert_equal "Could not find similar substances with experimental data in the training dataset.", pred[:warnings][0] if pred[:value].nil? end cid = Compound.from_smiles("CCOC(=O)N").id.to_s - assert_match "excluded", prediction_dataset.predictions[cid][:warning] + assert_match "excluded", prediction_dataset.predictions[cid][:info] # cleanup [training_dataset,model,compound_dataset,prediction_dataset].each{|o| o.delete} end @@ -85,7 +85,7 @@ class LazarClassificationTest < MiniTest::Test model = Model::Lazar.create training_dataset: training_dataset t = Time.now 2.times do - compound = Compound.from_smiles("Clc1ccccc1NN") + compound = Compound.from_smiles("OCC(CN(CC(O)C)N=O)O") prediction = model.predict compound assert_equal "1", prediction[:value] end -- cgit v1.2.3 From 04f1f35f2248db65b313c382ee57a53047778472 Mon Sep 17 00:00:00 2001 From: gebele Date: Fri, 24 Mar 2017 13:00:19 +0000 Subject: adjusted for higher default similarity min value --- test/validation-classification.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/validation-classification.rb b/test/validation-classification.rb index fb4c3e7..ac25b29 100644 --- a/test/validation-classification.rb +++ b/test/validation-classification.rb @@ -47,9 +47,9 @@ class ValidationClassificationTest < MiniTest::Test dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" model = Model::Lazar.create training_dataset: dataset loo = ClassificationLeaveOneOut.create model - assert_equal 14, loo.nr_unpredicted + assert_equal 77, loo.nr_unpredicted refute_empty loo.confusion_matrix - assert loo.accuracy > 0.77 + assert loo.accuracy > 0.74 assert loo.weighted_accuracy > loo.accuracy, "Weighted accuracy (#{loo.weighted_accuracy}) should be larger than accuracy (#{loo.accuracy})." end -- cgit v1.2.3 From eef5d89a92dd7dde9acf9fc063a54e1fe729a89b Mon Sep 17 00:00:00 2001 From: gebele Date: Tue, 28 Mar 2017 11:00:12 +0000 Subject: fixed wrong accuracy assertion to rmse --- test/validation-regression.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/validation-regression.rb b/test/validation-regression.rb index 01ed644..c5ad312 100644 --- a/test/validation-regression.rb +++ b/test/validation-regression.rb @@ -84,7 +84,7 @@ class ValidationRegressionTest < MiniTest::Test repeated_cv = RepeatedCrossValidation.create model repeated_cv.crossvalidations.each do |cv| assert cv.r_squared > 0.34, "R^2 (#{cv.r_squared}) should be larger than 0.034" - assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split" + assert cv.rmse < 0.5, "RMSE (#{cv.rmse}) should be smaller than 0.5" end end -- cgit v1.2.3 From db38c345fdc119edd8a892a5b0ba2c2a4b1cbe1f Mon Sep 17 00:00:00 2001 From: gebele Date: Fri, 31 Mar 2017 15:07:28 +0000 Subject: set default min sim to 0.1 for classification and 0.5 for regression --- lib/model.rb | 12 ++++++++---- test/model-classification.rb | 22 +++++++++++----------- test/validation-classification.rb | 4 ++-- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/lib/model.rb b/lib/model.rb index 7cc6765..80affd5 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -68,10 +68,6 @@ module OpenTox :method => "fingerprint", :type => "MP2D", }, - :similarity => { - :method => "Algorithm::Similarity.tanimoto", - :min => 0.5, - }, :feature_selection => nil } @@ -79,10 +75,18 @@ module OpenTox model.algorithms[:prediction] = { :method => "Algorithm::Classification.weighted_majority_vote", } + model.algorithms[:similarity] = { + :method => "Algorithm::Similarity.tanimoto", + :min => 0.1, + } elsif model.class == LazarRegression model.algorithms[:prediction] = { :method => "Algorithm::Caret.rf", } + model.algorithms[:similarity] = { + :method => "Algorithm::Similarity.tanimoto", + :min => 0.5, + } end elsif substance_classes.first == "OpenTox::Nanoparticle" diff --git a/test/model-classification.rb b/test/model-classification.rb index 0bb3e83..1424f6a 100644 --- a/test/model-classification.rb +++ b/test/model-classification.rb @@ -10,35 +10,35 @@ class LazarClassificationTest < MiniTest::Test }, :similarity => { :method => "Algorithm::Similarity.tanimoto", - :min => 0.5 + :min => 0.1 }, - :feature_selection => nil, :prediction => { :method => "Algorithm::Classification.weighted_majority_vote", }, + :feature_selection => nil, } training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") model = Model::Lazar.create training_dataset: training_dataset assert_kind_of Model::LazarClassification, model assert_equal algorithms, model.algorithms - substance = training_dataset.substances[49] + substance = training_dataset.substances[10] prediction = model.predict substance assert_equal "false", prediction[:value] [ { - :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H14N2O4/c1-5(10)2-8(7-12)3-6(11)4-9/h5-6,9-11H,2-4H2,1H3"), + :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"), :prediction => "false", },{ - :compound => OpenTox::Compound.from_smiles("OCC(CN(CC(O)C)N=O)O"), + :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"), :prediction => "false", } ].each do |example| prediction = model.predict example[:compound] assert_equal example[:prediction], prediction[:value] end - compound = Compound.from_smiles "O=NN1CCC1" + compound = Compound.from_smiles "CCO" prediction = model.predict compound assert_equal "true", prediction[:value] - #assert_equal ["false"], prediction[:measurements] + assert_equal ["false"], prediction[:measurements] # make a dataset prediction compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv") @@ -46,12 +46,12 @@ class LazarClassificationTest < MiniTest::Test assert_equal compound_dataset.compounds, prediction_dataset.compounds cid = prediction_dataset.compounds[7].id.to_s - assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warnings][0] + assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warning] prediction_dataset.predictions.each do |cid,pred| - assert_equal "Could not find similar substances with experimental data in the training dataset.", pred[:warnings][0] if pred[:value].nil? + assert_equal "Could not find similar substances with experimental data in the training dataset.", pred[:warning] if pred[:value].nil? end cid = Compound.from_smiles("CCOC(=O)N").id.to_s - assert_match "excluded", prediction_dataset.predictions[cid][:info] + assert_match "excluded", prediction_dataset.predictions[cid][:warning] # cleanup [training_dataset,model,compound_dataset,prediction_dataset].each{|o| o.delete} end @@ -85,7 +85,7 @@ class LazarClassificationTest < MiniTest::Test model = Model::Lazar.create training_dataset: training_dataset t = Time.now 2.times do - compound = Compound.from_smiles("OCC(CN(CC(O)C)N=O)O") + compound = Compound.from_smiles("Clc1ccccc1NN") prediction = model.predict compound assert_equal "1", prediction[:value] end diff --git a/test/validation-classification.rb b/test/validation-classification.rb index ac25b29..fb4c3e7 100644 --- a/test/validation-classification.rb +++ b/test/validation-classification.rb @@ -47,9 +47,9 @@ class ValidationClassificationTest < MiniTest::Test dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" model = Model::Lazar.create training_dataset: dataset loo = ClassificationLeaveOneOut.create model - assert_equal 77, loo.nr_unpredicted + assert_equal 14, loo.nr_unpredicted refute_empty loo.confusion_matrix - assert loo.accuracy > 0.74 + assert loo.accuracy > 0.77 assert loo.weighted_accuracy > loo.accuracy, "Weighted accuracy (#{loo.weighted_accuracy}) should be larger than accuracy (#{loo.accuracy})." end -- cgit v1.2.3 From 6201c1f3814628499e168bd99fddc5b65eb32fb5 Mon Sep 17 00:00:00 2001 From: gebele Date: Fri, 31 Mar 2017 15:08:14 +0000 Subject: adjusted classification tests for min sim 0.1 --- test/model-classification.rb | 8 +++++--- test/validation-classification.rb | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/test/model-classification.rb b/test/model-classification.rb index 1424f6a..f75598b 100644 --- a/test/model-classification.rb +++ b/test/model-classification.rb @@ -46,12 +46,14 @@ class LazarClassificationTest < MiniTest::Test assert_equal compound_dataset.compounds, prediction_dataset.compounds cid = prediction_dataset.compounds[7].id.to_s - assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warning] + assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warnings][0] + expectations = ["Cannot create prediction: Only one similar compound in the training set.", + "Could not find similar substances with experimental data in the training dataset."] prediction_dataset.predictions.each do |cid,pred| - assert_equal "Could not find similar substances with experimental data in the training dataset.", pred[:warning] if pred[:value].nil? + assert_includes expectations, pred[:warnings][0] if pred[:value].nil? end cid = Compound.from_smiles("CCOC(=O)N").id.to_s - assert_match "excluded", prediction_dataset.predictions[cid][:warning] + assert_match "excluded", prediction_dataset.predictions[cid][:info] # cleanup [training_dataset,model,compound_dataset,prediction_dataset].each{|o| o.delete} end diff --git a/test/validation-classification.rb b/test/validation-classification.rb index fb4c3e7..ce06063 100644 --- a/test/validation-classification.rb +++ b/test/validation-classification.rb @@ -47,7 +47,7 @@ class ValidationClassificationTest < MiniTest::Test dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" model = Model::Lazar.create training_dataset: dataset loo = ClassificationLeaveOneOut.create model - assert_equal 14, loo.nr_unpredicted + assert_equal 24, loo.nr_unpredicted refute_empty loo.confusion_matrix assert loo.accuracy > 0.77 assert loo.weighted_accuracy > loo.accuracy, "Weighted accuracy (#{loo.weighted_accuracy}) should be larger than accuracy (#{loo.accuracy})." -- cgit v1.2.3 From 324e502a3a8415512336b93fa4faf591d7904ad8 Mon Sep 17 00:00:00 2001 From: gebele Date: Tue, 11 Apr 2017 08:03:15 +0000 Subject: changed compound to predict fixed test --- test/model-validation.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/model-validation.rb b/test/model-validation.rb index 83986d6..9304232 100644 --- a/test/model-validation.rb +++ b/test/model-validation.rb @@ -12,7 +12,7 @@ class ValidationModelTest < MiniTest::Test m.crossvalidations.each do |cv| assert cv.accuracy > 0.74, "Crossvalidation accuracy (#{cv.accuracy}) should be larger than 0.75. This may happen due to an unfavorable training/test set split." end - prediction = m.predict Compound.from_smiles("CCCC(NN)C") + prediction = m.predict Compound.from_smiles("OCC(CN(CC(O)C)N=O)O") assert_equal "true", prediction[:value] m.delete end -- cgit v1.2.3 From 1f95bfdc11de7c4fdc2e9f0313be3506dd6ea9c1 Mon Sep 17 00:00:00 2001 From: gebele Date: Thu, 4 May 2017 09:50:08 +0000 Subject: ensure warnings for validation statistics --- lib/crossvalidation.rb | 1 + lib/validation-statistics.rb | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 75c5db5..06a1e2a 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -90,6 +90,7 @@ module OpenTox field :within_prediction_interval, type: Integer, default:0 field :out_of_prediction_interval, type: Integer, default:0 field :correlation_plot_id, type: BSON::ObjectId + field :warnings, type: Array end # Independent repeated crossvalidations diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 2d522ae..69e7992 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -111,6 +111,7 @@ module OpenTox # Get statistics # @return [Hash] def statistics + self.warnings = [] self.rmse = 0 self.mae = 0 self.within_prediction_interval = 0 @@ -132,8 +133,10 @@ module OpenTox end end else - warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." - $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + trd_id = model.training_dataset_id + smiles = Compound.find(cid).smiles + self.warnings << "No training activities for #{smiles} in training dataset #{trd_id}." + $logger.debug "No training activities for #{smiles} in training dataset #{trd_id}." end end R.assign "measurement", x @@ -146,6 +149,7 @@ module OpenTox $logger.debug "RMSE #{rmse}" $logger.debug "MAE #{mae}" $logger.debug "#{percent_within_prediction_interval.round(2)}% of measurements within prediction interval" + $logger.debug "#{warnings}" save { :mae => mae, -- cgit v1.2.3 From d4d914052de0a37489f9491dbe491093bd14a03a Mon Sep 17 00:00:00 2001 From: gebele Date: Thu, 4 May 2017 09:50:57 +0000 Subject: ensure always two probability values for classification --- lib/classification.rb | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/classification.rb b/lib/classification.rb index 638492b..a875903 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -18,6 +18,11 @@ module OpenTox class_weights.each do |a,w| probabilities[a] = w.sum/weights.sum end + # DG: hack to ensure always two probability values + if probabilities.keys.uniq.size == 1 + missing_key = probabilities.keys.uniq[0].match(/^non/) ? probabilities.keys.uniq[0].sub(/non-/,"") : "non-"+probabilities.keys.uniq[0] + probabilities[missing_key] = 0.0 + end probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h p_max = probabilities.collect{|a,p| p}.max prediction = probabilities.key(p_max) -- cgit v1.2.3 From 658e0f706622eabce6900134fb1d968b440fd704 Mon Sep 17 00:00:00 2001 From: gebele Date: Thu, 4 May 2017 09:59:37 +0000 Subject: cleanup and hack for mongos file size limit --- lib/model.rb | 1 - lib/train-test-validation.rb | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/model.rb b/lib/model.rb index 80affd5..9c5c19b 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -145,7 +145,6 @@ module OpenTox end model.descriptor_ids = model.fingerprints.flatten.uniq model.descriptor_ids.each do |d| - # resulting model may break BSON size limit (e.g. f Kazius dataset) model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/ end # calculate physchem properties diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb index 034ae3a..9a5532d 100644 --- a/lib/train-test-validation.rb +++ b/lib/train-test-validation.rb @@ -27,6 +27,8 @@ module OpenTox end end predictions.select!{|cid,p| p[:value] and p[:measurements]} + # hack to avoid mongos file size limit error on large datasets + #predictions.each{|cid,p| p[:neighbors] = []} if model.training_dataset.name.match(/mutagenicity/i) validation = self.new( :model_id => validation_model.id, :test_dataset_id => test_set.id, -- cgit v1.2.3 From f82f516db24cd688f8a75b2b9a27c5ac46aade67 Mon Sep 17 00:00:00 2001 From: gebele Date: Fri, 5 May 2017 12:43:01 +0000 Subject: added warnings field for leave-one-out test fails --- lib/leave-one-out-validation.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index 8d22018..c33c92b 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -58,6 +58,7 @@ module OpenTox field :within_prediction_interval, type: Integer, default:0 field :out_of_prediction_interval, type: Integer, default:0 field :correlation_plot_id, type: BSON::ObjectId + field :warnings, type: Array end end -- cgit v1.2.3 From 9aa5203dd375225996c1efe4be1a4324ddc6cda7 Mon Sep 17 00:00:00 2001 From: gebele Date: Mon, 22 May 2017 12:45:32 +0000 Subject: fix for lower min sim threshold --- lib/model.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/model.rb b/lib/model.rb index 9c5c19b..475a346 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -276,7 +276,7 @@ module OpenTox #prediction[:warnings] << "Closest neighbor has similarity < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain." #end end - if prediction[:warnings].empty? or threshold <= algorithms[:similarity][:min] + if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2 prediction else # try again with a lower threshold predict_substance substance, 0.2 -- cgit v1.2.3