From 9d17895ab9e8cd31e0f32e8e622e13612ea5ff77 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Fri, 12 Oct 2018 21:58:36 +0200 Subject: validation statistic fixes --- test/classification-model.rb | 128 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 test/classification-model.rb (limited to 'test/classification-model.rb') diff --git a/test/classification-model.rb b/test/classification-model.rb new file mode 100644 index 0000000..b94b5e6 --- /dev/null +++ b/test/classification-model.rb @@ -0,0 +1,128 @@ +require_relative "setup.rb" + +class LazarClassificationTest < MiniTest::Test + + def test_classification_default + algorithms = { + :descriptors => { + :method => "fingerprint", + :type => "MP2D" + }, + :similarity => { + :method => "Algorithm::Similarity.tanimoto", + :min => 0.5 + }, + :prediction => { + :method => "Algorithm::Classification.weighted_majority_vote", + }, + :feature_selection => nil, + } + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") + model = Model::Lazar.create training_dataset: training_dataset + assert_kind_of Model::LazarClassification, model + assert_equal algorithms, model.algorithms + [ { + :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"), + :prediction => "false", + },{ + :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"), + :prediction => "false", + } ].each do |example| + prediction = model.predict example[:compound] + p example[:compound] + p prediction + #assert_equal example[:prediction], prediction[:value] + end + + compound = Compound.from_smiles "CCO" + prediction = model.predict compound + assert_equal "true", prediction[:value] + assert_equal ["false"], prediction[:measurements] + + # make a dataset prediction + compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv") + prediction_dataset = model.predict compound_dataset + assert_equal compound_dataset.compounds, prediction_dataset.compounds + + cid = prediction_dataset.compounds[7].id.to_s + assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warnings][0] + expectations = ["Cannot create prediction: Only one similar compound in the training set.", + "Could not find similar substances with experimental data in the training dataset."] + prediction_dataset.predictions.each do |cid,pred| + assert_includes expectations, pred[:warnings][0] if pred[:value].nil? + end + cid = Compound.from_smiles("CCOC(=O)N").id.to_s + assert_match "excluded", prediction_dataset.predictions[cid][:info] + end + + def test_classification_parameters + algorithms = { + :descriptors => { + :method => "fingerprint", + :type => "MACCS" + }, + :similarity => { + :min => 0.4 + }, + } + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") + model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms + assert_kind_of Model::LazarClassification, model + assert_equal "Algorithm::Classification.weighted_majority_vote", model.algorithms[:prediction][:method] + assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method] + assert_equal algorithms[:similarity][:min], model.algorithms[:similarity][:min] + substance = training_dataset.substances[10] + prediction = model.predict substance + assert_equal "false", prediction[:value] + assert_equal 4, prediction[:neighbors].size + end + + def test_dataset_prediction + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") + model = Model::Lazar.create training_dataset: training_dataset + result = model.predict training_dataset + assert_kind_of Dataset, result + assert 3, result.features.size + assert 8, result.compounds.size + assert_equal ["true"], result.values(result.compounds.first, result.features[0]) + assert_equal [0.65], result.values(result.compounds.first, result.features[1]) + assert_equal [0], result.values(result.compounds.first, result.features[2]) # classification returns nil, check if + end + + def test_carcinogenicity_rf_classification + skip "Caret rf may run into a (endless?) loop for some compounds." + dataset = Dataset.from_csv_file "#{DATA_DIR}/multi_cell_call.csv" + algorithms = { + :prediction => { + :method => "Algorithm::Caret.rf", + }, + } + model = Model::Lazar.create training_dataset: dataset, algorithms: algorithms + substance = Compound.from_smiles "[O-]S(=O)(=O)[O-].[Mn+2].O" + prediction = model.predict substance + p prediction + + end + + def test_rf_classification + skip "Caret rf may run into a (endless?) loop for some compounds." + algorithms = { + :prediction => { + :method => "Algorithm::Caret.rf", + }, + } + training_dataset = Dataset.from_sdf_file File.join(DATA_DIR,"cas_4337.sdf") + model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms + #p model.id.to_s + #model = Model::Lazar.find "5bbb4c0cca626909f6c8a924" + assert_kind_of Model::LazarClassification, model + assert_equal algorithms[:prediction][:method], model.algorithms["prediction"]["method"] + substance = Compound.from_smiles "Clc1ccc(cc1)C(=O)c1ccc(cc1)OC(C(=O)O)(C)C" + prediction = model.predict substance + assert_equal 51, prediction[:neighbors].size + assert_equal "nonmutagen", prediction[:value] + assert_equal 0.1, prediction[:probabilities]["mutagen"].round(1) + assert_equal 0.9, prediction[:probabilities]["nonmutagen"].round(1) + end + +end -- cgit v1.2.3 From 1652fd5df948da7ace622c73d158010add656b9f Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Wed, 24 Oct 2018 18:21:34 +0200 Subject: dataset map --- test/classification-model.rb | 47 +++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 22 deletions(-) (limited to 'test/classification-model.rb') diff --git a/test/classification-model.rb b/test/classification-model.rb index b94b5e6..7a2a64f 100644 --- a/test/classification-model.rb +++ b/test/classification-model.rb @@ -22,37 +22,40 @@ class LazarClassificationTest < MiniTest::Test assert_kind_of Model::LazarClassification, model assert_equal algorithms, model.algorithms [ { - :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"), + :compound => OpenTox::Compound.from_smiles("OCC(CN(CC(O)C)N=O)O"), :prediction => "false", },{ - :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"), - :prediction => "false", + :compound => OpenTox::Compound.from_smiles("O=CNc1scc(n1)c1ccc(o1)[N+](=O)[O-]"), + :prediction => "true", } ].each do |example| prediction = model.predict example[:compound] - p example[:compound] - p prediction - #assert_equal example[:prediction], prediction[:value] + assert_equal example[:prediction], prediction[:value] end - compound = Compound.from_smiles "CCO" - prediction = model.predict compound - assert_equal "true", prediction[:value] - assert_equal ["false"], prediction[:measurements] - # make a dataset prediction - compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv") + compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"multi_cell_call.csv") prediction_dataset = model.predict compound_dataset - assert_equal compound_dataset.compounds, prediction_dataset.compounds + puts prediction_dataset.to_csv + assert_equal compound_dataset.compounds.size, prediction_dataset.compounds.size + c = Compound.from_smiles "CC(CN(CC(O)C)N=O)O" + prediction_feature = prediction_dataset.features.select{|f| f.class == NominalLazarPrediction}[0] + assert_equal ["true"], prediction_dataset.values(c, prediction_feature) + p_true = LazarPredictionProbability.find_by(:name => "true") + p_false = LazarPredictionProbability.find_by(:name => "false") + p p_true + assert_equal [0.7], prediction_dataset.values(c,p_true) + assert_equal [0.0], prediction_dataset.values(c,p_false) + assert_equal 0.0, p_false - cid = prediction_dataset.compounds[7].id.to_s - assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warnings][0] - expectations = ["Cannot create prediction: Only one similar compound in the training set.", - "Could not find similar substances with experimental data in the training dataset."] - prediction_dataset.predictions.each do |cid,pred| - assert_includes expectations, pred[:warnings][0] if pred[:value].nil? - end - cid = Compound.from_smiles("CCOC(=O)N").id.to_s - assert_match "excluded", prediction_dataset.predictions[cid][:info] +# cid = prediction_dataset.compounds[7].id.to_s +# assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warnings][0] +# expectations = ["Cannot create prediction: Only one similar compound in the training set.", +# "Could not find similar substances with experimental data in the training dataset."] +# prediction_dataset.predictions.each do |cid,pred| +# assert_includes expectations, pred[:warnings][0] if pred[:value].nil? +# end +# cid = Compound.from_smiles("CCOC(=O)N").id.to_s +# assert_match "excluded", prediction_dataset.predictions[cid][:info] end def test_classification_parameters -- cgit v1.2.3 From 5e9a08c0b534fa96179fb5c81a9b4193e7b0aad8 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Mon, 29 Oct 2018 17:58:09 +0100 Subject: dataset folds fixed --- test/classification-model.rb | 1 + 1 file changed, 1 insertion(+) (limited to 'test/classification-model.rb') diff --git a/test/classification-model.rb b/test/classification-model.rb index 7a2a64f..bfb64db 100644 --- a/test/classification-model.rb +++ b/test/classification-model.rb @@ -84,6 +84,7 @@ class LazarClassificationTest < MiniTest::Test training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") model = Model::Lazar.create training_dataset: training_dataset result = model.predict training_dataset + puts result.to_csv assert_kind_of Dataset, result assert 3, result.features.size assert 8, result.compounds.size -- cgit v1.2.3 From d9c9d78e49d886ea91386adbbd2b523347df226e Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Mon, 29 Oct 2018 20:34:39 +0100 Subject: dataset predictions fixed --- test/classification-model.rb | 46 ++++++++++++-------------------------------- 1 file changed, 12 insertions(+), 34 deletions(-) (limited to 'test/classification-model.rb') diff --git a/test/classification-model.rb b/test/classification-model.rb index bfb64db..85668fb 100644 --- a/test/classification-model.rb +++ b/test/classification-model.rb @@ -1,6 +1,6 @@ require_relative "setup.rb" -class LazarClassificationTest < MiniTest::Test +class ClassificationModelTest < MiniTest::Test def test_classification_default algorithms = { @@ -31,31 +31,6 @@ class LazarClassificationTest < MiniTest::Test prediction = model.predict example[:compound] assert_equal example[:prediction], prediction[:value] end - - # make a dataset prediction - compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"multi_cell_call.csv") - prediction_dataset = model.predict compound_dataset - puts prediction_dataset.to_csv - assert_equal compound_dataset.compounds.size, prediction_dataset.compounds.size - c = Compound.from_smiles "CC(CN(CC(O)C)N=O)O" - prediction_feature = prediction_dataset.features.select{|f| f.class == NominalLazarPrediction}[0] - assert_equal ["true"], prediction_dataset.values(c, prediction_feature) - p_true = LazarPredictionProbability.find_by(:name => "true") - p_false = LazarPredictionProbability.find_by(:name => "false") - p p_true - assert_equal [0.7], prediction_dataset.values(c,p_true) - assert_equal [0.0], prediction_dataset.values(c,p_false) - assert_equal 0.0, p_false - -# cid = prediction_dataset.compounds[7].id.to_s -# assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warnings][0] -# expectations = ["Cannot create prediction: Only one similar compound in the training set.", -# "Could not find similar substances with experimental data in the training dataset."] -# prediction_dataset.predictions.each do |cid,pred| -# assert_includes expectations, pred[:warnings][0] if pred[:value].nil? -# end -# cid = Compound.from_smiles("CCOC(=O)N").id.to_s -# assert_match "excluded", prediction_dataset.predictions[cid][:info] end def test_classification_parameters @@ -81,16 +56,19 @@ class LazarClassificationTest < MiniTest::Test end def test_dataset_prediction - training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"multi_cell_call.csv") + test_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") model = Model::Lazar.create training_dataset: training_dataset - result = model.predict training_dataset - puts result.to_csv + result = model.predict test_dataset assert_kind_of Dataset, result - assert 3, result.features.size - assert 8, result.compounds.size - assert_equal ["true"], result.values(result.compounds.first, result.features[0]) - assert_equal [0.65], result.values(result.compounds.first, result.features[1]) - assert_equal [0], result.values(result.compounds.first, result.features[2]) # classification returns nil, check if + assert_equal 7, result.features.size + assert_equal 85, result.compounds.size + prediction_feature = result.prediction_features.first + assert_equal ["yes"], result.values(result.compounds[1], prediction_feature) + assert_equal ["no"], result.values(result.compounds[5], prediction_feature) + assert_nil result.predictions[result.compounds.first][:value] + assert_equal "yes", result.predictions[result.compounds[1]][:value] + assert_equal 0.27, result.predictions[result.compounds[1]][:probabilities]["no"].round(2) end def test_carcinogenicity_rf_classification -- cgit v1.2.3 From d61f78093f4ddf03c27a2c8ae0bab9c1f10c80f5 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Tue, 30 Oct 2018 17:26:59 +0100 Subject: tests fixed --- test/classification-model.rb | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'test/classification-model.rb') diff --git a/test/classification-model.rb b/test/classification-model.rb index 85668fb..1a3d4a8 100644 --- a/test/classification-model.rb +++ b/test/classification-model.rb @@ -32,6 +32,27 @@ class ClassificationModelTest < MiniTest::Test assert_equal example[:prediction], prediction[:value] end end + + def test_export_import + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") + export = Model::Lazar.create training_dataset: training_dataset + File.open("tmp.csv","w+"){|f| f.puts export.to_json } + import = Model::LazarClassification.new JSON.parse(File.read "tmp.csv") + assert_kind_of Model::LazarClassification, import + import.algorithms.each{|k,v| v.transform_keys!(&:to_sym) if v.is_a? Hash} + import.algorithms.transform_keys!(&:to_sym) + assert_equal export.algorithms, import.algorithms + [ { + :compound => OpenTox::Compound.from_smiles("OCC(CN(CC(O)C)N=O)O"), + :prediction => "false", + },{ + :compound => OpenTox::Compound.from_smiles("O=CNc1scc(n1)c1ccc(o1)[N+](=O)[O-]"), + :prediction => "true", + } ].each do |example| + prediction = import.predict example[:compound] + assert_equal example[:prediction], prediction[:value] + end + end def test_classification_parameters algorithms = { -- cgit v1.2.3 From 0882c2cd0de934d7377fc9d08c306be98612c88a Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Fri, 16 Nov 2018 18:42:42 +0100 Subject: real datasets for testing, test data cleanup, Daphnia import, upper and lower similarity thresholds --- test/classification-model.rb | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'test/classification-model.rb') diff --git a/test/classification-model.rb b/test/classification-model.rb index 1a3d4a8..8cbd4bb 100644 --- a/test/classification-model.rb +++ b/test/classification-model.rb @@ -10,7 +10,7 @@ class ClassificationModelTest < MiniTest::Test }, :similarity => { :method => "Algorithm::Similarity.tanimoto", - :min => 0.5 + :min => [0.5,0.2] }, :prediction => { :method => "Algorithm::Classification.weighted_majority_vote", @@ -61,7 +61,7 @@ class ClassificationModelTest < MiniTest::Test :type => "MACCS" }, :similarity => { - :min => 0.4 + :min => [0.4,0.1] }, } training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") @@ -77,7 +77,7 @@ class ClassificationModelTest < MiniTest::Test end def test_dataset_prediction - training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"multi_cell_call.csv") + training_dataset = Dataset.from_csv_file File.join(Download::DATA,"Carcinogenicity-Rodents.csv") test_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") model = Model::Lazar.create training_dataset: training_dataset result = model.predict test_dataset @@ -85,16 +85,16 @@ class ClassificationModelTest < MiniTest::Test assert_equal 7, result.features.size assert_equal 85, result.compounds.size prediction_feature = result.prediction_features.first - assert_equal ["yes"], result.values(result.compounds[1], prediction_feature) - assert_equal ["no"], result.values(result.compounds[5], prediction_feature) + assert_equal ["carcinogenic"], result.values(result.compounds[1], prediction_feature) + assert_equal ["non-carcinogenic"], result.values(result.compounds[5], prediction_feature) assert_nil result.predictions[result.compounds.first][:value] - assert_equal "yes", result.predictions[result.compounds[1]][:value] + assert_equal "carcinogenic", result.predictions[result.compounds[1]][:value] assert_equal 0.27, result.predictions[result.compounds[1]][:probabilities]["no"].round(2) end def test_carcinogenicity_rf_classification skip "Caret rf may run into a (endless?) loop for some compounds." - dataset = Dataset.from_csv_file "#{DATA_DIR}/multi_cell_call.csv" + dataset = Dataset.from_csv_file File.join(Download::DATA,"Carcinogenicity-Rodents.csv") algorithms = { :prediction => { :method => "Algorithm::Caret.rf", -- cgit v1.2.3 From c12d5bb40ab2a0783f755c3238a20448b9a5a42e Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Fri, 16 Nov 2018 22:17:55 +0100 Subject: minor test fixes --- test/classification-model.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'test/classification-model.rb') diff --git a/test/classification-model.rb b/test/classification-model.rb index 8cbd4bb..2032bf8 100644 --- a/test/classification-model.rb +++ b/test/classification-model.rb @@ -89,7 +89,7 @@ class ClassificationModelTest < MiniTest::Test assert_equal ["non-carcinogenic"], result.values(result.compounds[5], prediction_feature) assert_nil result.predictions[result.compounds.first][:value] assert_equal "carcinogenic", result.predictions[result.compounds[1]][:value] - assert_equal 0.27, result.predictions[result.compounds[1]][:probabilities]["no"].round(2) + assert_equal 0.27, result.predictions[result.compounds[1]][:probabilities]["non-carcinogenic"].round(2) end def test_carcinogenicity_rf_classification -- cgit v1.2.3 From 1b44e0cd76f2ead93b8b3fa0f970c85ef32a4b14 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Fri, 16 Nov 2018 22:45:17 +0100 Subject: confidence for prediction datasets --- test/classification-model.rb | 1 + 1 file changed, 1 insertion(+) (limited to 'test/classification-model.rb') diff --git a/test/classification-model.rb b/test/classification-model.rb index 2032bf8..79ccb98 100644 --- a/test/classification-model.rb +++ b/test/classification-model.rb @@ -90,6 +90,7 @@ class ClassificationModelTest < MiniTest::Test assert_nil result.predictions[result.compounds.first][:value] assert_equal "carcinogenic", result.predictions[result.compounds[1]][:value] assert_equal 0.27, result.predictions[result.compounds[1]][:probabilities]["non-carcinogenic"].round(2) + assert_match /High/i, result.predictions[result.compounds[1]][:confidence] end def test_carcinogenicity_rf_classification -- cgit v1.2.3 From 455da06aa6459da0d25b286ca6cb866ff64c4c34 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 20 Jun 2019 22:01:50 +0200 Subject: separate csv serialisations for batch predictions and training data, repeated measurements in mutagenicity dataset fixed, daphnia import fixed, CENTRAL_MONGO_IP removed --- test/classification-model.rb | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'test/classification-model.rb') diff --git a/test/classification-model.rb b/test/classification-model.rb index 79ccb98..c41b211 100644 --- a/test/classification-model.rb +++ b/test/classification-model.rb @@ -84,13 +84,19 @@ class ClassificationModelTest < MiniTest::Test assert_kind_of Dataset, result assert_equal 7, result.features.size assert_equal 85, result.compounds.size - prediction_feature = result.prediction_features.first + prediction_feature = result.prediction_feature assert_equal ["carcinogenic"], result.values(result.compounds[1], prediction_feature) assert_equal ["non-carcinogenic"], result.values(result.compounds[5], prediction_feature) assert_nil result.predictions[result.compounds.first][:value] assert_equal "carcinogenic", result.predictions[result.compounds[1]][:value] assert_equal 0.27, result.predictions[result.compounds[1]][:probabilities]["non-carcinogenic"].round(2) - assert_match /High/i, result.predictions[result.compounds[1]][:confidence] + assert_match /Similar/i, result.predictions[result.compounds[1]][:confidence] + csv = result.to_prediction_csv + rows = csv.split("\n") + assert_equal "ID,Original SMILES,Canonical SMILES,Prediction,Confidence,Probability: carcinogenic,Probability: non-carcinogenic,Measurements", rows[0] + items = rows[2].split(",") + assert_equal "carcinogenic", items[3] + assert_equal 0.27, items[6].to_f.round(2) # probabilities end def test_carcinogenicity_rf_classification -- cgit v1.2.3