From 9d17895ab9e8cd31e0f32e8e622e13612ea5ff77 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Fri, 12 Oct 2018 21:58:36 +0200 Subject: validation statistic fixes --- test/classification-model.rb | 128 ++++++++++++++++++++++++ test/classification-validation.rb | 126 ++++++++++++++++++++++++ test/descriptor.rb | 4 +- test/model-classification.rb | 145 ---------------------------- test/model-nanoparticle.rb | 135 -------------------------- test/model-nanoparticle.rb~ | 135 ++++++++++++++++++++++++++ test/model-regression.rb | 171 --------------------------------- test/model-validation.rb | 19 ---- test/nanomaterial-model-validation.rb | 54 ----------- test/nanomaterial-model-validation.rb~ | 54 +++++++++++ test/regression-model.rb | 171 +++++++++++++++++++++++++++++++++ test/regression-validation.rb | 91 ++++++++++++++++++ test/setup.rb | 4 +- test/validation-classification.rb | 113 ---------------------- test/validation-nanoparticle.rb | 133 ------------------------- test/validation-nanoparticle.rb~ | 133 +++++++++++++++++++++++++ test/validation-regression.rb | 91 ------------------ 17 files changed, 842 insertions(+), 865 deletions(-) create mode 100644 test/classification-model.rb create mode 100644 test/classification-validation.rb delete mode 100644 test/model-classification.rb delete mode 100644 test/model-nanoparticle.rb create mode 100644 test/model-nanoparticle.rb~ delete mode 100644 test/model-regression.rb delete mode 100644 test/model-validation.rb delete mode 100644 test/nanomaterial-model-validation.rb create mode 100644 test/nanomaterial-model-validation.rb~ create mode 100644 test/regression-model.rb create mode 100644 test/regression-validation.rb delete mode 100644 test/validation-classification.rb delete mode 100644 test/validation-nanoparticle.rb create mode 100644 test/validation-nanoparticle.rb~ delete mode 100644 test/validation-regression.rb (limited to 'test') diff --git a/test/classification-model.rb b/test/classification-model.rb 
new file mode 100644 index 0000000..b94b5e6 --- /dev/null +++ b/test/classification-model.rb @@ -0,0 +1,128 @@ +require_relative "setup.rb" + +class LazarClassificationTest < MiniTest::Test + + def test_classification_default + algorithms = { + :descriptors => { + :method => "fingerprint", + :type => "MP2D" + }, + :similarity => { + :method => "Algorithm::Similarity.tanimoto", + :min => 0.5 + }, + :prediction => { + :method => "Algorithm::Classification.weighted_majority_vote", + }, + :feature_selection => nil, + } + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") + model = Model::Lazar.create training_dataset: training_dataset + assert_kind_of Model::LazarClassification, model + assert_equal algorithms, model.algorithms + [ { + :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"), + :prediction => "false", + },{ + :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"), + :prediction => "false", + } ].each do |example| + prediction = model.predict example[:compound] + p example[:compound] + p prediction + #assert_equal example[:prediction], prediction[:value] + end + + compound = Compound.from_smiles "CCO" + prediction = model.predict compound + assert_equal "true", prediction[:value] + assert_equal ["false"], prediction[:measurements] + + # make a dataset prediction + compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv") + prediction_dataset = model.predict compound_dataset + assert_equal compound_dataset.compounds, prediction_dataset.compounds + + cid = prediction_dataset.compounds[7].id.to_s + assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warnings][0] + expectations = ["Cannot create prediction: Only one similar compound in the training set.", + "Could not find similar substances with experimental data in the training dataset."] + 
prediction_dataset.predictions.each do |cid,pred| + assert_includes expectations, pred[:warnings][0] if pred[:value].nil? + end + cid = Compound.from_smiles("CCOC(=O)N").id.to_s + assert_match "excluded", prediction_dataset.predictions[cid][:info] + end + + def test_classification_parameters + algorithms = { + :descriptors => { + :method => "fingerprint", + :type => "MACCS" + }, + :similarity => { + :min => 0.4 + }, + } + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") + model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms + assert_kind_of Model::LazarClassification, model + assert_equal "Algorithm::Classification.weighted_majority_vote", model.algorithms[:prediction][:method] + assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method] + assert_equal algorithms[:similarity][:min], model.algorithms[:similarity][:min] + substance = training_dataset.substances[10] + prediction = model.predict substance + assert_equal "false", prediction[:value] + assert_equal 4, prediction[:neighbors].size + end + + def test_dataset_prediction + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") + model = Model::Lazar.create training_dataset: training_dataset + result = model.predict training_dataset + assert_kind_of Dataset, result + assert 3, result.features.size + assert 8, result.compounds.size + assert_equal ["true"], result.values(result.compounds.first, result.features[0]) + assert_equal [0.65], result.values(result.compounds.first, result.features[1]) + assert_equal [0], result.values(result.compounds.first, result.features[2]) # classification returns nil, check if + end + + def test_carcinogenicity_rf_classification + skip "Caret rf may run into a (endless?) loop for some compounds." 
+ dataset = Dataset.from_csv_file "#{DATA_DIR}/multi_cell_call.csv" + algorithms = { + :prediction => { + :method => "Algorithm::Caret.rf", + }, + } + model = Model::Lazar.create training_dataset: dataset, algorithms: algorithms + substance = Compound.from_smiles "[O-]S(=O)(=O)[O-].[Mn+2].O" + prediction = model.predict substance + p prediction + + end + + def test_rf_classification + skip "Caret rf may run into a (endless?) loop for some compounds." + algorithms = { + :prediction => { + :method => "Algorithm::Caret.rf", + }, + } + training_dataset = Dataset.from_sdf_file File.join(DATA_DIR,"cas_4337.sdf") + model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms + #p model.id.to_s + #model = Model::Lazar.find "5bbb4c0cca626909f6c8a924" + assert_kind_of Model::LazarClassification, model + assert_equal algorithms[:prediction][:method], model.algorithms["prediction"]["method"] + substance = Compound.from_smiles "Clc1ccc(cc1)C(=O)c1ccc(cc1)OC(C(=O)O)(C)C" + prediction = model.predict substance + assert_equal 51, prediction[:neighbors].size + assert_equal "nonmutagen", prediction[:value] + assert_equal 0.1, prediction[:probabilities]["mutagen"].round(1) + assert_equal 0.9, prediction[:probabilities]["nonmutagen"].round(1) + end + +end diff --git a/test/classification-validation.rb b/test/classification-validation.rb new file mode 100644 index 0000000..6ff8be0 --- /dev/null +++ b/test/classification-validation.rb @@ -0,0 +1,126 @@ +require_relative "setup.rb" + +class ValidationClassificationTest < MiniTest::Test + include OpenTox::Validation + + # defaults + + def test_default_classification_crossvalidation + dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" + model = Model::Lazar.create training_dataset: dataset + cv = ClassificationCrossValidation.create model + assert cv.accuracy[:without_warnings] > 0.65, "Accuracy (#{cv.accuracy[:without_warnings]}) should be larger than 0.65, this may occur due to an 
unfavorable training/test set split" + assert cv.weighted_accuracy[:all] > cv.accuracy[:all], "Weighted accuracy (#{cv.weighted_accuracy[:all]}) should be larger than accuracy (#{cv.accuracy[:all]})." + File.open("/tmp/tmp.pdf","w+"){|f| f.puts cv.probability_plot(format:"pdf")} + assert_match "PDF", `file -b /tmp/tmp.pdf` + File.open("/tmp/tmp.png","w+"){|f| f.puts cv.probability_plot(format:"png")} + assert_match "PNG", `file -b /tmp/tmp.png` + end + + # parameters + + def test_classification_crossvalidation_parameters + dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" + algorithms = { + :similarity => { :min => 0.3, }, + :descriptors => { :type => "FP3" } + } + model = Model::Lazar.create training_dataset: dataset, algorithms: algorithms + cv = ClassificationCrossValidation.create model + params = model.algorithms + params = JSON.parse(params.to_json) # convert symbols to string + + cv.validations.each do |validation| + validation_params = validation.model.algorithms + refute_nil model.training_dataset_id + refute_nil validation.model.training_dataset_id + refute_equal model.training_dataset_id, validation.model.training_dataset_id + assert_equal params, validation_params + end + end + + # LOO + + def test_classification_loo_validation + dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" + model = Model::Lazar.create training_dataset: dataset + loo = ClassificationLeaveOneOut.create model + assert_equal 77, loo.nr_unpredicted + refute_empty loo.confusion_matrix + assert loo.accuracy[:without_warnings] > 0.650 + assert loo.weighted_accuracy[:all] > loo.accuracy[:all], "Weighted accuracy (#{loo.weighted_accuracy[:all]}) should be larger than accuracy (#{loo.accuracy[:all]})." 
+ end + + # repeated CV + + def test_repeated_crossvalidation + dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" + model = Model::Lazar.create training_dataset: dataset + repeated_cv = RepeatedCrossValidation.create model + repeated_cv.crossvalidations.each do |cv| + assert_operator cv.accuracy[:without_warnings], :>, 0.65, "model accuracy < 0.65, this may happen by chance due to an unfavorable training/test set split" + end + end + + def test_validation_model + m = Model::Validation.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" + [:endpoint,:species,:source].each do |p| + refute_empty m[p] + end + assert m.classification? + refute m.regression? + m.crossvalidations.each do |cv| + assert cv.accuracy[:without_warnings] > 0.65, "Crossvalidation accuracy (#{cv.accuracy[:without_warnings]}) should be larger than 0.65. This may happen due to an unfavorable training/test set split." + end + prediction = m.predict Compound.from_smiles("OCC(CN(CC(O)C)N=O)O") + assert_equal "false", prediction[:value] + m.delete + end + + def test_carcinogenicity_rf_classification + skip "Caret rf classification may run into a (endless?) loop for some compounds." + dataset = Dataset.from_csv_file "#{DATA_DIR}/multi_cell_call.csv" + algorithms = { + :prediction => { + :method => "Algorithm::Caret.rf", + }, + } + model = Model::Lazar.create training_dataset: dataset, algorithms: algorithms + cv = ClassificationCrossValidation.create model +# cv = ClassificationCrossValidation.find "5bbc822dca626919731e2822" + puts cv.statistics + puts cv.id + + end + + def test_mutagenicity_classification_algorithms + skip "Caret rf classification may run into a (endless?) loop for some compounds." 
+ source_feature = Feature.where(:name => "Ames test categorisation").first + target_feature = Feature.where(:name => "Mutagenicity").first + kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" + hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv" + efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv" + dataset = Dataset.merge [kazius,hansen,efsa], {source_feature => target_feature}, {1 => "mutagen", 0 => "nonmutagen"} + model = Model::Lazar.create training_dataset: dataset + repeated_cv = RepeatedCrossValidation.create model + puts repeated_cv.id + repeated_cv.crossvalidations.each do |cv| + puts cv.accuracy + puts cv.confusion_matrix + end + algorithms = { + :prediction => { + :method => "Algorithm::Caret.rf", + }, + } + model = Model::Lazar.create training_dataset: dataset, algorithms: algorithms + repeated_cv = RepeatedCrossValidation.create model + puts repeated_cv.id + repeated_cv.crossvalidations.each do |cv| + puts cv.accuracy + puts cv.confusion_matrix + end + + end + +end diff --git a/test/descriptor.rb b/test/descriptor.rb index 563cdce..95211f5 100644 --- a/test/descriptor.rb +++ b/test/descriptor.rb @@ -4,10 +4,10 @@ class DescriptorTest < MiniTest::Test def test_list # check available descriptors - assert_equal 15,PhysChem.openbabel_descriptors.size,"incorrect number of Openbabel descriptors" + assert_equal 16,PhysChem.openbabel_descriptors.size,"incorrect number of Openbabel descriptors" assert_equal 45,PhysChem.joelib_descriptors.size,"incorrect number of Joelib descriptors" assert_equal 286,PhysChem.cdk_descriptors.size,"incorrect number of Cdk descriptors" - assert_equal 346,PhysChem.descriptors.size,"incorrect number of physchem descriptors" + assert_equal 347,PhysChem.descriptors.size,"incorrect number of physchem descriptors" end def test_smarts diff --git a/test/model-classification.rb b/test/model-classification.rb deleted file mode 100644 index ca6eb27..0000000 --- a/test/model-classification.rb +++ /dev/null @@ -1,145 +0,0 @@ 
-require_relative "setup.rb" - -class LazarClassificationTest < MiniTest::Test - - def test_classification_default - algorithms = { - :descriptors => { - :method => "fingerprint", - :type => "MP2D" - }, - :similarity => { - :method => "Algorithm::Similarity.tanimoto", - :min => 0.1 - }, - :prediction => { - :method => "Algorithm::Classification.weighted_majority_vote", - }, - :feature_selection => nil, - } - training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") - model = Model::Lazar.create training_dataset: training_dataset - assert_kind_of Model::LazarClassification, model - assert_equal algorithms, model.algorithms - substance = training_dataset.substances[10] - prediction = model.predict substance - assert_equal "false", prediction[:value] - [ { - :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"), - :prediction => "false", - },{ - :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"), - :prediction => "false", - } ].each do |example| - prediction = model.predict example[:compound] - assert_equal example[:prediction], prediction[:value] - end - - compound = Compound.from_smiles "CCO" - prediction = model.predict compound - assert_equal "true", prediction[:value] - assert_equal ["false"], prediction[:measurements] - - # make a dataset prediction - compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv") - prediction_dataset = model.predict compound_dataset - assert_equal compound_dataset.compounds, prediction_dataset.compounds - - cid = prediction_dataset.compounds[7].id.to_s - assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warnings][0] - expectations = ["Cannot create prediction: Only one similar compound in the training set.", - "Could not find similar substances with experimental data in the training dataset."] - prediction_dataset.predictions.each do |cid,pred| - 
assert_includes expectations, pred[:warnings][0] if pred[:value].nil? - end - cid = Compound.from_smiles("CCOC(=O)N").id.to_s - assert_match "excluded", prediction_dataset.predictions[cid][:info] - # cleanup - [training_dataset,model,compound_dataset,prediction_dataset].each{|o| o.delete} - end - - def test_classification_parameters - algorithms = { - :descriptors => { - :method => "fingerprint", - :type => "MACCS" - }, - :similarity => { - :min => 0.4 - }, - } - training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") - model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms - assert_kind_of Model::LazarClassification, model - assert_equal "Algorithm::Classification.weighted_majority_vote", model.algorithms[:prediction][:method] - assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method] - assert_equal algorithms[:similarity][:min], model.algorithms[:similarity][:min] - substance = training_dataset.substances[10] - prediction = model.predict substance - assert_equal "false", prediction[:value] - assert_equal 4, prediction[:neighbors].size - end - - def test_kazius - t = Time.now - training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"kazius.csv") - t = Time.now - model = Model::Lazar.create training_dataset: training_dataset - t = Time.now - 2.times do - compound = Compound.from_smiles("Clc1ccccc1NN") - prediction = model.predict compound - assert_equal "1", prediction[:value] - end - training_dataset.delete - end - - def test_dataset_prediction - training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") - model = Model::Lazar.create training_dataset: training_dataset - result = model.predict training_dataset - assert 3, result.features.size - assert 8, result.compounds.size - assert_equal ["true"], result.values(result.compounds.first, result.features[0]) - assert_equal [0.65], result.values(result.compounds.first, 
result.features[1]) - assert_equal [0], result.values(result.compounds.first, result.features[2]) # classification returns nil, check if - #p prediction_dataset - end - - def test_carcinogenicity_rf_classification - skip "Caret rf may run into a (endless?) loop for some compounds." - dataset = Dataset.from_csv_file "#{DATA_DIR}/multi_cell_call.csv" - algorithms = { - :prediction => { - :method => "Algorithm::Caret.rf", - }, - } - model = Model::Lazar.create training_dataset: dataset, algorithms: algorithms - substance = Compound.from_smiles "[O-]S(=O)(=O)[O-].[Mn+2].O" - prediction = model.predict substance - p prediction - - end - - def test_rf_classification - skip "Caret rf may run into a (endless?) loop for some compounds." - algorithms = { - :prediction => { - :method => "Algorithm::Caret.rf", - }, - } - training_dataset = Dataset.from_sdf_file File.join(DATA_DIR,"cas_4337.sdf") - model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms - #p model.id.to_s - #model = Model::Lazar.find "5bbb4c0cca626909f6c8a924" - assert_kind_of Model::LazarClassification, model - assert_equal algorithms[:prediction][:method], model.algorithms["prediction"]["method"] - substance = Compound.from_smiles "Clc1ccc(cc1)C(=O)c1ccc(cc1)OC(C(=O)O)(C)C" - prediction = model.predict substance - assert_equal 51, prediction[:neighbors].size - assert_equal "nonmutagen", prediction[:value] - assert_equal 0.1, prediction[:probabilities]["mutagen"].round(1) - assert_equal 0.9, prediction[:probabilities]["nonmutagen"].round(1) - end - -end diff --git a/test/model-nanoparticle.rb b/test/model-nanoparticle.rb deleted file mode 100644 index 67bbfdd..0000000 --- a/test/model-nanoparticle.rb +++ /dev/null @@ -1,135 +0,0 @@ -require_relative "setup.rb" - -class NanoparticleModelTest < MiniTest::Test - include OpenTox::Validation - - def setup - @training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver 
Nanoparticles").first - @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first - end - - def test_core_coating_source_uris - @training_dataset.nanoparticles.each do |np| - refute_nil np.core.source - np.coating.each{|c| refute_nil c.source} - end - end - - def test_nanoparticle_model - assert true, @prediction_feature.measured - model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature - refute_empty model.dependent_variables - refute_empty model.descriptor_ids - refute_empty model.independent_variables - assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method] - assert_equal "Algorithm::Similarity.weighted_cosine", model.algorithms[:similarity][:method] - nanoparticle = @training_dataset.nanoparticles[-34] - assert_includes nanoparticle.dataset_ids, @training_dataset.id - prediction = model.predict nanoparticle - refute_nil prediction[:value] - assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." - prediction = model.predict @training_dataset.substances[14] - refute_nil prediction[:value] - assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." 
- model.delete - end - - def test_nanoparticle_fingerprint_model - assert true, @prediction_feature.measured - algorithms = { - :descriptors => { - :method => "fingerprint", - :type => "MP2D", - }, - :similarity => { - :method => "Algorithm::Similarity.tanimoto", - :min => 0.1 - }, - :feature_selection => nil - } - model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms - refute_empty model.dependent_variables - refute_empty model.descriptor_ids - refute_empty model.independent_variables - assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method] - assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method] - assert_nil model.algorithms[:descriptors][:categories] - nanoparticle = @training_dataset.nanoparticles[-34] - assert_includes nanoparticle.dataset_ids, @training_dataset.id - prediction = model.predict nanoparticle - refute_nil prediction[:value] - assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." - prediction = model.predict @training_dataset.substances[14] - refute_nil prediction[:value] - assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." 
- model.delete - end - - def test_nanoparticle_fingerprint_model_with_feature_selection - assert true, @prediction_feature.measured - algorithms = { - :descriptors => { - :method => "fingerprint", - :type => "MP2D", - }, - :similarity => { - :method => "Algorithm::Similarity.tanimoto", - :min => 0.1 - }, - } - model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms - refute_empty model.algorithms[:feature_selection] - refute_empty model.dependent_variables - refute_empty model.descriptor_ids - refute_empty model.independent_variables - assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method] - assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method] - nanoparticle = @training_dataset.nanoparticles[-34] - assert_includes nanoparticle.dataset_ids, @training_dataset.id - prediction = model.predict nanoparticle - refute_nil prediction[:value] - assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." - prediction = model.predict @training_dataset.substances[14] - refute_nil prediction[:value] - assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." 
- model.delete - end - - def test_nanoparticle_calculated_properties_model - skip "Nanoparticle calculate_properties similarity not yet implemented" - assert true, @prediction_feature.measured - algorithms = { - :descriptors => { - :method => "calculate_properties", - :features => PhysChem.openbabel_descriptors, - }, - :similarity => { - :method => "Algorithm::Similarity.weighted_cosine", - :min => 0.5 - }, - :prediction => { - :method => "Algorithm::Regression.weighted_average", - }, - } - model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms - refute_empty model.dependent_variables - refute_empty model.descriptor_ids - refute_empty model.independent_variables - assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method] - assert_equal "Algorithm::Similarity.weighted", model.algorithms[:similarity][:method] - nanoparticle = @training_dataset.nanoparticles[-34] - assert_includes nanoparticle.dataset_ids, @training_dataset.id - prediction = model.predict nanoparticle - refute_nil prediction[:value] - assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." - prediction = model.predict @training_dataset.substances[14] - refute_nil prediction[:value] - assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." 
- model.delete - end - - def test_import_ld - skip # Ambit JSON-LD export defunct - dataset_ids = Import::Enanomapper.import_ld - end -end diff --git a/test/model-nanoparticle.rb~ b/test/model-nanoparticle.rb~ new file mode 100644 index 0000000..67bbfdd --- /dev/null +++ b/test/model-nanoparticle.rb~ @@ -0,0 +1,135 @@ +require_relative "setup.rb" + +class NanoparticleModelTest < MiniTest::Test + include OpenTox::Validation + + def setup + @training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first + end + + def test_core_coating_source_uris + @training_dataset.nanoparticles.each do |np| + refute_nil np.core.source + np.coating.each{|c| refute_nil c.source} + end + end + + def test_nanoparticle_model + assert true, @prediction_feature.measured + model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature + refute_empty model.dependent_variables + refute_empty model.descriptor_ids + refute_empty model.independent_variables + assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method] + assert_equal "Algorithm::Similarity.weighted_cosine", model.algorithms[:similarity][:method] + nanoparticle = @training_dataset.nanoparticles[-34] + assert_includes nanoparticle.dataset_ids, @training_dataset.id + prediction = model.predict nanoparticle + refute_nil prediction[:value] + assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." 
+ prediction = model.predict @training_dataset.substances[14] + refute_nil prediction[:value] + assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." + model.delete + end + + def test_nanoparticle_fingerprint_model + assert true, @prediction_feature.measured + algorithms = { + :descriptors => { + :method => "fingerprint", + :type => "MP2D", + }, + :similarity => { + :method => "Algorithm::Similarity.tanimoto", + :min => 0.1 + }, + :feature_selection => nil + } + model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms + refute_empty model.dependent_variables + refute_empty model.descriptor_ids + refute_empty model.independent_variables + assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method] + assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method] + assert_nil model.algorithms[:descriptors][:categories] + nanoparticle = @training_dataset.nanoparticles[-34] + assert_includes nanoparticle.dataset_ids, @training_dataset.id + prediction = model.predict nanoparticle + refute_nil prediction[:value] + assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." + prediction = model.predict @training_dataset.substances[14] + refute_nil prediction[:value] + assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." 
+ model.delete + end + + def test_nanoparticle_fingerprint_model_with_feature_selection + assert true, @prediction_feature.measured + algorithms = { + :descriptors => { + :method => "fingerprint", + :type => "MP2D", + }, + :similarity => { + :method => "Algorithm::Similarity.tanimoto", + :min => 0.1 + }, + } + model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms + refute_empty model.algorithms[:feature_selection] + refute_empty model.dependent_variables + refute_empty model.descriptor_ids + refute_empty model.independent_variables + assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method] + assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method] + nanoparticle = @training_dataset.nanoparticles[-34] + assert_includes nanoparticle.dataset_ids, @training_dataset.id + prediction = model.predict nanoparticle + refute_nil prediction[:value] + assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." + prediction = model.predict @training_dataset.substances[14] + refute_nil prediction[:value] + assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." 
+ model.delete + end + + def test_nanoparticle_calculated_properties_model + skip "Nanoparticle calculate_properties similarity not yet implemented" + assert true, @prediction_feature.measured + algorithms = { + :descriptors => { + :method => "calculate_properties", + :features => PhysChem.openbabel_descriptors, + }, + :similarity => { + :method => "Algorithm::Similarity.weighted_cosine", + :min => 0.5 + }, + :prediction => { + :method => "Algorithm::Regression.weighted_average", + }, + } + model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms + refute_empty model.dependent_variables + refute_empty model.descriptor_ids + refute_empty model.independent_variables + assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method] + assert_equal "Algorithm::Similarity.weighted", model.algorithms[:similarity][:method] + nanoparticle = @training_dataset.nanoparticles[-34] + assert_includes nanoparticle.dataset_ids, @training_dataset.id + prediction = model.predict nanoparticle + refute_nil prediction[:value] + assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." + prediction = model.predict @training_dataset.substances[14] + refute_nil prediction[:value] + assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." 
+ model.delete + end + + def test_import_ld + skip # Ambit JSON-LD export defunct + dataset_ids = Import::Enanomapper.import_ld + end +end diff --git a/test/model-regression.rb b/test/model-regression.rb deleted file mode 100644 index 5903e88..0000000 --- a/test/model-regression.rb +++ /dev/null @@ -1,171 +0,0 @@ -require_relative "setup.rb" - -class LazarRegressionTest < MiniTest::Test - - def test_default_regression - algorithms = { - :descriptors => { - :method => "fingerprint", - :type => "MP2D" - }, - :similarity => { - :method => "Algorithm::Similarity.tanimoto", - :min => 0.5 - }, - :prediction => { - :method => "Algorithm::Caret.rf", - }, - :feature_selection => nil, - } - training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM_log10.csv") - model = Model::Lazar.create training_dataset: training_dataset - assert_kind_of Model::LazarRegression, model - assert_equal algorithms, model.algorithms - substance = training_dataset.substances[145] - prediction = model.predict substance - assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." 
- substance = Compound.from_smiles "c1ccc(cc1)Oc1ccccc1" - prediction = model.predict substance - refute_nil prediction[:value] - refute_nil prediction[:prediction_interval] - refute_empty prediction[:neighbors] - end - - def test_weighted_average - training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv" - algorithms = { - :similarity => { - :min => 0 - }, - :prediction => { - :method => "Algorithm::Regression.weighted_average", - }, - } - model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms - compound = Compound.from_smiles "CC(C)(C)CN" - prediction = model.predict compound - assert_equal -0.86, prediction[:value].round(2) - assert_equal model.substance_ids.size, prediction[:neighbors].size - end - - def test_mpd_fingerprints - training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv" - algorithms = { - :descriptors => { - :method => "fingerprint", - :type => "MP2D" - }, - } - model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms - compound = Compound.from_smiles "CCCSCCSCC" - prediction = model.predict compound - assert_equal 3, prediction[:neighbors].size - assert prediction[:value].round(2) > 1.37, "Prediction value (#{prediction[:value].round(2)}) should be larger than 1.37." 
- end - - def test_local_physchem_regression - training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv" - algorithms = { - :descriptors => { - :method => "calculate_properties", - :features => PhysChem.openbabel_descriptors, - }, - :similarity => { - :method => "Algorithm::Similarity.weighted_cosine", - :min => 0.5 - }, - } - model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms) - compound = Compound.from_smiles "NC(=O)OCCC" - prediction = model.predict compound - refute_nil prediction[:value] - end - - def test_local_physchem_regression_with_feature_selection - training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv" - algorithms = { - :descriptors => { - :method => "calculate_properties", - :features => PhysChem.openbabel_descriptors, - }, - :similarity => { - :method => "Algorithm::Similarity.weighted_cosine", - :min => 0.5 - }, - :feature_selection => { - :method => "Algorithm::FeatureSelection.correlation_filter", - }, - } - model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms) - compound = Compound.from_smiles "NC(=O)OCCC" - prediction = model.predict compound - refute_nil prediction[:value] - end - - def test_unweighted_cosine_physchem_regression - algorithms = { - :descriptors => { - :method => "calculate_properties", - :features => PhysChem.openbabel_descriptors, - }, - :similarity => { - :method => "Algorithm::Similarity.cosine", - } - } - training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv") - model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms - assert_kind_of Model::LazarRegression, model - assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method] - assert_equal "Algorithm::Similarity.cosine", model.algorithms[:similarity][:method] - assert_equal 0.5, model.algorithms[:similarity][:min] - algorithms[:descriptors].delete :features - assert_equal 
algorithms[:descriptors], model.algorithms[:descriptors] - prediction = model.predict training_dataset.substances[10] - refute_nil prediction[:value] - end - - def test_regression_with_feature_selection - algorithms = { - :feature_selection => { - :method => "Algorithm::FeatureSelection.correlation_filter", - }, - } - training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM_log10.csv") - model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms - assert_kind_of Model::LazarRegression, model - assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method] - assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method] - assert_equal 0.5, model.algorithms[:similarity][:min] - assert_equal algorithms[:feature_selection][:method], model.algorithms[:feature_selection][:method] - prediction = model.predict training_dataset.substances[145] - refute_nil prediction[:value] - end - - def test_regression_parameters - algorithms = { - :descriptors => { - :method => "fingerprint", - :type => "MP2D" - }, - :similarity => { - :method => "Algorithm::Similarity.tanimoto", - :min => 0.3 - }, - :prediction => { - :method => "Algorithm::Regression.weighted_average", - }, - :feature_selection => nil, - } - training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv") - model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms - assert_kind_of Model::LazarRegression, model - assert_equal "Algorithm::Regression.weighted_average", model.algorithms[:prediction][:method] - assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method] - assert_equal algorithms[:similarity][:min], model.algorithms[:similarity][:min] - assert_equal algorithms[:prediction][:parameters], model.algorithms[:prediction][:parameters] - substance = training_dataset.substances[10] - prediction = model.predict substance - assert_equal 0.83, prediction[:value].round(2) - 
end - -end diff --git a/test/model-validation.rb b/test/model-validation.rb deleted file mode 100644 index 9304232..0000000 --- a/test/model-validation.rb +++ /dev/null @@ -1,19 +0,0 @@ -require_relative "setup.rb" - -class ValidationModelTest < MiniTest::Test - - def test_validation_model - m = Model::Validation.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" - [:endpoint,:species,:source].each do |p| - refute_empty m[p] - end - assert m.classification? - refute m.regression? - m.crossvalidations.each do |cv| - assert cv.accuracy > 0.74, "Crossvalidation accuracy (#{cv.accuracy}) should be larger than 0.75. This may happen due to an unfavorable training/test set split." - end - prediction = m.predict Compound.from_smiles("OCC(CN(CC(O)C)N=O)O") - assert_equal "true", prediction[:value] - m.delete - end -end diff --git a/test/nanomaterial-model-validation.rb b/test/nanomaterial-model-validation.rb deleted file mode 100644 index 9eaa17d..0000000 --- a/test/nanomaterial-model-validation.rb +++ /dev/null @@ -1,54 +0,0 @@ -require_relative "setup.rb" - -class NanomaterialValidationModelTest < MiniTest::Test - - def setup - @training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first - end - - def test_default_nanomaterial_validation_model - validation_model = Model::Validation.from_enanomapper - [:endpoint,:species,:source].each do |p| - refute_empty validation_model[p] - end - assert validation_model.regression? - refute validation_model.classification? 
- validation_model.crossvalidations.each do |cv| - refute_nil cv.r_squared - refute_nil cv.rmse - end - nanoparticle = @training_dataset.nanoparticles[-34] - assert_includes nanoparticle.dataset_ids, @training_dataset.id - prediction = validation_model.predict nanoparticle - refute_nil prediction[:value] - assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." - validation_model.delete - end - - def test_nanomaterial_validation_model_parameters - algorithms = { - :descriptors => { - :method => "fingerprint", - :type => "MP2D", - }, - :similarity => { - :method => "Algorithm::Similarity.tanimoto", - :min => 0.1 - }, - :prediction => { :method => "OpenTox::Algorithm::Regression.weighted_average" }, - :feature_selection => nil - } - validation_model = Model::Validation.from_enanomapper algorithms: algorithms - assert validation_model.regression? - refute validation_model.classification? 
- validation_model.crossvalidations.each do |cv| - refute_nil cv.r_squared - refute_nil cv.rmse - end - nanoparticle = @training_dataset.nanoparticles[-34] - assert_includes nanoparticle.dataset_ids, @training_dataset.id - prediction = validation_model.predict nanoparticle - refute_nil prediction[:value] - end -end diff --git a/test/nanomaterial-model-validation.rb~ b/test/nanomaterial-model-validation.rb~ new file mode 100644 index 0000000..9eaa17d --- /dev/null +++ b/test/nanomaterial-model-validation.rb~ @@ -0,0 +1,54 @@ +require_relative "setup.rb" + +class NanomaterialValidationModelTest < MiniTest::Test + + def setup + @training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first + end + + def test_default_nanomaterial_validation_model + validation_model = Model::Validation.from_enanomapper + [:endpoint,:species,:source].each do |p| + refute_empty validation_model[p] + end + assert validation_model.regression? + refute validation_model.classification? + validation_model.crossvalidations.each do |cv| + refute_nil cv.r_squared + refute_nil cv.rmse + end + nanoparticle = @training_dataset.nanoparticles[-34] + assert_includes nanoparticle.dataset_ids, @training_dataset.id + prediction = validation_model.predict nanoparticle + refute_nil prediction[:value] + assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." 
+ validation_model.delete + end + + def test_nanomaterial_validation_model_parameters + algorithms = { + :descriptors => { + :method => "fingerprint", + :type => "MP2D", + }, + :similarity => { + :method => "Algorithm::Similarity.tanimoto", + :min => 0.1 + }, + :prediction => { :method => "OpenTox::Algorithm::Regression.weighted_average" }, + :feature_selection => nil + } + validation_model = Model::Validation.from_enanomapper algorithms: algorithms + assert validation_model.regression? + refute validation_model.classification? + validation_model.crossvalidations.each do |cv| + refute_nil cv.r_squared + refute_nil cv.rmse + end + nanoparticle = @training_dataset.nanoparticles[-34] + assert_includes nanoparticle.dataset_ids, @training_dataset.id + prediction = validation_model.predict nanoparticle + refute_nil prediction[:value] + end +end diff --git a/test/regression-model.rb b/test/regression-model.rb new file mode 100644 index 0000000..5903e88 --- /dev/null +++ b/test/regression-model.rb @@ -0,0 +1,171 @@ +require_relative "setup.rb" + +class LazarRegressionTest < MiniTest::Test + + def test_default_regression + algorithms = { + :descriptors => { + :method => "fingerprint", + :type => "MP2D" + }, + :similarity => { + :method => "Algorithm::Similarity.tanimoto", + :min => 0.5 + }, + :prediction => { + :method => "Algorithm::Caret.rf", + }, + :feature_selection => nil, + } + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM_log10.csv") + model = Model::Lazar.create training_dataset: training_dataset + assert_kind_of Model::LazarRegression, model + assert_equal algorithms, model.algorithms + substance = training_dataset.substances[145] + prediction = model.predict substance + assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." 
+ substance = Compound.from_smiles "c1ccc(cc1)Oc1ccccc1" + prediction = model.predict substance + refute_nil prediction[:value] + refute_nil prediction[:prediction_interval] + refute_empty prediction[:neighbors] + end + + def test_weighted_average + training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv" + algorithms = { + :similarity => { + :min => 0 + }, + :prediction => { + :method => "Algorithm::Regression.weighted_average", + }, + } + model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms + compound = Compound.from_smiles "CC(C)(C)CN" + prediction = model.predict compound + assert_equal -0.86, prediction[:value].round(2) + assert_equal model.substance_ids.size, prediction[:neighbors].size + end + + def test_mpd_fingerprints + training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv" + algorithms = { + :descriptors => { + :method => "fingerprint", + :type => "MP2D" + }, + } + model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms + compound = Compound.from_smiles "CCCSCCSCC" + prediction = model.predict compound + assert_equal 3, prediction[:neighbors].size + assert prediction[:value].round(2) > 1.37, "Prediction value (#{prediction[:value].round(2)}) should be larger than 1.37." 
+ end + + def test_local_physchem_regression + training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv" + algorithms = { + :descriptors => { + :method => "calculate_properties", + :features => PhysChem.openbabel_descriptors, + }, + :similarity => { + :method => "Algorithm::Similarity.weighted_cosine", + :min => 0.5 + }, + } + model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms) + compound = Compound.from_smiles "NC(=O)OCCC" + prediction = model.predict compound + refute_nil prediction[:value] + end + + def test_local_physchem_regression_with_feature_selection + training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv" + algorithms = { + :descriptors => { + :method => "calculate_properties", + :features => PhysChem.openbabel_descriptors, + }, + :similarity => { + :method => "Algorithm::Similarity.weighted_cosine", + :min => 0.5 + }, + :feature_selection => { + :method => "Algorithm::FeatureSelection.correlation_filter", + }, + } + model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms) + compound = Compound.from_smiles "NC(=O)OCCC" + prediction = model.predict compound + refute_nil prediction[:value] + end + + def test_unweighted_cosine_physchem_regression + algorithms = { + :descriptors => { + :method => "calculate_properties", + :features => PhysChem.openbabel_descriptors, + }, + :similarity => { + :method => "Algorithm::Similarity.cosine", + } + } + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv") + model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms + assert_kind_of Model::LazarRegression, model + assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method] + assert_equal "Algorithm::Similarity.cosine", model.algorithms[:similarity][:method] + assert_equal 0.5, model.algorithms[:similarity][:min] + algorithms[:descriptors].delete :features + assert_equal 
algorithms[:descriptors], model.algorithms[:descriptors] + prediction = model.predict training_dataset.substances[10] + refute_nil prediction[:value] + end + + def test_regression_with_feature_selection + algorithms = { + :feature_selection => { + :method => "Algorithm::FeatureSelection.correlation_filter", + }, + } + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM_log10.csv") + model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms + assert_kind_of Model::LazarRegression, model + assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method] + assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method] + assert_equal 0.5, model.algorithms[:similarity][:min] + assert_equal algorithms[:feature_selection][:method], model.algorithms[:feature_selection][:method] + prediction = model.predict training_dataset.substances[145] + refute_nil prediction[:value] + end + + def test_regression_parameters + algorithms = { + :descriptors => { + :method => "fingerprint", + :type => "MP2D" + }, + :similarity => { + :method => "Algorithm::Similarity.tanimoto", + :min => 0.3 + }, + :prediction => { + :method => "Algorithm::Regression.weighted_average", + }, + :feature_selection => nil, + } + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv") + model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms + assert_kind_of Model::LazarRegression, model + assert_equal "Algorithm::Regression.weighted_average", model.algorithms[:prediction][:method] + assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method] + assert_equal algorithms[:similarity][:min], model.algorithms[:similarity][:min] + assert_equal algorithms[:prediction][:parameters], model.algorithms[:prediction][:parameters] + substance = training_dataset.substances[10] + prediction = model.predict substance + assert_equal 0.83, prediction[:value].round(2) + 
end + +end diff --git a/test/regression-validation.rb b/test/regression-validation.rb new file mode 100644 index 0000000..44162c0 --- /dev/null +++ b/test/regression-validation.rb @@ -0,0 +1,91 @@ +require_relative "setup.rb" + +class ValidationRegressionTest < MiniTest::Test + include OpenTox::Validation + + # defaults + + def test_default_regression_crossvalidation + dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM_log10.csv" + model = Model::Lazar.create training_dataset: dataset + cv = RegressionCrossValidation.create model + assert cv.rmse[:all] < 1.5, "RMSE #{cv.rmse[:all]} should be smaller than 1.5, this may occur due to unfavorable training/test set splits" + assert cv.mae[:all] < 1.1, "MAE #{cv.mae[:all]} should be smaller than 1.1, this may occur due to unfavorable training/test set splits" + assert cv.within_prediction_interval[:all]/cv.nr_predictions[:all] > 0.8, "Only #{(100*cv.within_prediction_interval[:all]/cv.nr_predictions[:all]).round(2)}% of measurement within prediction interval. 
This may occur due to unfavorable training/test set splits" + end + + # parameters + + def test_regression_crossvalidation_params + dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv" + algorithms = { + :prediction => { :method => "OpenTox::Algorithm::Regression.weighted_average" }, + :descriptors => { :type => "MACCS", }, + :similarity => {:min => 0.7} + } + model = Model::Lazar.create training_dataset: dataset, algorithms: algorithms + assert_equal algorithms[:descriptors][:type], model.algorithms[:descriptors][:type] + cv = RegressionCrossValidation.create model + cv.validation_ids.each do |vid| + model = Model::Lazar.find(Validation.find(vid).model_id) + assert_equal algorithms[:descriptors][:type], model.algorithms[:descriptors][:type] + assert_equal algorithms[:similarity][:min], model.algorithms[:similarity][:min] + refute_nil model.training_dataset_id + refute_equal dataset.id, model.training_dataset_id + end + + refute_nil cv.rmse[:all] + refute_nil cv.mae[:all] + end + + def test_physchem_regression_crossvalidation + training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv") + model = Model::Lazar.create training_dataset:training_dataset + cv = RegressionCrossValidation.create model + refute_nil cv.rmse[:all] + refute_nil cv.mae[:all] + end + + # LOO + + def test_regression_loo_validation + dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv") + model = Model::Lazar.create training_dataset: dataset + loo = RegressionLeaveOneOut.create model + assert loo.r_squared[:all] > 0.34, "R^2 (#{loo.r_squared[:all]}) should be larger than 0.34" + end + + def test_regression_loo_validation_with_feature_selection + dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv") + algorithms = { + :descriptors => { + :method => "calculate_properties", + :features => PhysChem.openbabel_descriptors, + }, + :similarity => { + :method => 
"Algorithm::Similarity.weighted_cosine", + :min => 0.5 + }, + :feature_selection => { + :method => "Algorithm::FeatureSelection.correlation_filter", + }, + } + model = Model::Lazar.create training_dataset: dataset, algorithms: algorithms + assert_raises OpenTox::BadRequestError do + loo = RegressionLeaveOneOut.create model + end + end + + # repeated CV + + def test_repeated_crossvalidation + dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv") + model = Model::Lazar.create training_dataset: dataset + repeated_cv = RepeatedCrossValidation.create model + repeated_cv.crossvalidations.each do |cv| + assert cv.r_squared[:all] > 0.34, "R^2 (#{cv.r_squared[:all]}) should be larger than 0.34" + assert cv.rmse[:all] < 1.5, "RMSE (#{cv.rmse[:all]}) should be smaller than 1.5" + end + end + +end diff --git a/test/setup.rb b/test/setup.rb index 51871a2..c4c04cb 100644 --- a/test/setup.rb +++ b/test/setup.rb @@ -3,8 +3,8 @@ require 'minitest/autorun' require_relative '../lib/lazar.rb' #require 'lazar' include OpenTox -$mongo.database.drop -$gridfs = $mongo.database.fs # recreate GridFS indexes +#$mongo.database.drop +#$gridfs = $mongo.database.fs # recreate GridFS indexes #PhysChem.descriptors TEST_DIR ||= File.expand_path(File.dirname(__FILE__)) DATA_DIR ||= File.join(TEST_DIR,"data") diff --git a/test/validation-classification.rb b/test/validation-classification.rb deleted file mode 100644 index 6b727d6..0000000 --- a/test/validation-classification.rb +++ /dev/null @@ -1,113 +0,0 @@ -require_relative "setup.rb" - -class ValidationClassificationTest < MiniTest::Test - include OpenTox::Validation - - # defaults - - def test_default_classification_crossvalidation - dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" - model = Model::Lazar.create training_dataset: dataset - cv = ClassificationCrossValidation.create model - assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7, this may occur due to an 
unfavorable training/test set split" - assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy (#{cv.weighted_accuracy}) should be larger than accuracy (#{cv.accuracy})." - File.open("/tmp/tmp.pdf","w+"){|f| f.puts cv.probability_plot(format:"pdf")} - p `file -b /tmp/tmp.pdf` - File.open("/tmp/tmp.png","w+"){|f| f.puts cv.probability_plot(format:"png")} - p `file -b /tmp/tmp.png` - end - - # parameters - - def test_classification_crossvalidation_parameters - dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" - algorithms = { - :similarity => { :min => 0.3, }, - :descriptors => { :type => "FP3" } - } - model = Model::Lazar.create training_dataset: dataset, algorithms: algorithms - cv = ClassificationCrossValidation.create model - params = model.algorithms - params = Hash[params.map{ |k, v| [k.to_s, v] }] # convert symbols to string - - cv.validations.each do |validation| - validation_params = validation.model.algorithms - refute_nil model.training_dataset_id - refute_nil validation.model.training_dataset_id - refute_equal model.training_dataset_id, validation.model.training_dataset_id - ["min_sim","type","prediction_feature_id"].each do |k| - assert_equal params[k], validation_params[k] - end - end - end - - # LOO - - def test_classification_loo_validation - dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" - model = Model::Lazar.create training_dataset: dataset - loo = ClassificationLeaveOneOut.create model - assert_equal 24, loo.nr_unpredicted - refute_empty loo.confusion_matrix - assert loo.accuracy > 0.77 - assert loo.weighted_accuracy > loo.accuracy, "Weighted accuracy (#{loo.weighted_accuracy}) should be larger than accuracy (#{loo.accuracy})." 
- end - - # repeated CV - - def test_repeated_crossvalidation - dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" - model = Model::Lazar.create training_dataset: dataset - repeated_cv = RepeatedCrossValidation.create model - repeated_cv.crossvalidations.each do |cv| - assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split" - end - end - - def test_carcinogenicity_rf_classification - skip "Caret rf classification may run into a (endless?) loop for some compounds." - dataset = Dataset.from_csv_file "#{DATA_DIR}/multi_cell_call.csv" - algorithms = { - :prediction => { - :method => "Algorithm::Caret.rf", - }, - } - model = Model::Lazar.create training_dataset: dataset, algorithms: algorithms - cv = ClassificationCrossValidation.create model -# cv = ClassificationCrossValidation.find "5bbc822dca626919731e2822" - puts cv.statistics - puts cv.id - - end - - def test_mutagenicity_classification_algorithms - skip "Caret rf classification may run into a (endless?) loop for some compounds." 
- source_feature = Feature.where(:name => "Ames test categorisation").first - target_feature = Feature.where(:name => "Mutagenicity").first - kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" - hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv" - efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv" - dataset = Dataset.merge [kazius,hansen,efsa], {source_feature => target_feature}, {1 => "mutagen", 0 => "nonmutagen"} - model = Model::Lazar.create training_dataset: dataset - repeated_cv = RepeatedCrossValidation.create model - puts repeated_cv.id - repeated_cv.crossvalidations.each do |cv| - puts cv.accuracy - puts cv.confusion_matrix - end - algorithms = { - :prediction => { - :method => "Algorithm::Caret.rf", - }, - } - model = Model::Lazar.create training_dataset: dataset, algorithms: algorithms - repeated_cv = RepeatedCrossValidation.create model - puts repeated_cv.id - repeated_cv.crossvalidations.each do |cv| - puts cv.accuracy - puts cv.confusion_matrix - end - - end - -end diff --git a/test/validation-nanoparticle.rb b/test/validation-nanoparticle.rb deleted file mode 100644 index 0c7d355..0000000 --- a/test/validation-nanoparticle.rb +++ /dev/null @@ -1,133 +0,0 @@ -require_relative "setup.rb" - -class NanoparticleValidationTest < MiniTest::Test - include OpenTox::Validation - - def setup - @training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first - end - - def test_validate_default_nanoparticle_model - model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature - cv = CrossValidation.create model - p cv.id - #File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot format:"pdf"} - refute_nil cv.r_squared - refute_nil cv.rmse - end - - def test_validate_pls_pchem_model - algorithms = { - :descriptors => 
{ - :method => "properties", - :categories => ["P-CHEM"] - }, - :prediction => {:method => 'Algorithm::Caret.pls' }, - :feature_selection => { - :method => "Algorithm::FeatureSelection.correlation_filter", - }, - } - model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms - assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method] - cv = CrossValidation.create model - p cv.id - #File.open("tmp2.pdf","w+"){|f| f.puts cv.correlation_plot format:"pdf"} - refute_nil cv.r_squared - refute_nil cv.rmse - end - -=begin - def test_validate_proteomics_pls_pchem_model - algorithms = { - :descriptors => { - :method => "properties", - :categories => ["Proteomics"] - }, - :prediction => {:method => 'Algorithm::Caret.pls' }, - :feature_selection => { - :method => "Algorithm::FeatureSelection.correlation_filter", - }, - } - model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms - assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method] - cv = CrossValidation.create model - refute_nil cv.r_squared - refute_nil cv.rmse - end -=end - - def test_validate_proteomics_pchem_default_model - algorithms = { - :descriptors => { - :method => "properties", - :categories => ["Proteomics","P-CHEM"] - }, - :feature_selection => { - :method => "Algorithm::FeatureSelection.correlation_filter", - }, - } - model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms - cv = CrossValidation.create model - refute_nil cv.r_squared - refute_nil cv.rmse - end - - def test_nanoparticle_fingerprint_model_without_feature_selection - algorithms = { - :descriptors => { - :method => "fingerprint", - :type => "MP2D", - }, - :similarity => { - :method => "Algorithm::Similarity.tanimoto", - :min => 0.1 - }, - :feature_selection => nil - } - model = Model::Lazar.create 
prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms - cv = CrossValidation.create model - refute_nil cv.r_squared - refute_nil cv.rmse - end - - def test_nanoparticle_fingerprint_weighted_average_model_without_feature_selection - algorithms = { - :descriptors => { - :method => "fingerprint", - :type => "MP2D", - }, - :similarity => { - :method => "Algorithm::Similarity.tanimoto", - :min => 0.1 - }, - :prediction => { :method => "OpenTox::Algorithm::Regression.weighted_average" }, - :feature_selection => nil - } - model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms - cv = CrossValidation.create model - refute_nil cv.r_squared - refute_nil cv.rmse - end - - def test_nanoparticle_fingerprint_model_with_feature_selection - algorithms = { - :descriptors => { - :method => "fingerprint", - :type => "MP2D", - }, - :similarity => { - :method => "Algorithm::Similarity.tanimoto", - :min => 0.1 - }, - :feature_selection => { - :method => "Algorithm::FeatureSelection.correlation_filter", - }, - } - model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms - cv = CrossValidation.create model - refute_nil cv.r_squared - refute_nil cv.rmse - end - -end diff --git a/test/validation-nanoparticle.rb~ b/test/validation-nanoparticle.rb~ new file mode 100644 index 0000000..0c7d355 --- /dev/null +++ b/test/validation-nanoparticle.rb~ @@ -0,0 +1,133 @@ +require_relative "setup.rb" + +class NanoparticleValidationTest < MiniTest::Test + include OpenTox::Validation + + def setup + @training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first + end + + def test_validate_default_nanoparticle_model + model 
= Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature + cv = CrossValidation.create model + p cv.id + #File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot format:"pdf"} + refute_nil cv.r_squared + refute_nil cv.rmse + end + + def test_validate_pls_pchem_model + algorithms = { + :descriptors => { + :method => "properties", + :categories => ["P-CHEM"] + }, + :prediction => {:method => 'Algorithm::Caret.pls' }, + :feature_selection => { + :method => "Algorithm::FeatureSelection.correlation_filter", + }, + } + model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms + assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method] + cv = CrossValidation.create model + p cv.id + #File.open("tmp2.pdf","w+"){|f| f.puts cv.correlation_plot format:"pdf"} + refute_nil cv.r_squared + refute_nil cv.rmse + end + +=begin + def test_validate_proteomics_pls_pchem_model + algorithms = { + :descriptors => { + :method => "properties", + :categories => ["Proteomics"] + }, + :prediction => {:method => 'Algorithm::Caret.pls' }, + :feature_selection => { + :method => "Algorithm::FeatureSelection.correlation_filter", + }, + } + model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms + assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method] + cv = CrossValidation.create model + refute_nil cv.r_squared + refute_nil cv.rmse + end +=end + + def test_validate_proteomics_pchem_default_model + algorithms = { + :descriptors => { + :method => "properties", + :categories => ["Proteomics","P-CHEM"] + }, + :feature_selection => { + :method => "Algorithm::FeatureSelection.correlation_filter", + }, + } + model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms + cv = CrossValidation.create model + refute_nil 
cv.r_squared + refute_nil cv.rmse + end + + def test_nanoparticle_fingerprint_model_without_feature_selection + algorithms = { + :descriptors => { + :method => "fingerprint", + :type => "MP2D", + }, + :similarity => { + :method => "Algorithm::Similarity.tanimoto", + :min => 0.1 + }, + :feature_selection => nil + } + model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms + cv = CrossValidation.create model + refute_nil cv.r_squared + refute_nil cv.rmse + end + + def test_nanoparticle_fingerprint_weighted_average_model_without_feature_selection + algorithms = { + :descriptors => { + :method => "fingerprint", + :type => "MP2D", + }, + :similarity => { + :method => "Algorithm::Similarity.tanimoto", + :min => 0.1 + }, + :prediction => { :method => "OpenTox::Algorithm::Regression.weighted_average" }, + :feature_selection => nil + } + model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms + cv = CrossValidation.create model + refute_nil cv.r_squared + refute_nil cv.rmse + end + + def test_nanoparticle_fingerprint_model_with_feature_selection + algorithms = { + :descriptors => { + :method => "fingerprint", + :type => "MP2D", + }, + :similarity => { + :method => "Algorithm::Similarity.tanimoto", + :min => 0.1 + }, + :feature_selection => { + :method => "Algorithm::FeatureSelection.correlation_filter", + }, + } + model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms + cv = CrossValidation.create model + refute_nil cv.r_squared + refute_nil cv.rmse + end + +end diff --git a/test/validation-regression.rb b/test/validation-regression.rb deleted file mode 100644 index 0328c88..0000000 --- a/test/validation-regression.rb +++ /dev/null @@ -1,91 +0,0 @@ -require_relative "setup.rb" - -class ValidationRegressionTest < MiniTest::Test - include OpenTox::Validation - 
- # defaults - - def test_default_regression_crossvalidation - dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv" - model = Model::Lazar.create training_dataset: dataset - cv = RegressionCrossValidation.create model - assert cv.rmse < 1.5, "RMSE #{cv.rmse} should be smaller than 1.5, this may occur due to unfavorable training/test set splits" - assert cv.mae < 1.1, "MAE #{cv.mae} should be smaller than 1.1, this may occur due to unfavorable training/test set splits" - assert cv.percent_within_prediction_interval > 80, "Only #{cv.percent_within_prediction_interval.round(2)}% of measurement within prediction interval. This may occur due to unfavorable training/test set splits" - end - - # parameters - - def test_regression_crossvalidation_params - dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv" - algorithms = { - :prediction => { :method => "OpenTox::Algorithm::Regression.weighted_average" }, - :descriptors => { :type => "MACCS", }, - :similarity => {:min => 0.7} - } - model = Model::Lazar.create training_dataset: dataset, algorithms: algorithms - assert_equal algorithms[:descriptors][:type], model.algorithms[:descriptors][:type] - cv = RegressionCrossValidation.create model - cv.validation_ids.each do |vid| - model = Model::Lazar.find(Validation.find(vid).model_id) - assert_equal algorithms[:descriptors][:type], model.algorithms[:descriptors][:type] - assert_equal algorithms[:similarity][:min], model.algorithms[:similarity][:min] - refute_nil model.training_dataset_id - refute_equal dataset.id, model.training_dataset_id - end - - refute_nil cv.rmse - refute_nil cv.mae - end - - def test_physchem_regression_crossvalidation - training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv") - model = Model::Lazar.create training_dataset:training_dataset - cv = RegressionCrossValidation.create model - refute_nil cv.rmse - refute_nil cv.mae - end - - # LOO - - def test_regression_loo_validation - dataset = 
OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv") - model = Model::Lazar.create training_dataset: dataset - loo = RegressionLeaveOneOut.create model - assert loo.r_squared > 0.34, "R^2 (#{loo.r_squared}) should be larger than 0.034" - end - - def test_regression_loo_validation_with_feature_selection - dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv") - algorithms = { - :descriptors => { - :method => "calculate_properties", - :features => PhysChem.openbabel_descriptors, - }, - :similarity => { - :method => "Algorithm::Similarity.weighted_cosine", - :min => 0.5 - }, - :feature_selection => { - :method => "Algorithm::FeatureSelection.correlation_filter", - }, - } - model = Model::Lazar.create training_dataset: dataset, algorithms: algorithms - assert_raises OpenTox::BadRequestError do - loo = RegressionLeaveOneOut.create model - end - end - - # repeated CV - - def test_repeated_crossvalidation - dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv") - model = Model::Lazar.create training_dataset: dataset - repeated_cv = RepeatedCrossValidation.create model - repeated_cv.crossvalidations.each do |cv| - assert cv.r_squared > 0.34, "R^2 (#{cv.r_squared}) should be larger than 0.034" - assert cv.rmse < 1.5, "RMSE (#{cv.rmse}) should be smaller than 0.5" - end - end - -end -- cgit v1.2.3