From ad7ec6a1e33f69557fe64371581d5f42a65ecaa8 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 13 Oct 2016 17:34:31 +0200 Subject: classification fixed --- lib/model.rb | 63 +++++++++++++++---------- test/classification.rb | 96 --------------------------------------- test/model-classification.rb | 106 +++++++++++++++++++++++++++++++++++++++++++ test/model.rb | 7 ++- test/nanoparticles.rb | 13 ++---- 5 files changed, 153 insertions(+), 132 deletions(-) delete mode 100644 test/classification.rb create mode 100644 test/model-classification.rb diff --git a/lib/model.rb b/lib/model.rb index 7029c31..b949042 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -57,7 +57,10 @@ module OpenTox if substance_classes.first == "OpenTox::Compound" model.algorithms = { - :descriptors => ['MP2D'], + :descriptors => { + :method => "fingerprint", + :type => "MP2D", + }, :similarity => { :method => "Algorithm::Similarity.tanimoto", :min => 0.1 @@ -77,7 +80,10 @@ module OpenTox elsif substance_classes.first == "OpenTox::Nanoparticle" model.algorithms = { - :descriptors => ["P-CHEM"], + :descriptors => { + :method => "properties", + :category => "P-CHEM", + }, #:descriptors => ["P-CHEM","Proteomics"], :similarity => { :method => "Algorithm::Similarity.weighted_cosine", @@ -115,34 +121,41 @@ module OpenTox end if values end + descriptor_method = model.algorithms[:descriptors][:method] + case descriptor_method # parse fingerprints - if model.fingerprints? - model.algorithms[:descriptors].each do |type| - model.substances.each_with_index do |s,i| - model.fingerprints[i] ||= [] - model.fingerprints[i] += s.fingerprint(type) - model.fingerprints[i].uniq! - end + when "fingerprint" + type = model.algorithms[:descriptors][:type] + model.substances.each_with_index do |s,i| + model.fingerprints[i] ||= [] + model.fingerprints[i] += s.fingerprint(type) + model.fingerprints[i].uniq! end model.descriptor_ids = model.fingerprints.flatten.uniq model.descriptor_ids.each do |d| - # resulting model may break BSON size limit (e.g. f Kazius dataset + # resulting model may break BSON size limit (e.g. f Kazius dataset) model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/ end - else - # parse independent_variables - if (model.algorithms[:descriptors] & [PhysChem::OPENBABEL,PhysChem::CDK,PhysChem::JOELIB]).empty? - properties = model.substances.collect { |s| s.properties } - all_property_ids = properties.collect{|p| p.keys}.flatten.uniq - model.descriptor_ids = all_property_ids.select{|id| model.algorithms[:descriptors].include? Feature.find(id).category } - model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} - - # calculate physchem properties - else - properties = model.substances.collect { |s| s.calculate_properties(model.algorithms[:descriptors]) } - model.descriptor_ids = properties.collect{|p| p.keys}.flatten.uniq - model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i]}} + # calculate physchem properties + when "calculate_properties" + features = model.algorithms[:descriptors][:features] + model.descriptor_ids = features.collect{|f| f.id.to_s} + model.algorithms[:descriptors].delete(:features) + model.algorithms[:descriptors].delete(:type) + model.substances.each_with_index do |s,i| + s.calculate_properties(features).each_with_index do |v,j| + model.independent_variables[j] ||= [] + model.independent_variables[j][i] = v + end end + # parse independent_variables + when "properties" + properties = model.substances.collect { |s| s.properties } + all_property_ids = properties.collect{|p| p.keys}.flatten.uniq + model.descriptor_ids = all_property_ids.select{|id| model.algorithms[:descriptors].include? Feature.find(id).category } + model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} + else + bad_request_error "Descriptor method '#{descriptor_method}' not implemented." end if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method] @@ -165,7 +178,7 @@ module OpenTox case algorithms[:similarity][:method] when /tanimoto/ # binary features - similarity_descriptors = algorithms[:descriptors].collect{|type| substance.fingerprint(type)}.flatten.uniq + similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type] # TODO this excludes descriptors only present in the query substance query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id} when /euclid|cosine/ # quantitative features @@ -295,7 +308,7 @@ module OpenTox end def fingerprints? - algorithms[:similarity][:method].match("tanimoto") ? true : false + algorithms[:descriptors][:method] == "fingerprint" ? true : false end end diff --git a/test/classification.rb b/test/classification.rb deleted file mode 100644 index c670bb5..0000000 --- a/test/classification.rb +++ /dev/null @@ -1,96 +0,0 @@ -require_relative "setup.rb" - -class LazarClassificationTest < MiniTest::Test - - def test_classification_default - algorithms = { - :descriptors => [ "MP2D" ], - :similarity => { - :method => "Algorithm::Similarity.tanimoto", - :min => 0.1 - }, - :prediction => { - :method => "Algorithm::Classification.weighted_majority_vote", - }, - :feature_selection => nil, - } - training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") - model = Model::Lazar.create training_dataset: training_dataset - assert_kind_of Model::LazarClassification, model - assert_equal algorithms, model.algorithms - substance = training_dataset.substances[10] - prediction = model.predict substance - assert_equal "false", prediction[:value] - [ { - :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"), - :prediction => "false", - },{ - :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"), - :prediction => "false", - } ].each do |example| - prediction = model.predict example[:compound] - assert_equal example[:prediction], prediction[:value] - end - - compound = Compound.from_smiles "CCO" - prediction = model.predict compound - assert_equal "true", prediction[:value] - assert_equal ["false"], prediction[:measurements] - - # make a dataset prediction - compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv") - prediction_dataset = model.predict compound_dataset - assert_equal compound_dataset.compounds, prediction_dataset.compounds - - cid = prediction_dataset.compounds[7].id.to_s - assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warning] - prediction_dataset.predictions.each do |cid,pred| - assert_equal "Could not find similar substances with experimental data in the training dataset.", pred[:warning] if pred[:value].nil? - end - cid = Compound.from_smiles("CCOC(=O)N").id.to_s - assert_match "excluded", prediction_dataset.predictions[cid][:warning] - # cleanup - [training_dataset,model,compound_dataset,prediction_dataset].each{|o| o.delete} - end - - def test_classification_parameters - algorithms = { - :descriptors => ['MACCS'], - :similarity => { - :min => 0.4 - }, - } - training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") - model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms - assert_kind_of Model::LazarClassification, model - assert_equal "Algorithm::Classification.weighted_majority_vote", model.algorithms[:prediction][:method] - assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method] - assert_equal algorithms[:similarity][:min], model.algorithms[:similarity][:min] - substance = training_dataset.substances[10] - prediction = model.predict substance - assert_equal "false", prediction[:value] - assert_equal 4, prediction[:neighbors].size - end - - def test_kazius - t = Time.now - training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"kazius.csv") - t = Time.now - model = Model::Lazar.create training_dataset: training_dataset - t = Time.now - 2.times do - compound = Compound.from_smiles("Clc1ccccc1NN") - prediction = model.predict compound - assert_equal "1", prediction[:value] - end - training_dataset.delete - end - - def test_fingerprint_feature_selection - skip - end - - def test_physchem_classification - skip - end -end diff --git a/test/model-classification.rb b/test/model-classification.rb new file mode 100644 index 0000000..1424f6a --- /dev/null +++ b/test/model-classification.rb @@ -0,0 +1,106 @@ +require_relative "setup.rb" + +class LazarClassificationTest < MiniTest::Test + + def test_classification_default + algorithms = { + :descriptors => { + :method => "fingerprint", + :type => "MP2D" + }, + :similarity => { + :method => "Algorithm::Similarity.tanimoto", + :min => 0.1 + }, + :prediction => { + :method => "Algorithm::Classification.weighted_majority_vote", + }, + :feature_selection => nil, + } + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") + model = Model::Lazar.create training_dataset: training_dataset + assert_kind_of Model::LazarClassification, model + assert_equal algorithms, model.algorithms + substance = training_dataset.substances[10] + prediction = model.predict substance + assert_equal "false", prediction[:value] + [ { + :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"), + :prediction => "false", + },{ + :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"), + :prediction => "false", + } ].each do |example| + prediction = model.predict example[:compound] + assert_equal example[:prediction], prediction[:value] + end + + compound = Compound.from_smiles "CCO" + prediction = model.predict compound + assert_equal "true", prediction[:value] + assert_equal ["false"], prediction[:measurements] + + # make a dataset prediction + compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini_log10.csv") + prediction_dataset = model.predict compound_dataset + assert_equal compound_dataset.compounds, prediction_dataset.compounds + + cid = prediction_dataset.compounds[7].id.to_s + assert_equal "Could not find similar substances with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warning] + prediction_dataset.predictions.each do |cid,pred| + assert_equal "Could not find similar substances with experimental data in the training dataset.", pred[:warning] if pred[:value].nil? + end + cid = Compound.from_smiles("CCOC(=O)N").id.to_s + assert_match "excluded", prediction_dataset.predictions[cid][:warning] + # cleanup + [training_dataset,model,compound_dataset,prediction_dataset].each{|o| o.delete} + end + + def test_classification_parameters + algorithms = { + :descriptors => { + :method => "fingerprint", + :type => "MACCS" + }, + :similarity => { + :min => 0.4 + }, + } + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") + model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms + assert_kind_of Model::LazarClassification, model + assert_equal "Algorithm::Classification.weighted_majority_vote", model.algorithms[:prediction][:method] + assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method] + assert_equal algorithms[:similarity][:min], model.algorithms[:similarity][:min] + substance = training_dataset.substances[10] + prediction = model.predict substance + assert_equal "false", prediction[:value] + assert_equal 4, prediction[:neighbors].size + end + + def test_kazius + t = Time.now + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"kazius.csv") + t = Time.now + model = Model::Lazar.create training_dataset: training_dataset + t = Time.now + 2.times do + compound = Compound.from_smiles("Clc1ccccc1NN") + prediction = model.predict compound + assert_equal "1", prediction[:value] + end + training_dataset.delete + end + + def test_caret_classification + skip + end + + def test_fingerprint_chisq_feature_selection + skip + end + + def test_physchem_classification + skip + end +end diff --git a/test/model.rb b/test/model.rb index 322ad90..027efe4 100644 --- a/test/model.rb +++ b/test/model.rb @@ -49,7 +49,10 @@ class ModelTest < MiniTest::Test def test_physchem_regression algorithms = { - :descriptors => [PhysChem::OPENBABEL], + :descriptors => { + :method => "calculate_properties", + :features => PhysChem.openbabel_descriptors, + }, :similarity => { :method => "Algorithm::Similarity.cosine", } @@ -60,9 +63,9 @@ class ModelTest < MiniTest::Test assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method] assert_equal "Algorithm::Similarity.cosine", model.algorithms[:similarity][:method] assert_equal 0.1, model.algorithms[:similarity][:min] + algorithms[:descriptors].delete :features assert_equal algorithms[:descriptors], model.algorithms[:descriptors] prediction = model.predict training_dataset.substances[10] - p prediction refute_nil prediction[:value] # TODO test predictin end diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb index c489cb7..9a67e63 100644 --- a/test/nanoparticles.rb +++ b/test/nanoparticles.rb @@ -1,6 +1,5 @@ require_relative "setup.rb" - class NanoparticleTest < MiniTest::Test include OpenTox::Validation @@ -13,7 +12,7 @@ class NanoparticleTest < MiniTest::Test @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first end - def test_create_model + def test_nanoparticle_model model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature nanoparticle = @training_dataset.nanoparticles[-34] prediction = model.predict nanoparticle @@ -23,6 +22,8 @@ class NanoparticleTest < MiniTest::Test model.delete end + # validations + def test_validate_default_nanoparticle_model model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature cv = CrossValidation.create model @@ -77,15 +78,9 @@ class NanoparticleTest < MiniTest::Test refute_nil cv.rmse end - def test_export - skip - Dataset.all.each do |d| - puts d.to_csv - end - end def test_import_ld - skip + skip # Ambit JSON-LD export defunct dataset_ids = Import::Enanomapper.import_ld end end -- cgit v1.2.3