From 8d325866dd7cacdd04bd2306a9144a5e7300c7c8 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 13 Oct 2016 10:11:09 +0200 Subject: molecular_weight fixed --- lib/compound.rb | 5 +-- lib/model.rb | 4 +-- lib/regression.rb | 1 - test/classification.rb | 53 ++++++++++++++++++++++++---- test/compound.rb | 93 +++----------------------------------------------- test/model.rb | 44 ------------------------ test/regression.rb | 5 +-- 7 files changed, 58 insertions(+), 147 deletions(-) diff --git a/lib/compound.rb b/lib/compound.rb index 0f178ce..ca9d5e3 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -75,9 +75,10 @@ module OpenTox fingerprints[type] end - def calculated_properties types=["OPENBABEL"] + def calculated_properties types=["PhysChem::OPENBABEL"] descriptors = [] types.each do |t| + p t descriptors += PhysChem.descriptors OpenTox.const_get(t) end # TODO: speedup java descriptors @@ -304,7 +305,7 @@ module OpenTox # @return [Float] molecular weight def molecular_weight mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW") - calculated_physchem([mw_feature])[mw_feature.id.to_s] + calculated_properties[mw_feature.id.to_s] end private diff --git a/lib/model.rb b/lib/model.rb index f3f0603..859df8b 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -126,7 +126,8 @@ module OpenTox end model.descriptor_ids = model.fingerprints.flatten.uniq model.descriptor_ids.each do |d| - model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} + # resulting model may break BSON size limit (e.g. f Kazius dataset + model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/ end else # parse independent_variables @@ -225,7 +226,6 @@ module OpenTox else # call prediction algorithm result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors - p result prediction.merge! result prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}} end diff --git a/lib/regression.rb b/lib/regression.rb index bed6df8..d1724fd 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -4,7 +4,6 @@ module OpenTox class Regression def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables: - #def self.weighted_average descriptors:nil, neighbors:, parameters:nil, method:nil, relevant_features:nil # TODO: prediction_interval weighted_sum = 0.0 sim_sum = 0.0 diff --git a/test/classification.rb b/test/classification.rb index 6638a79..c670bb5 100644 --- a/test/classification.rb +++ b/test/classification.rb @@ -2,10 +2,25 @@ require_relative "setup.rb" class LazarClassificationTest < MiniTest::Test - def test_lazar_classification + def test_classification_default + algorithms = { + :descriptors => [ "MP2D" ], + :similarity => { + :method => "Algorithm::Similarity.tanimoto", + :min => 0.1 + }, + :prediction => { + :method => "Algorithm::Classification.weighted_majority_vote", + }, + :feature_selection => nil, + } training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") - model = Model::Lazar.create training_dataset: training_dataset - + model = Model::Lazar.create training_dataset: training_dataset + assert_kind_of Model::LazarClassification, model + assert_equal algorithms, model.algorithms + substance = training_dataset.substances[10] + prediction = model.predict substance + assert_equal "false", prediction[:value] [ { :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"), :prediction => "false", @@ -33,12 +48,31 @@ class LazarClassificationTest < MiniTest::Test assert_equal "Could not find similar substances with experimental data in the training dataset.", pred[:warning] if pred[:value].nil? end cid = Compound.from_smiles("CCOC(=O)N").id.to_s - assert_equal "1 substances have been removed from neighbors, because they are identical with the query substance.", prediction_dataset.predictions[cid][:warning] + assert_match "excluded", prediction_dataset.predictions[cid][:warning] # cleanup [training_dataset,model,compound_dataset,prediction_dataset].each{|o| o.delete} end + + def test_classification_parameters + algorithms = { + :descriptors => ['MACCS'], + :similarity => { + :min => 0.4 + }, + } + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") + model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms + assert_kind_of Model::LazarClassification, model + assert_equal "Algorithm::Classification.weighted_majority_vote", model.algorithms[:prediction][:method] + assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method] + assert_equal algorithms[:similarity][:min], model.algorithms[:similarity][:min] + substance = training_dataset.substances[10] + prediction = model.predict substance + assert_equal "false", prediction[:value] + assert_equal 4, prediction[:neighbors].size + end - def test_lazar_kazius + def test_kazius t = Time.now training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"kazius.csv") t = Time.now @@ -48,8 +82,15 @@ class LazarClassificationTest < MiniTest::Test compound = Compound.from_smiles("Clc1ccccc1NN") prediction = model.predict compound assert_equal "1", prediction[:value] - #assert_in_delta 0.019858401199860445, prediction[:confidence], 0.001 end training_dataset.delete end + + def test_fingerprint_feature_selection + skip + end + + def test_physchem_classification + skip + end end diff --git a/test/compound.rb b/test/compound.rb index c78acb1..76471ac 100644 --- a/test/compound.rb +++ b/test/compound.rb @@ -2,19 +2,16 @@ require_relative "setup.rb" class CompoundTest < MiniTest::Test - def test_0_compound_from_smiles + def test_compound_from_smiles c = OpenTox::Compound.from_smiles "F[B-](F)(F)F.[Na+]" assert_equal "InChI=1S/BF4.Na/c2-1(3,4)5;/q-1;+1", c.inchi.chomp assert_equal "F[B-](F)(F)F.[Na+]", c.smiles, "A failure here might be caused by a compound webservice running on 64bit architectures using an outdated version of OpenBabel. Please install OpenBabel version 2.3.2 or higher." # seems to be fixed in 2.3.2 end - def test_1_compound_from_smiles + def test_compound_from_smiles c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N" assert_equal "InChI=1S/C6H9NO/c1-5(4-7)3-6(2)8/h5H,3H2,1-2H3", c.inchi assert_equal "CC(C#N)CC(=O)C", c.smiles - end - - def test_2_compound_from_smiles c = OpenTox::Compound.from_smiles "N#[N+]C1=CC=CC=C1.F[B-](F)(F)F" assert_equal "InChI=1S/C6H5N2.BF4/c7-8-6-4-2-1-3-5-6;2-1(3,4)5/h1-5H;/q+1;-1", c.inchi assert_equal "F[B-](F)(F)F.N#[N+]c1ccccc1", c.smiles @@ -79,22 +76,6 @@ print c.sdf assert_equal 9, c.fingerprint("FP4").size end - def test_neighbors - d = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM_log10.csv") - d.compounds.each do |c| - refute_nil c.fingerprint("MP2D") - end - c = d.compounds[371] - n = c.neighbors( - descriptors: {:method => "fingerprint", :type => "FP4"}, - similarity: {:method => "Algorithm::Similarity.tanimoto", :min => 0.7}, - dataset_id: d.id, - prediction_feature_id: d.features.first.id - ) - - assert n.size >= 8, "Neighbors size (#{n.size}) should be larger than 7" - end - def test_openbabel_segfault inchi = "InChI=1S/C19H27NO7/c1-11-9-19(12(2)27-19)17(23)26-14-6-8-20(4)7-5-13(15(14)21)10-25-16(22)18(11,3)24/h5,11-12,14,24H,6-10H2,1-4H3/b13-5-/t11-,12-,14-,18-,19?/m1/s1" @@ -113,30 +94,6 @@ print c.sdf end end - def test_fingerprint_neighbors - types = ["FP2", "FP3", "FP4", "MACCS"] - min_sim = 0.7 - training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM_log10.csv") - [ - "CC(=O)CC(C)C#N", - "CC(=O)CC(C)C", - "C(=O)CC(C)C#N", - ].each do |smi| - c = OpenTox::Compound.from_smiles smi - types.each do |type| - neighbors = c.fingerprint_neighbors( - descriptors: {:method => "fingerprint",:type => type}, - dataset_id: training_dataset.id, - similarity: {:method => "Algorithm::Similarity.tanimoto", :min => min_sim}, - prediction_feature_id: training_dataset.features.first.id - ) - unless type == "FP2" and smi == "CC(=O)CC(C)C#N" or smi == "C(=O)CC(C)C#N" and (type == "FP2" or type == "MACCS") - refute_empty neighbors - end - end - end - end - def test_mna c = OpenTox::Compound.from_smiles "N#[N+]C1=CC=CC=C1.F[B-](F)(F)F" assert_equal 18, c.fingerprint("MNA").size @@ -149,47 +106,6 @@ print c.sdf assert 7, c.fingerprint("MP2D").uniq.size end - def test_fingerprint_count_neighbors - skip - types = ["MP2D", "MNA"] - min_sim = 0.0 - training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM_log10.csv") - [ - "CC(=O)CC(C)C#N", - "CC(=O)CC(C)C", - "C(=O)CC(C)C#N", - ].each do |smi| - c = OpenTox::Compound.from_smiles smi - types.each do |type| - neighbors = c.fingerprint_count_neighbors({:type => type, :dataset_id => training_dataset.id, :min_sim => min_sim, :prediction_feature_id => training_dataset.features.first.id}) - if type == "FP4" - fp4_neighbors = c.neighbors - neighbors.each do |n| - assert_includes fp4_neighbors, n - end - end - end - end - end - - def test_fingerprint_db_neighbors - skip - training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM_log10.csv") - [ - "CC(=O)CC(C)C#N", - "CC(=O)CC(C)C", - "C(=O)CC(C)C#N", - ].each do |smi| - c = OpenTox::Compound.from_smiles smi - neighbors = c.db_neighbors(:dataset_id => training_dataset.id, :min_sim => 0.2) - neighbors2 = c.fingerprint_neighbors({:type => "MP2D", :dataset_id => training_dataset.id, :min_sim => 0.2, :prediction_feature_id => training_dataset.features.first.id}) - #p neighbors - #p neighbors2 - #p neighbors2 - neighbors - assert_equal neighbors, neighbors2 - end - end - def test_molecular_weight c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C" assert_equal 100.15888, c.molecular_weight @@ -208,8 +124,9 @@ print c.sdf def test_physchem c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C" - assert_equal PhysChem::OBDESCRIPTORS.size, c.calculated_physchem.size - assert_equal PhysChem::OBDESCRIPTORS.size, c.calculated_physchem(PhysChem.openbabel_descriptors).size + p c.calculated_properties + assert_equal PhysChem::OPENBABEL.size, c.calculated_properties.size + assert_equal PhysChem::OPENBABEL.size, c.calculated_properties(PhysChem.openbabel_descriptors).size assert_equal PhysChem::unique_descriptors.size, c.calculated_physchem(PhysChem.unique_descriptors).size end end diff --git a/test/model.rb b/test/model.rb index 9f30928..017ce10 100644 --- a/test/model.rb +++ b/test/model.rb @@ -100,48 +100,4 @@ class ModelTest < MiniTest::Test assert_equal algorithms[:feature_selection][:method], model.algorithms[:feature_selection][:method] end - def test_caret_parameters - skip - end - - def test_default_classification - algorithms = { - :descriptors => [ "MP2D" ], - :similarity => { - :method => "Algorithm::Similarity.tanimoto", - :min => 0.1 - }, - :prediction => { - :method => "Algorithm::Classification.weighted_majority_vote", - }, - :feature_selection => nil, - } - training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") - model = Model::Lazar.create training_dataset: training_dataset - assert_kind_of Model::LazarClassification, model - assert_equal algorithms, model.algorithms - substance = training_dataset.substances[10] - prediction = model.predict substance - assert_equal "false", prediction[:value] - end - - def test_classification_parameters - algorithms = { - :descriptors => ['MACCS'], - :similarity => { - :min => 0.4 - }, - } - training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") - model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms - assert_kind_of Model::LazarClassification, model - assert_equal "Algorithm::Classification.weighted_majority_vote", model.algorithms[:prediction][:method] - assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method] - assert_equal algorithms[:similarity][:min], model.algorithms[:similarity][:min] - substance = training_dataset.substances[10] - prediction = model.predict substance - assert_equal "false", prediction[:value] - assert_equal 4, prediction[:neighbors].size - end - end diff --git a/test/regression.rb b/test/regression.rb index aad4195..b1051f1 100644 --- a/test/regression.rb +++ b/test/regression.rb @@ -22,10 +22,7 @@ class LazarRegressionTest < MiniTest::Test def test_mpd_fingerprints training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv" algorithms = { - :descriptors => { - :method => "fingerprint", - :type => "MP2D" - } + :descriptors => [ "MP2D" ] } model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms compound = Compound.from_smiles "CCCSCCSCC" -- cgit v1.2.3