author    Christoph Helma <helma@in-silico.ch>  2016-10-13 19:17:03 +0200
committer Christoph Helma <helma@in-silico.ch>  2016-10-13 19:17:03 +0200
commit    160e75e696452ac61e651664ac56d16ce1c9c4b6
tree      03b7d96d9f6c30a1062919df1f9ad2e4f2935e70
parent    ad7ec6a1e33f69557fe64371581d5f42a65ecaa8
model tests separated into dedicated files and cleaned up
-rw-r--r--  lib/model.rb                                                             40
-rw-r--r--  lib/similarity.rb                                                          1
-rw-r--r--  test/model-nanoparticle.rb                                                42
-rw-r--r--  test/model-regression.rb (renamed from test/model.rb)                    136
-rw-r--r--  test/regression.rb                                                        86
-rw-r--r--  test/validation-nanoparticle.rb (renamed from test/nanoparticles.rb)      12
6 files changed, 169 insertions, 148 deletions
diff --git a/lib/model.rb b/lib/model.rb
index b949042..4bbb7da 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -82,7 +82,7 @@ module OpenTox
model.algorithms = {
:descriptors => {
:method => "properties",
- :category => "P-CHEM",
+ :categories => ["P-CHEM"],
},
#:descriptors => ["P-CHEM","Proteomics"],
:similarity => {
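
The descriptor configuration now takes a :categories array instead of a single :category string, so nanoparticle models can combine features from several eNanoMapper categories. A minimal sketch of such a configuration (the second category is only illustrative, echoing the commented-out line above):

    # hypothetical configuration drawing descriptors from two categories
    algorithms = {
      :descriptors => {
        :method => "properties",
        :categories => ["P-CHEM", "Proteomics"],
      },
      :similarity => {
        :method => "Algorithm::Similarity.weighted_cosine",
        :min => 0.5
      },
    }
    # model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
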
@@ -150,9 +150,14 @@ module OpenTox
end
# parse independent_variables
when "properties"
+ categories = model.algorithms[:descriptors][:categories]
+ feature_ids = []
+ categories.each do |category|
+ Feature.where(category:category).each{|f| feature_ids << f.id.to_s}
+ end
properties = model.substances.collect { |s| s.properties }
- all_property_ids = properties.collect{|p| p.keys}.flatten.uniq
- model.descriptor_ids = all_property_ids.select{|id| model.algorithms[:descriptors].include? Feature.find(id).category }
+ property_ids = properties.collect{|p| p.keys}.flatten.uniq
+ model.descriptor_ids = feature_ids & property_ids
model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
else
bad_request_error "Descriptor method '#{descriptor_method}' not implemented."
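
With the :categories array in place, descriptor selection becomes the intersection of the features belonging to the requested categories and the properties actually present for the training substances; repeated measurements are collapsed to their median (Array#median is a lazar core extension). A self-contained sketch of the same logic with made-up data and a plain-Ruby median helper:

    # stand-in for lazar's Array#median
    def median values
      sorted = values.sort
      mid = sorted.size / 2
      sorted.size.odd? ? sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2.0
    end

    feature_ids = ["size", "zeta_potential"]    # features in the requested categories (hypothetical ids)
    properties  = [                             # one property hash per training substance (hypothetical)
      { "size" => [10.0, 12.0], "charge" => [0.1] },
      { "size" => [11.0], "zeta_potential" => [-30.0, -32.0] },
    ]

    property_ids   = properties.collect { |p| p.keys }.flatten.uniq
    descriptor_ids = feature_ids & property_ids # => ["size", "zeta_potential"]

    # one row per descriptor, one column per substance; missing values stay nil
    independent_variables = descriptor_ids.collect do |id|
      properties.collect { |p| p[id] ? median(p[id]) : nil }
    end
    # => [[11.0, 11.0], [nil, -31.0]]
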
@@ -180,18 +185,25 @@ module OpenTox
when /tanimoto/ # binary features
similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type]
# TODO this excludes descriptors only present in the query substance
+ # use for applicability domain?
query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id}
when /euclid|cosine/ # quantitative features
- similarity_descriptors = descriptor_ids.collect_with_index{|id,i|
- prop = substance.properties[id]
- prop = prop.median if prop.is_a? Array # measured
- (prop-descriptor_means[i])/descriptor_sds[i]
- }
- query_descriptors = descriptor_ids.collect_with_index{|id,i|
- prop = substance.properties[id]
- prop = prop.median if prop.is_a? Array # measured
- substance.properties[id]
- }
+ if algorithms[:descriptors][:method] == "calculate_properties" # calculate descriptors
+ features = descriptor_ids.collect{|id| Feature.find(id)}
+ query_descriptors = substance.calculate_properties(features)
+ similarity_descriptors = query_descriptors.collect_with_index{|v,i| (v-descriptor_means[i])/descriptor_sds[i]}
+ else
+ similarity_descriptors = descriptor_ids.collect_with_index{|id,i|
+ prop = substance.properties[id]
+ prop = prop.median if prop.is_a? Array # measured
+ (prop-descriptor_means[i])/descriptor_sds[i]
+ }
+ query_descriptors = descriptor_ids.collect_with_index{|id,i|
+ prop = substance.properties[id]
+ prop = prop.median if prop.is_a? Array # measured
+ substance.properties[id]
+ }
+ end
else
bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
end
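
For quantitative (euclid/cosine) descriptors the query values are standardised against the training data before the similarity calculation: each value is centred by the descriptor mean and divided by its standard deviation, matching the scaled_variables of the training substances (collect_with_index is a lazar core extension; plain Ruby uses each_with_index). A minimal sketch of that scaling step with assumed precomputed statistics:

    # hypothetical query vector and training statistics
    query_descriptors = [120.0, 3.5, -30.0]   # calculated or measured (median) values
    descriptor_means  = [100.0, 3.0, -25.0]   # per-descriptor means over the training set
    descriptor_sds    = [ 20.0, 0.5,   5.0]   # per-descriptor standard deviations

    similarity_descriptors = query_descriptors.each_with_index.collect do |v, i|
      (v - descriptor_means[i]) / descriptor_sds[i]
    end
    # => [1.0, 1.0, -1.0]
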
@@ -218,7 +230,7 @@ module OpenTox
neighbor_descriptors = scaled_variables.collect{|v| v[i]}
end
sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
- if sim > algorithms[:similarity][:min]
+ if sim >= algorithms[:similarity][:min]
neighbor_ids << s
neighbor_similarities << sim
neighbor_dependent_variables << dependent_variables[i]
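
Neighbor selection now uses >= instead of >, so a substance whose similarity exactly equals the threshold is included; with :min => 0 this admits every training substance, which the reworked test_weighted_average below relies on when it asserts that the neighbor count equals model.substance_ids.size. A trivial illustration of the boundary:

    min = 0
    [0.0, 0.2, 0.7].count { |sim| sim >= min }  # => 3 (the strict > would have given 2)
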
diff --git a/lib/similarity.rb b/lib/similarity.rb
index 328d42a..772e812 100644
--- a/lib/similarity.rb
+++ b/lib/similarity.rb
@@ -32,6 +32,7 @@ module OpenTox
def self.weighted_cosine scaled_properties # [a,b,weights]
a,b,w = remove_nils scaled_properties
+ return cosine(scaled_properties) if w.uniq.size == 1
dot_product = 0
magnitude_a = 0
magnitude_b = 0
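
When all weights are identical, the weighted cosine reduces to the plain cosine because the constant weight cancels from numerator and denominator, so the new early return skips the weighted loop without changing the result. A standalone numeric check, assuming the usual weighted form sum(w*a*b) / sqrt(sum(w*a^2) * sum(w*b^2)):

    a = [1.0, 2.0, 3.0]
    b = [2.0, 1.0, 4.0]
    w = [0.5, 0.5, 0.5]   # uniform weights

    cosine = a.zip(b).sum { |x, y| x * y } /
             Math.sqrt(a.sum { |x| x**2 } * b.sum { |y| y**2 })

    weighted = a.zip(b, w).sum { |x, y, wi| wi * x * y } /
               Math.sqrt(a.zip(w).sum { |x, wi| wi * x**2 } *
                         b.zip(w).sum { |y, wi| wi * y**2 })

    (cosine - weighted).abs < 1e-12   # => true
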
diff --git a/test/model-nanoparticle.rb b/test/model-nanoparticle.rb
new file mode 100644
index 0000000..fb81b83
--- /dev/null
+++ b/test/model-nanoparticle.rb
@@ -0,0 +1,42 @@
+require_relative "setup.rb"
+
+class NanoparticleTest < MiniTest::Test
+ include OpenTox::Validation
+
+ def setup
+ @training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
+ unless @training_dataset
+ Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm")
+ @training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
+ end
+ @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first
+ end
+
+ def test_nanoparticle_model
+ assert true, @prediction_feature.measured
+ model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature
+ refute_empty model.dependent_variables
+ refute_empty model.descriptor_ids
+ refute_empty model.independent_variables
+ assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
+ assert_equal "Algorithm::Similarity.weighted_cosine", model.algorithms[:similarity][:method]
+ nanoparticle = @training_dataset.nanoparticles[-34]
+ assert_includes nanoparticle.dataset_ids, @training_dataset.id
+ prediction = model.predict nanoparticle
+ refute_nil prediction[:value]
+ assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
+ prediction = model.predict @training_dataset.substances[14]
+ refute_nil prediction[:value]
+ assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
+ model.delete
+ end
+
+ def test_nanoparticle_parameters
+ skip
+ end
+
+ def test_import_ld
+ skip # Ambit JSON-LD export defunct
+ dataset_ids = Import::Enanomapper.import_ld
+ end
+end
diff --git a/test/model.rb b/test/model-regression.rb
index 027efe4..644ca1c 100644
--- a/test/model.rb
+++ b/test/model-regression.rb
@@ -1,10 +1,13 @@
require_relative "setup.rb"
-class ModelTest < MiniTest::Test
+class LazarRegressionTest < MiniTest::Test
def test_default_regression
algorithms = {
- :descriptors => [ "MP2D" ],
+ :descriptors => {
+ :method => "fingerprint",
+ :type => "MP2D"
+ },
:similarity => {
:method => "Algorithm::Similarity.tanimoto",
:min => 0.1
@@ -21,33 +24,85 @@ class ModelTest < MiniTest::Test
substance = training_dataset.substances[10]
prediction = model.predict substance
assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
+ substance = Compound.from_smiles "NC(=O)OCCC"
+ prediction = model.predict substance
+ refute_nil prediction[:value]
+ refute_nil prediction[:prediction_interval]
+ refute_empty prediction[:neighbors]
end
- def test_regression_parameters
+ def test_weighted_average
+ training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
algorithms = {
- :descriptors => [ "MP2D" ],
:similarity => {
- :method => "Algorithm::Similarity.tanimoto",
- :min => 0.3
+ :min => 0
},
:prediction => {
:method => "Algorithm::Regression.weighted_average",
},
- :feature_selection => nil,
}
- training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv")
- model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
- assert_kind_of Model::LazarRegression, model
- assert_equal "Algorithm::Regression.weighted_average", model.algorithms[:prediction][:method]
- assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
- assert_equal algorithms[:similarity][:min], model.algorithms[:similarity][:min]
- assert_equal algorithms[:prediction][:parameters], model.algorithms[:prediction][:parameters]
- substance = training_dataset.substances[10]
- prediction = model.predict substance
- assert_equal 0.83, prediction[:value].round(2)
+ model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
+ compound = Compound.from_smiles "CC(C)(C)CN"
+ prediction = model.predict compound
+ assert_equal -0.86, prediction[:value].round(2)
+ assert_equal model.substance_ids.size, prediction[:neighbors].size
end
- def test_physchem_regression
+ def test_mpd_fingerprints
+ training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
+ algorithms = {
+ :descriptors => {
+ :method => "fingerprint",
+ :type => "MP2D"
+ },
+ }
+ model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
+ compound = Compound.from_smiles "CCCSCCSCC"
+ prediction = model.predict compound
+ assert_equal 4, prediction[:neighbors].size
+ assert_equal 1.37, prediction[:value].round(2)
+ end
+
+ def test_local_physchem_regression
+ training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
+ algorithms = {
+ :descriptors => {
+ :method => "calculate_properties",
+ :features => PhysChem.openbabel_descriptors,
+ },
+ :similarity => {
+ :method => "Algorithm::Similarity.weighted_cosine",
+ :min => 0.5
+ },
+ }
+ model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms)
+ compound = Compound.from_smiles "NC(=O)OCCC"
+ prediction = model.predict compound
+ refute_nil prediction[:value]
+ end
+
+ def test_local_physchem_regression_with_feature_selection
+ training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
+ algorithms = {
+ :descriptors => {
+ :method => "calculate_properties",
+ :features => PhysChem.openbabel_descriptors,
+ },
+ :similarity => {
+ :method => "Algorithm::Similarity.weighted_cosine",
+ :min => 0.5
+ },
+ :feature_selection => {
+ :method => "Algorithm::FeatureSelection.correlation_filter",
+ },
+ }
+ model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms)
+ compound = Compound.from_smiles "NC(=O)OCCC"
+ prediction = model.predict compound
+ refute_nil prediction[:value]
+ end
+
+ def test_unweighted_cosine_physchem_regression
algorithms = {
:descriptors => {
:method => "calculate_properties",
@@ -70,24 +125,6 @@ class ModelTest < MiniTest::Test
# TODO test prediction
end
- def test_nanoparticle_default
- training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
- unless training_dataset
- Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm")
- training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
- end
- model = Model::Lazar.create training_dataset: training_dataset
- assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
- assert_equal "Algorithm::Similarity.weighted_cosine", model.algorithms[:similarity][:method]
- prediction = model.predict training_dataset.substances[14]
- assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
-
- end
-
- def test_nanoparticle_parameters
- skip
- end
-
def test_regression_with_feature_selection
algorithms = {
:feature_selection => {
@@ -103,4 +140,31 @@ class ModelTest < MiniTest::Test
assert_equal algorithms[:feature_selection][:method], model.algorithms[:feature_selection][:method]
end
+ def test_regression_parameters
+ algorithms = {
+ :descriptors => {
+ :method => "fingerprint",
+ :type => "MP2D"
+ },
+ :similarity => {
+ :method => "Algorithm::Similarity.tanimoto",
+ :min => 0.3
+ },
+ :prediction => {
+ :method => "Algorithm::Regression.weighted_average",
+ },
+ :feature_selection => nil,
+ }
+ training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv")
+ model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
+ assert_kind_of Model::LazarRegression, model
+ assert_equal "Algorithm::Regression.weighted_average", model.algorithms[:prediction][:method]
+ assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
+ assert_equal algorithms[:similarity][:min], model.algorithms[:similarity][:min]
+ assert_equal algorithms[:prediction][:parameters], model.algorithms[:prediction][:parameters]
+ substance = training_dataset.substances[10]
+ prediction = model.predict substance
+ assert_equal 0.83, prediction[:value].round(2)
+ end
+
end
diff --git a/test/regression.rb b/test/regression.rb
deleted file mode 100644
index cdbac4b..0000000
--- a/test/regression.rb
+++ /dev/null
@@ -1,86 +0,0 @@
-require_relative "setup.rb"
-
-class LazarRegressionTest < MiniTest::Test
-
- def test_weighted_average
- training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
- algorithms = {
- :similarity => {
- :min => 0
- },
- :prediction => {
- :method => "Algorithm::Regression.weighted_average",
- },
- }
- model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
- compound = Compound.from_smiles "CC(C)(C)CN"
- prediction = model.predict compound
- assert_equal -0.86, prediction[:value].round(2)
- assert_equal 88, prediction[:neighbors].size
- end
-
- def test_mpd_fingerprints
- training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
- algorithms = {
- :descriptors => [ "MP2D" ]
- }
- model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
- compound = Compound.from_smiles "CCCSCCSCC"
- prediction = model.predict compound
- assert_equal 3, prediction[:neighbors].size
- assert_equal 1.37, prediction[:value].round(2)
- end
-
- def test_local_fingerprint_regression
- training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
- model = Model::Lazar.create training_dataset: training_dataset
- compound = Compound.from_smiles "NC(=O)OCCC"
- prediction = model.predict compound
- refute_nil prediction[:value]
- refute_nil prediction[:prediction_interval]
- refute_empty prediction[:neighbors]
- end
-
- def test_local_physchem_regression
- training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
- algorithms = {
- :descriptors => [PhysChem::OPENBABEL],
- :similarity => {
- :method => "Algorithm::Similarity.weighted_cosine",
- :min => 0.5
- },
- }
- model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms)
- p model
- compound = Compound.from_smiles "NC(=O)OCCC"
- prediction = model.predict compound
- refute_nil prediction[:value]
- end
-
- def test_local_physchem_regression_with_feature_selection
- training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv"
- algorithms = {
- :descriptors => {
- :method => "calculated_properties",
- :types => ["OPENBABEL"]
- },
- :similarity => {
- :method => "Algorithm::Similarity.weighted_cosine",
- :min => 0.5
- },
- :feature_selection => {
- :method => "Algorithm::FeatureSelection.correlation_filter",
- },
- }
- model = Model::Lazar.create(training_dataset.features.first, training_dataset, algorithms)
- p model
- compound = Compound.from_smiles "NC(=O)OCCC"
- prediction = model.predict compound
- refute_nil prediction[:value]
- end
-
- def test_local_physchem_classification
- skip
- end
-
-end
diff --git a/test/nanoparticles.rb b/test/validation-nanoparticle.rb
index 9a67e63..3692515 100644
--- a/test/nanoparticles.rb
+++ b/test/validation-nanoparticle.rb
@@ -12,18 +12,6 @@ class NanoparticleTest < MiniTest::Test
@prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first
end
- def test_nanoparticle_model
- model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature
- nanoparticle = @training_dataset.nanoparticles[-34]
- prediction = model.predict nanoparticle
- refute_nil prediction[:value]
- assert_includes nanoparticle.dataset_ids, @training_dataset.id
- assert true, @prediction_feature.measured
- model.delete
- end
-
- # validations
-
def test_validate_default_nanoparticle_model
model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature
cv = CrossValidation.create model