From 9a06f2ff5ae6bdbe7dc90555599e186f1585e0d2 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 10 Nov 2016 15:27:26 +0100 Subject: Model::NanoPrediction parameters --- lib/caret.rb | 2 +- lib/import.rb | 7 +++- lib/model.rb | 51 +++++++++++------------------ lib/similarity.rb | 4 +++ test/model-nanoparticle.rb | 30 +++++++++++++++++ test/nanomaterial-prediction-models.rb | 60 ++++++++++++++++++++++++++++++++++ test/validation-nanoparticle.rb | 19 +++++++++++ 7 files changed, 139 insertions(+), 34 deletions(-) create mode 100644 test/nanomaterial-prediction-models.rb diff --git a/lib/caret.rb b/lib/caret.rb index 18bfc41..7e4f771 100644 --- a/lib/caret.rb +++ b/lib/caret.rb @@ -12,7 +12,7 @@ module OpenTox independent_variables.delete_at i query_variables.delete_at i end - if independent_variables.flatten.uniq == ["NA"] + if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == [] prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." elsif diff --git a/lib/import.rb b/lib/import.rb index 541c9b5..8f640b1 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -5,7 +5,12 @@ module OpenTox class Enanomapper include OpenTox - def self.mirror dir="." + def self.mirror dir=nil + # clean download dir + dir ||= File.join(File.dirname(__FILE__),"..","data","enm") + FileUtils.rm_rf dir + FileUtils.mkdir_p dir + #get list of bundle URIs bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] File.open(File.join(dir,"bundles.json"),"w+"){|f| f.puts JSON.pretty_generate(bundles)} diff --git a/lib/model.rb b/lib/model.rb index 549cbd2..809dc48 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -106,7 +106,7 @@ module OpenTox else model.algorithms[type] = parameters end - end + end if algorithms # parse dependent_variables from training dataset training_dataset.substances.each do |substance| @@ -249,6 +249,7 @@ module OpenTox elsif neighbor_similarities.size == 1 prediction.merge!({:value => dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]}) else + query_descriptors.collect!{|d| d ? 1 : 0} if independent_variables[0][0].numeric? # call prediction algorithm result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors prediction.merge! result @@ -343,7 +344,7 @@ module OpenTox field :unit, type: String field :model_id, type: BSON::ObjectId field :repeated_crossvalidation_id, type: BSON::ObjectId - field :leave_one_out_validation_id, type: BSON::ObjectId + #field :leave_one_out_validation_id, type: BSON::ObjectId def predict object model.predict object @@ -398,42 +399,28 @@ module OpenTox class NanoPrediction < Prediction - def self.from_json_dump dir, category - Import::Enanomapper.import dir - training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - unless training_dataset - Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm") + def self.create training_dataset: nil, prediction_feature:nil, algorithms: nil + + # find/import training_dataset + training_dataset ||= Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + unless training_dataset # try to import from json dump + Import::Enanomapper.import training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + unless training_dataset + Import::Enanomapper.mirror + Import::Enanomapper.import + training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset + end end - prediction_model = self.new( - :endpoint => "log2(Net cell association)", - :source => "https://data.enanomapper.net/", - :species => "A549 human lung epithelial carcinoma cells", - :unit => "log2(ug/Mg)" - ) - prediction_feature = Feature.where(name: "log2(Net cell association)", category: "TOX").first - model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset) - prediction_model[:model_id] = model.id - repeated_cv = Validation::RepeatedCrossValidation.create model - prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id - #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id - prediction_model.save - prediction_model - end + prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first - def self.create dir: dir, algorithms: algorithms - training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - unless training_dataset - Import::Enanomapper.import dir - training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - end prediction_model = self.new( - :endpoint => "log2(Net cell association)", - :source => "https://data.enanomapper.net/", + :endpoint => prediction_feature.name, + :source => prediction_feature.source, :species => "A549 human lung epithelial carcinoma cells", - :unit => "log2(ug/Mg)" + :unit => prediction_feature.unit ) - prediction_feature = Feature.where(name: "log2(Net cell association)", category: "TOX").first model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms) prediction_model[:model_id] = model.id repeated_cv = Validation::RepeatedCrossValidation.create model diff --git a/lib/similarity.rb b/lib/similarity.rb index 772e812..0901936 100644 --- a/lib/similarity.rb +++ b/lib/similarity.rb @@ -19,6 +19,10 @@ module OpenTox ( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f end + #def self.weighted_tanimoto fingerprints + #( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f + #end + def self.euclid scaled_properties sq = scaled_properties[0].zip(scaled_properties[1]).map{|a,b| (a - b) ** 2} Math.sqrt(sq.inject(0) {|s,c| s + c}) diff --git a/test/model-nanoparticle.rb b/test/model-nanoparticle.rb index 88032bc..c5f3223 100644 --- a/test/model-nanoparticle.rb +++ b/test/model-nanoparticle.rb @@ -61,6 +61,36 @@ class NanoparticleModelTest < MiniTest::Test model.delete end + def test_nanoparticle_fingerprint_model_with_feature_selection + assert true, @prediction_feature.measured + algorithms = { + :descriptors => { + :method => "fingerprint", + :type => "MP2D", + }, + :similarity => { + :method => "Algorithm::Similarity.tanimoto", + :min => 0.1 + }, + } + model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms + refute_empty model.algorithms[:feature_selection] + refute_empty model.dependent_variables + refute_empty model.descriptor_ids + refute_empty model.independent_variables + assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method] + assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method] + nanoparticle = @training_dataset.nanoparticles[-34] + assert_includes nanoparticle.dataset_ids, @training_dataset.id + prediction = model.predict nanoparticle + refute_nil prediction[:value] + assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." + prediction = model.predict @training_dataset.substances[14] + refute_nil prediction[:value] + assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." + model.delete + end + def test_nanoparticle_calculated_properties_model skip "Nanoparticle calculate_properties similarity not yet implemented" assert true, @prediction_feature.measured diff --git a/test/nanomaterial-prediction-models.rb b/test/nanomaterial-prediction-models.rb new file mode 100644 index 0000000..b0c05f3 --- /dev/null +++ b/test/nanomaterial-prediction-models.rb @@ -0,0 +1,60 @@ +require_relative "setup.rb" + +class NanomaterialPredictionModelTest < MiniTest::Test + + def setup + @training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + unless @training_dataset + Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm") + @training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + end + @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first + end + + def test_default_nanomaterial_prediction_model + prediction_model = Model::NanoPrediction.create + p prediction_model + [:endpoint,:species,:source].each do |p| + refute_empty prediction_model[p] + end + assert prediction_model.regression? + refute prediction_model.classification? + prediction_model.crossvalidations.each do |cv| + refute_nil cv.r_squared + refute_nil cv.rmse + end + nanoparticle = @training_dataset.nanoparticles[-34] + assert_includes nanoparticle.dataset_ids, @training_dataset.id + prediction = prediction_model.predict nanoparticle + refute_nil prediction[:value] + assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." + prediction_model.delete + end + + def test_nanomaterial_prediction_model_parameters + algorithms = { + :descriptors => { + :method => "fingerprint", + :type => "MP2D", + }, + :similarity => { + :method => "Algorithm::Similarity.tanimoto", + :min => 0.1 + }, + :prediction => { :method => "OpenTox::Algorithm::Regression.weighted_average" }, + :feature_selection => nil + } + prediction_model = Model::NanoPrediction.create algorithms: algorithms + assert prediction_model.regression? + refute prediction_model.classification? + prediction_model.crossvalidations.each do |cv| + refute_nil cv.r_squared + refute_nil cv.rmse + end + nanoparticle = @training_dataset.nanoparticles[-34] + assert_includes nanoparticle.dataset_ids, @training_dataset.id + prediction = prediction_model.predict nanoparticle + refute_nil prediction[:value] + assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions." + end +end diff --git a/test/validation-nanoparticle.rb b/test/validation-nanoparticle.rb index 7391f21..5ed70f2 100644 --- a/test/validation-nanoparticle.rb +++ b/test/validation-nanoparticle.rb @@ -113,4 +113,23 @@ class NanoparticleValidationTest < MiniTest::Test refute_nil cv.rmse end + def test_nanoparticle_fingerprint_model_with_feature_selection + algorithms = { + :descriptors => { + :method => "fingerprint", + :type => "MP2D", + }, + :similarity => { + :method => "Algorithm::Similarity.tanimoto", + :min => 0.1 + }, + } + model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms + cv = CrossValidation.create model + p cv.rmse + p cv.r_squared + refute_nil cv.r_squared + refute_nil cv.rmse + end + end -- cgit v1.2.3