From 9a06f2ff5ae6bdbe7dc90555599e186f1585e0d2 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 10 Nov 2016 15:27:26 +0100
Subject: Model::NanoPrediction parameters

---
 lib/caret.rb                           |  2 +-
 lib/import.rb                          |  7 +++-
 lib/model.rb                           | 51 +++++++++++------------------
 lib/similarity.rb                      |  4 +++
 test/model-nanoparticle.rb             | 30 +++++++++++++++++
 test/nanomaterial-prediction-models.rb | 60 ++++++++++++++++++++++++++++++++++
 test/validation-nanoparticle.rb        | 19 +++++++++++
 7 files changed, 139 insertions(+), 34 deletions(-)
 create mode 100644 test/nanomaterial-prediction-models.rb

diff --git a/lib/caret.rb b/lib/caret.rb
index 18bfc41..7e4f771 100644
--- a/lib/caret.rb
+++ b/lib/caret.rb
@@ -12,7 +12,7 @@ module OpenTox
           independent_variables.delete_at i
           query_variables.delete_at i
         end
-        if independent_variables.flatten.uniq == ["NA"] 
+        if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == [] 
           prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
           prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
         elsif
diff --git a/lib/import.rb b/lib/import.rb
index 541c9b5..8f640b1 100644
--- a/lib/import.rb
+++ b/lib/import.rb
@@ -5,7 +5,12 @@ module OpenTox
     class Enanomapper
       include OpenTox
 
-      def self.mirror dir="."
+      def self.mirror dir=nil
+        # use ../data/enm (relative to this file) by default; the dir is wiped (rm_rf) and recreated
+        dir ||= File.join(File.dirname(__FILE__),"..","data","enm")
+        FileUtils.rm_rf dir
+        FileUtils.mkdir_p dir
+
         #get list of bundle URIs
         bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
         File.open(File.join(dir,"bundles.json"),"w+"){|f| f.puts JSON.pretty_generate(bundles)}
diff --git a/lib/model.rb b/lib/model.rb
index 549cbd2..809dc48 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -106,7 +106,7 @@ module OpenTox
           else
             model.algorithms[type] = parameters
           end
-        end
+        end if algorithms
 
         # parse dependent_variables from training dataset
         training_dataset.substances.each do |substance|
@@ -249,6 +249,7 @@ module OpenTox
         elsif neighbor_similarities.size == 1
           prediction.merge!({:value => dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]})
         else
+          query_descriptors.collect!{|d| d ? 1 : 0} if independent_variables[0][0].numeric?
           # call prediction algorithm
           result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors
           prediction.merge! result
@@ -343,7 +344,7 @@ module OpenTox
       field :unit, type: String
       field :model_id, type: BSON::ObjectId
       field :repeated_crossvalidation_id, type: BSON::ObjectId
-      field :leave_one_out_validation_id, type: BSON::ObjectId
+      #field :leave_one_out_validation_id, type: BSON::ObjectId
 
       def predict object
         model.predict object
@@ -398,42 +399,28 @@ module OpenTox
 
     class NanoPrediction < Prediction
 
-      def self.from_json_dump dir, category
-        Import::Enanomapper.import dir
-        training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
-        unless training_dataset
-          Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm")
+      def self.create training_dataset: nil, prediction_feature:nil, algorithms: nil
+        
+        # find/import training_dataset
+        training_dataset ||= Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
+        unless training_dataset # try to import from json dump
+          Import::Enanomapper.import
           training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
+          unless training_dataset
+            Import::Enanomapper.mirror
+            Import::Enanomapper.import
+            training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
+            bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset
+          end
         end
-        prediction_model = self.new(
-          :endpoint => "log2(Net cell association)",
-          :source => "https://data.enanomapper.net/",
-          :species => "A549 human lung epithelial carcinoma cells",
-          :unit => "log2(ug/Mg)"
-        )
-        prediction_feature = Feature.where(name: "log2(Net cell association)", category: "TOX").first
-        model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset)
-        prediction_model[:model_id] = model.id
-        repeated_cv = Validation::RepeatedCrossValidation.create model
-        prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id
-        #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id
-        prediction_model.save
-        prediction_model
-      end
+        prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first
 
-      def self.create dir: dir, algorithms: algorithms
-        training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
-        unless training_dataset
-          Import::Enanomapper.import dir
-          training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
-        end
         prediction_model = self.new(
-          :endpoint => "log2(Net cell association)",
-          :source => "https://data.enanomapper.net/",
+          :endpoint => prediction_feature.name,
+          :source => prediction_feature.source,
           :species => "A549 human lung epithelial carcinoma cells",
-          :unit => "log2(ug/Mg)"
+          :unit => prediction_feature.unit
         )
-        prediction_feature = Feature.where(name: "log2(Net cell association)", category: "TOX").first
         model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms)
         prediction_model[:model_id] = model.id
         repeated_cv = Validation::RepeatedCrossValidation.create model
diff --git a/lib/similarity.rb b/lib/similarity.rb
index 772e812..0901936 100644
--- a/lib/similarity.rb
+++ b/lib/similarity.rb
@@ -19,6 +19,10 @@ module OpenTox
         ( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f
       end
 
+      #def self.weighted_tanimoto fingerprints
+        #( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f
+      #end
+
       def self.euclid scaled_properties
         sq = scaled_properties[0].zip(scaled_properties[1]).map{|a,b| (a - b) ** 2}
         Math.sqrt(sq.inject(0) {|s,c| s + c})
diff --git a/test/model-nanoparticle.rb b/test/model-nanoparticle.rb
index 88032bc..c5f3223 100644
--- a/test/model-nanoparticle.rb
+++ b/test/model-nanoparticle.rb
@@ -61,6 +61,36 @@ class NanoparticleModelTest  < MiniTest::Test
     model.delete
   end
 
+  def test_nanoparticle_fingerprint_model_with_feature_selection
+    assert true, @prediction_feature.measured
+    algorithms = {
+      :descriptors => {
+        :method => "fingerprint",
+        :type => "MP2D",
+      },
+      :similarity => {
+        :method => "Algorithm::Similarity.tanimoto",
+        :min => 0.1
+      },
+    }
+    model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms
+    refute_empty model.algorithms[:feature_selection]
+    refute_empty model.dependent_variables
+    refute_empty model.descriptor_ids
+    refute_empty model.independent_variables
+    assert_equal "Algorithm::Caret.rf", model.algorithms[:prediction][:method]
+    assert_equal "Algorithm::Similarity.tanimoto", model.algorithms[:similarity][:method]
+    nanoparticle = @training_dataset.nanoparticles[-34]
+    assert_includes nanoparticle.dataset_ids, @training_dataset.id
+    prediction = model.predict nanoparticle
+    refute_nil prediction[:value]
+    assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
+    prediction = model.predict @training_dataset.substances[14]
+    refute_nil prediction[:value]
+    assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
+    model.delete
+  end
+
   def test_nanoparticle_calculated_properties_model
     skip "Nanoparticle calculate_properties similarity not yet implemented"
     assert true, @prediction_feature.measured
diff --git a/test/nanomaterial-prediction-models.rb b/test/nanomaterial-prediction-models.rb
new file mode 100644
index 0000000..b0c05f3
--- /dev/null
+++ b/test/nanomaterial-prediction-models.rb
@@ -0,0 +1,60 @@
+require_relative "setup.rb"
+
+class NanomaterialPredictionModelTest < MiniTest::Test
+
+  def setup
+    @training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
+    unless @training_dataset
+      Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm")
+      @training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
+    end
+    @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first
+  end
+
+  def test_default_nanomaterial_prediction_model
+    prediction_model = Model::NanoPrediction.create
+    p prediction_model
+    [:endpoint,:species,:source].each do |p|
+      refute_empty prediction_model[p]
+    end
+    assert prediction_model.regression?
+    refute prediction_model.classification?
+    prediction_model.crossvalidations.each do |cv|
+      refute_nil cv.r_squared
+      refute_nil cv.rmse
+    end
+    nanoparticle = @training_dataset.nanoparticles[-34]
+    assert_includes nanoparticle.dataset_ids, @training_dataset.id
+    prediction = prediction_model.predict nanoparticle
+    refute_nil prediction[:value]
+    assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
+    prediction_model.delete
+  end
+
+  def test_nanomaterial_prediction_model_parameters
+    algorithms = {
+      :descriptors => {
+        :method => "fingerprint",
+        :type => "MP2D",
+      },
+      :similarity => {
+        :method => "Algorithm::Similarity.tanimoto",
+        :min => 0.1
+      },
+      :prediction => { :method => "OpenTox::Algorithm::Regression.weighted_average" },
+      :feature_selection => nil
+    }
+    prediction_model = Model::NanoPrediction.create algorithms: algorithms
+    assert prediction_model.regression?
+    refute prediction_model.classification?
+    prediction_model.crossvalidations.each do |cv|
+      refute_nil cv.r_squared
+      refute_nil cv.rmse
+    end
+    nanoparticle = @training_dataset.nanoparticles[-34]
+    assert_includes nanoparticle.dataset_ids, @training_dataset.id
+    prediction = prediction_model.predict nanoparticle
+    refute_nil prediction[:value]
+    assert_includes prediction[:prediction_interval][0]..prediction[:prediction_interval][1], prediction[:measurements].median, "This assertion assures that measured values are within the prediction interval. It may fail in 5% of the predictions."
+  end
+end
diff --git a/test/validation-nanoparticle.rb b/test/validation-nanoparticle.rb
index 7391f21..5ed70f2 100644
--- a/test/validation-nanoparticle.rb
+++ b/test/validation-nanoparticle.rb
@@ -113,4 +113,23 @@ class NanoparticleValidationTest  < MiniTest::Test
     refute_nil cv.rmse
   end
 
+  def test_nanoparticle_fingerprint_model_with_feature_selection
+    algorithms = {
+      :descriptors => {
+        :method => "fingerprint",
+        :type => "MP2D",
+      },
+      :similarity => {
+        :method => "Algorithm::Similarity.tanimoto",
+        :min => 0.1
+      },
+    }
+    model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms
+    cv = CrossValidation.create model
+    p cv.rmse
+    p cv.r_squared
+    refute_nil cv.r_squared
+    refute_nil cv.rmse
+  end
+
 end
-- 
cgit v1.2.3