From 05386e748270c337c66f6f379317ea4b25905236 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Wed, 4 May 2016 19:24:42 +0200
Subject: first reasonable results for nanoparticle crossvalidation

---
 lib/crossvalidation.rb       |   4 +-
 lib/model.rb                 | 101 +++++++++++++++++++------------------------
 lib/nanoparticle.rb          |  18 ++++++--
 lib/regression.rb            |  38 ++++++++--------
 lib/validation-statistics.rb |   7 ++-
 scripts/import-enm.rb        |   6 +++
 scripts/mg2mmol.rb           |  17 ++++++++
 scripts/mmol2-log10.rb       |  17 ++++++++
 test/nanoparticles.rb        |  23 +++++++++-
 9 files changed, 146 insertions(+), 85 deletions(-)
 create mode 100644 scripts/import-enm.rb
 create mode 100644 scripts/mg2mmol.rb
 create mode 100644 scripts/mmol2-log10.rb

diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 0ae36c4..e1f956b 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -141,7 +141,7 @@ module OpenTox
             :measured => p[1],
             :predicted => p[2],
             #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs,
-            :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs,
+            :error => (p[1]-p[2]).abs,
             :relative_error => (p[1]-p[2]).abs/p[1],
             :confidence => p[3],
             :neighbors => neighbors
@@ -152,7 +152,7 @@ module OpenTox
 
     def confidence_plot
       tmpfile = "/tmp/#{id.to_s}_confidence.png"
-      sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact
+      sorted_predictions = predictions.collect{|p| [(p[1]-p[2]).abs,p[3]] if p[1] and p[2]}.compact
       R.assign "error", sorted_predictions.collect{|p| p[0]}
       R.assign "confidence", sorted_predictions.collect{|p| p[1]}
       # TODO fix axis names
diff --git a/lib/model.rb b/lib/model.rb
index f61368e..841ab20 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -2,7 +2,7 @@ module OpenTox
 
   module Model
 
-    class Model
+    class Lazar 
       include OpenTox
       include Mongoid::Document
       include Mongoid::Timestamps
@@ -10,27 +10,13 @@ module OpenTox
 
       field :name, type: String
       field :creator, type: String, default: __FILE__
-      # datasets
       field :training_dataset_id, type: BSON::ObjectId
-      # algorithms
       field :prediction_algorithm, type: String
-      # prediction feature
       field :prediction_feature_id, type: BSON::ObjectId
-
-      def training_dataset
-        Dataset.find(training_dataset_id)
-      end
-
-      def prediction_feature
-        Feature.find(prediction_feature_id)
-      end
-    end
-
-    class Lazar < Model
-
-      # algorithms
       field :neighbor_algorithm, type: String
       field :neighbor_algorithm_parameters, type: Hash, default: {}
+      field :feature_selection_algorithm, type: String
+      field :relevant_features, type: Hash
 
       # Create a lazar model from a training_dataset and a feature_dataset
       # @param [OpenTox::Dataset] training_dataset
@@ -45,10 +31,43 @@ module OpenTox
         self.name ||= "#{training_dataset.name} #{prediction_feature.name}" 
         self.neighbor_algorithm_parameters ||= {}
         self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id
+
+        Algorithm.run(feature_selection_algorithm, self) if feature_selection_algorithm
         save
         self
       end
 
+      def correlation_filter
+        toxicities = []
+        substances = []
+        training_dataset.substances.each do |s|
+          s["toxicities"][prediction_feature_id].each do |act|
+            toxicities << act
+            substances << s
+          end
+        end
+        R.assign "tox", toxicities
+        feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq
+        feature_ids.each do |feature_id|
+          feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id]}
+          R.assign "feature", feature_values
+          begin
+            #R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')"
+            R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='complete')"
+            pvalue = R.eval("cor$p.value").to_ruby
+            if pvalue <= 0.05
+              r = R.eval("cor$estimate").to_ruby
+              relevant_features[feature] = {}
+              relevant_features[feature]["pvalue"] = pvalue
+              relevant_features[feature]["r"] = r
+            end
+          rescue
+            warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{toxicities}) failed."
+          end
+        end
+        relevant_features.sort!{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h
+      end
+
       def predict_compound compound
         neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
         # remove neighbors without prediction_feature
@@ -63,7 +82,6 @@ module OpenTox
           prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound."
           neighbors.delete_if{|n| n["_id"] == compound.id}
         end
-        #neighbors.delete_if{|n| n['toxicities'].empty? or n['toxicities'][prediction_feature.id.to_s] == [nil] }
         if neighbors.empty?
           prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []})
         else
@@ -123,6 +141,14 @@ module OpenTox
 
       end
 
+      def training_dataset
+        Dataset.find(training_dataset_id)
+      end
+
+      def prediction_feature
+        Feature.find(prediction_feature_id)
+      end
+
     end
 
     class LazarClassification < Lazar
@@ -229,45 +255,6 @@ module OpenTox
       end
     end
 
-    class NanoLazar
-      include OpenTox
-      include Mongoid::Document
-      include Mongoid::Timestamps
-      store_in collection: "models"
-
-      field :name, type: String
-      field :creator, type: String, default: __FILE__
-      # datasets
-      field :training_dataset_id, type: BSON::ObjectId
-      # algorithms
-      field :prediction_algorithm, type: String
-      # prediction feature
-      field :prediction_feature_id, type: BSON::ObjectId
-      field :training_particle_ids, type: Array
-
-      def self.create_all
-        nanoparticles = Nanoparticle.all
-        toxfeatures = Nanoparticle.all.collect{|np| np.toxicities.keys}.flatten.uniq.collect{|id| Feature.find id}
-        tox = {}
-        toxfeatures.each do |t|
-          tox[t] = nanoparticles.select{|np| np.toxicities.keys.include? t.id.to_s}
-        end
-        tox.select!{|t,nps| nps.size > 50}
-        tox.collect do |t,nps|
-          find_or_create_by(:prediction_feature_id => t.id, :training_particle_ids => nps.collect{|np| np.id})
-        end
-      end
-
-      def predict nanoparticle
-        training = training_particle_ids.collect{|id| Nanoparticle.find id}
-        training_features = training.collect{|t| t.physchem_descriptors.keys}.flatten.uniq
-        query_features = nanoparticle.physchem_descriptors.keys
-        common_features = (training_features & query_features)
-        #p common_features
-      end
-
-    end
-
   end
 
 end
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
index 83b97a9..dda4a9f 100644
--- a/lib/nanoparticle.rb
+++ b/lib/nanoparticle.rb
@@ -8,7 +8,7 @@ module OpenTox
     field :bundles, type: Array, default: []
 
     def nanoparticle_neighbors params
-      Dataset.find(params[:training_dataset_id]).nanoparticles.collect{|np| {"_id" => np.id, "tanimoto" => 1}}
+      Dataset.find(params[:training_dataset_id]).nanoparticles.collect{|np| np["tanimoto"] = 1; np}
     end
 
     def add_feature feature, value
@@ -19,7 +19,19 @@ module OpenTox
         physchem_descriptors[feature.id.to_s].uniq!
       when "TOX"
         toxicities[feature.id.to_s] ||= []
-        toxicities[feature.id.to_s] << value
+        # TODO generic way of parsing TOX values
+        if feature.name == "7.99 Toxicity (other) ICP-AES" and feature.unit == "mL/ug(Mg)" 
+          toxicities[feature.id.to_s] << -Math.log10(value)
+        #if value.numeric?
+          #begin
+          #rescue
+            #p feature
+            #p value
+            #exit
+          #end
+        else
+          toxicities[feature.id.to_s] << value
+        end
         toxicities[feature.id.to_s].uniq!
       else
         warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted."
@@ -29,7 +41,7 @@ module OpenTox
 
     def parse_ambit_value feature, v
       v.delete "unit"
-      # TODO: mmol/log10 conversion
+      # TODO: ppm instead of weights
       if v.keys == ["textValue"]
         add_feature feature, v["textValue"]
       elsif v.keys == ["loValue"]
diff --git a/lib/regression.rb b/lib/regression.rb
index 694a2dc..d2c4e91 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -12,16 +12,15 @@ module OpenTox
           sim ||= 1 # TODO: sim f nanoparticles
           if row["toxicities"][params[:prediction_feature_id].to_s]
             row["toxicities"][params[:prediction_feature_id].to_s].each do |act|
-              weighted_sum += sim*Math.log10(act)
+              weighted_sum += sim*act
               sim_sum += sim
             end
           end
         end
-        sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
+        sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum
         {:value => prediction}
       end
 
-      # TODO explicit neighbors, also for physchem
       def self.local_fingerprint_regression  compound, params, method='pls'#, method_params="sigma=0.05"
         neighbors = params[:neighbors]
         return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
@@ -35,7 +34,7 @@ module OpenTox
           fingerprint = neighbor.fingerprint
           if row["toxicities"][params[:prediction_feature_id].to_s]
             row["toxicities"][params[:prediction_feature_id].to_s].each do |act|
-              activities << Math.log10(act)
+              activities << act
               weights << row["tanimoto"]
               fingerprint_ids.each_with_index do |id,j|
                 fingerprints[id] ||= []
@@ -67,9 +66,9 @@ module OpenTox
             prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
             return prediction
           else
-            prediction[:prediction_interval] = [10**(prediction[:value]-1.96*prediction[:rmse]), 10**(prediction[:value]+1.96*prediction[:rmse])]
-            prediction[:value] = 10**prediction[:value]
-            prediction[:rmse] = 10**prediction[:rmse]
+            prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]]
+            prediction[:value] = prediction[:value]
+            prediction[:rmse] = prediction[:rmse]
             prediction
           end
         end
@@ -96,7 +95,7 @@ module OpenTox
             n["tanimoto"] ?  weights << n["tanimoto"] : weights << 1.0 # TODO cosine ?
             neighbor.physchem_descriptors.each do |pid,values| 
               values.uniq!
-              warn "More than one value for #{Feature.find(pid).name}: #{values.join(', ')}" unless values.size == 1
+              warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1
               j = pc_ids.index(pid)+1
               data_frame[j] ||= []
               data_frame[j][i] = values.for_R
@@ -121,7 +120,9 @@ module OpenTox
           result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
           return result
         else
-          query_descriptors = pc_ids.collect{|i| compound.physchem_descriptors[i].for_R if compound.physchem_descriptors[i]}.compact
+          query_descriptors = pc_ids.collect do |i|
+            compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA"
+          end
           remove_idx = []
           query_descriptors.each_with_index do |v,i|
             remove_idx << i if v == "NA"
@@ -137,7 +138,6 @@ module OpenTox
             prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
             return prediction
           else
-            prediction[:value] = 10**prediction[:value]
             prediction
           end
         end
@@ -148,6 +148,7 @@ module OpenTox
         R.assign "weights", training_weights
         r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
 rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
+=begin
         File.open("tmp.R","w+"){|f|
           f.puts "suppressPackageStartupMessages({
   library(iterators,lib=\"#{rlib}\")
@@ -170,20 +171,21 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
           f.puts "names(fingerprint) <- features" 
           f.puts "prediction <- predict(model,fingerprint)"
         }
+=end
         
         R.eval "data <- #{r_data_frame}"
         R.assign "features", training_features
         begin
           R.eval "names(data) <- append(c('activities'),features)" #
           R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)"
-        R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
-        R.eval "names(fingerprint) <- features" 
-        R.eval "prediction <- predict(model,fingerprint)"
-        {
-          :value => R.eval("prediction").to_f,
-          :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f,
-          :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f,
-        }
+          R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
+          R.eval "names(fingerprint) <- features" 
+          R.eval "prediction <- predict(model,fingerprint)"
+          {
+            :value => R.eval("prediction").to_f,
+            :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f,
+            :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f,
+          }
         rescue 
           return nil
         end
diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index c6b2a07..b7c95f6 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -63,16 +63,15 @@ module OpenTox
     end
 
     def self.regression predictions
-      # TODO: prediction intervals
       rmse = 0
       mae = 0
       x = []
       y = []
       predictions.each do |cid,pred|
         if pred[:value] and pred[:measured] #and pred[:measured] != [nil]
-          x << -Math.log10(pred[:measured].median)
-          y << -Math.log10(pred[:value])
-          error = Math.log10(pred[:value])-Math.log10(pred[:measured].median)
+          x << pred[:measured].median
+          y << pred[:value]
+          error = pred[:value]-pred[:measured].median
           rmse += error**2
           mae += error.abs
         else
diff --git a/scripts/import-enm.rb b/scripts/import-enm.rb
new file mode 100644
index 0000000..9cbe5d4
--- /dev/null
+++ b/scripts/import-enm.rb
@@ -0,0 +1,6 @@
+require_relative '../lib/lazar'
+include OpenTox
+$mongo.database.drop
+$gridfs = $mongo.database.fs # recreate GridFS indexes
+Import::Enanomapper.import
+`mongodump -h 127.0.0.1 -d production`
diff --git a/scripts/mg2mmol.rb b/scripts/mg2mmol.rb
new file mode 100644
index 0000000..dc6b953
--- /dev/null
+++ b/scripts/mg2mmol.rb
@@ -0,0 +1,17 @@
+#!/usr/bin/env ruby
+require_relative '../lazar/lib/lazar'
+include OpenTox
+newfile = ARGV[0].sub(/.csv/,"_mmol.csv") 
+p newfile
+CSV.open(newfile, "wb") do |csv|
+  CSV.read(ARGV[0]).each do |line|
+    smi,mg = line
+    if mg.numeric?
+      c = Compound.from_smiles smi
+      mmol = c.mg_to_mmol mg.to_f
+      csv << [smi, mmol]
+    else
+      csv << [smi, mg.gsub(/mg/,'mmol')]
+    end
+  end
+end
diff --git a/scripts/mmol2-log10.rb b/scripts/mmol2-log10.rb
new file mode 100644
index 0000000..0c99a0b
--- /dev/null
+++ b/scripts/mmol2-log10.rb
@@ -0,0 +1,17 @@
+#!/usr/bin/env ruby
+require_relative '../lib/lazar'
+include OpenTox
+newfile = ARGV[0].sub(/.csv/,"_log10.csv") 
+p newfile
+CSV.open(newfile, "wb") do |csv|
+  CSV.read(ARGV[0]).each do |line|
+    smi,mmol = line
+    if mmol.numeric?
+      c = Compound.from_smiles smi
+      mmol = -Math.log10(mmol.to_f)
+      csv << [smi, mmol]
+    else
+      csv << [smi, "-log10(#{mmol})"]
+    end
+  end
+end
diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb
index 46c6620..7308a83 100644
--- a/test/nanoparticles.rb
+++ b/test/nanoparticles.rb
@@ -1,8 +1,14 @@
 require_relative "setup.rb"
 
+
 class NanoparticleTest  < MiniTest::Test
 
+  def setup
+    `mongorestore --db=development #{File.join(File.dirname(__FILE__),"..","dump","production")}`
+  end
+
   def test_import
+    skip
     dataset_ids = Import::Enanomapper.import
     assert_operator Nanoparticle.count , :>, 570, "Only #{Nanoparticle.count} nanoparticles imported"
     assert_operator dataset_ids.size, :>, 8, "Only #{dataset_ids.size} bundles imported"
@@ -17,6 +23,7 @@ class NanoparticleTest  < MiniTest::Test
   end
 
   def test_summaries
+    skip
     features = Feature.all.to_a
     #p features.collect do |f|
       #f if f.category == "TOX"
@@ -51,6 +58,18 @@ class NanoparticleTest  < MiniTest::Test
     end
   end
 
+  def test_create_model_with_feature_selection
+    training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
+    feature = Feature.find_or_create_by(name: "7.99 Toxicity (other) ICP-AES", category: "TOX", unit: "mL/ug(Mg)")
+    model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors"})
+    nanoparticle = training_dataset.nanoparticles[-34]
+    #p nanoparticle.neighbors
+    prediction = model.predict nanoparticle
+    p prediction
+    #p prediction
+    refute_nil prediction[:value]
+  end
+
   def test_create_model
     training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
     feature = Feature.find_or_create_by(name: "7.99 Toxicity (other) ICP-AES", category: "TOX", unit: "mL/ug(Mg)")
@@ -66,7 +85,9 @@ class NanoparticleTest  < MiniTest::Test
   def test_validate_model
     training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
     feature = Feature.find_or_create_by(name: "7.99 Toxicity (other) ICP-AES", category: "TOX", unit: "mL/ug(Mg)")
-    model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors"})
+    #model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors"})
+    model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average", :neighbor_algorithm => "nanoparticle_neighbors"})
+    p model
     cv = RegressionCrossValidation.create model
     p cv
   end
-- 
cgit v1.2.3