From 05386e748270c337c66f6f379317ea4b25905236 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Wed, 4 May 2016 19:24:42 +0200
Subject: first reasonable results for nanoparticle crossvalidation

---
 lib/crossvalidation.rb       |   4 +-
 lib/model.rb                 | 101 +++++++++++++++++++------------------------
 lib/nanoparticle.rb          |  18 ++++++--
 lib/regression.rb            |  38 ++++++++--------
 lib/validation-statistics.rb |   7 ++-
 5 files changed, 84 insertions(+), 84 deletions(-)

(limited to 'lib')

diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 0ae36c4..e1f956b 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -141,7 +141,7 @@ module OpenTox
             :measured => p[1],
             :predicted => p[2],
             #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs,
-            :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs,
+            :error => (p[1]-p[2]).abs,
             :relative_error => (p[1]-p[2]).abs/p[1],
             :confidence => p[3],
             :neighbors => neighbors
@@ -152,7 +152,7 @@ module OpenTox
 
     def confidence_plot
       tmpfile = "/tmp/#{id.to_s}_confidence.png"
-      sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact
+      sorted_predictions = predictions.collect{|p| [(p[1]-p[2]).abs,p[3]] if p[1] and p[2]}.compact
       R.assign "error", sorted_predictions.collect{|p| p[0]}
       R.assign "confidence", sorted_predictions.collect{|p| p[1]}
       # TODO fix axis names
diff --git a/lib/model.rb b/lib/model.rb
index f61368e..841ab20 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -2,7 +2,7 @@ module OpenTox
 
   module Model
 
-    class Model
+    class Lazar 
       include OpenTox
       include Mongoid::Document
       include Mongoid::Timestamps
@@ -10,27 +10,13 @@ module OpenTox
 
       field :name, type: String
       field :creator, type: String, default: __FILE__
-      # datasets
       field :training_dataset_id, type: BSON::ObjectId
-      # algorithms
       field :prediction_algorithm, type: String
-      # prediction feature
       field :prediction_feature_id, type: BSON::ObjectId
-
-      def training_dataset
-        Dataset.find(training_dataset_id)
-      end
-
-      def prediction_feature
-        Feature.find(prediction_feature_id)
-      end
-    end
-
-    class Lazar < Model
-
-      # algorithms
       field :neighbor_algorithm, type: String
       field :neighbor_algorithm_parameters, type: Hash, default: {}
+      field :feature_selection_algorithm, type: String
+      field :relevant_features, type: Hash
 
       # Create a lazar model from a training_dataset and a feature_dataset
       # @param [OpenTox::Dataset] training_dataset
@@ -45,10 +31,43 @@ module OpenTox
         self.name ||= "#{training_dataset.name} #{prediction_feature.name}" 
         self.neighbor_algorithm_parameters ||= {}
         self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id
+
+        Algorithm.run(feature_selection_algorithm, self) if feature_selection_algorithm
         save
         self
       end
 
+      def correlation_filter
+        toxicities = []
+        substances = []
+        training_dataset.substances.each do |s|
+          s["toxicities"][prediction_feature_id].each do |act|
+            toxicities << act
+            substances << s
+          end
+        end
+        R.assign "tox", toxicities
+        feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq
+        feature_ids.each do |feature_id|
+          feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id]}
+          R.assign "feature", feature_values
+          begin
+            #R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')"
+            R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='complete')"
+            pvalue = R.eval("cor$p.value").to_ruby
+            if pvalue <= 0.05
+              r = R.eval("cor$estimate").to_ruby
+              relevant_features[feature] = {}
+              relevant_features[feature]["pvalue"] = pvalue
+              relevant_features[feature]["r"] = r
+            end
+          rescue
+            warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{toxicities}) failed."
+          end
+        end
+        relevant_features.sort!{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h
+      end
+
       def predict_compound compound
         neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
         # remove neighbors without prediction_feature
@@ -63,7 +82,6 @@ module OpenTox
           prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound."
           neighbors.delete_if{|n| n["_id"] == compound.id}
         end
-        #neighbors.delete_if{|n| n['toxicities'].empty? or n['toxicities'][prediction_feature.id.to_s] == [nil] }
         if neighbors.empty?
           prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []})
         else
@@ -123,6 +141,14 @@ module OpenTox
 
       end
 
+      def training_dataset
+        Dataset.find(training_dataset_id)
+      end
+
+      def prediction_feature
+        Feature.find(prediction_feature_id)
+      end
+
     end
 
     class LazarClassification < Lazar
@@ -229,45 +255,6 @@ module OpenTox
       end
     end
 
-    class NanoLazar
-      include OpenTox
-      include Mongoid::Document
-      include Mongoid::Timestamps
-      store_in collection: "models"
-
-      field :name, type: String
-      field :creator, type: String, default: __FILE__
-      # datasets
-      field :training_dataset_id, type: BSON::ObjectId
-      # algorithms
-      field :prediction_algorithm, type: String
-      # prediction feature
-      field :prediction_feature_id, type: BSON::ObjectId
-      field :training_particle_ids, type: Array
-
-      def self.create_all
-        nanoparticles = Nanoparticle.all
-        toxfeatures = Nanoparticle.all.collect{|np| np.toxicities.keys}.flatten.uniq.collect{|id| Feature.find id}
-        tox = {}
-        toxfeatures.each do |t|
-          tox[t] = nanoparticles.select{|np| np.toxicities.keys.include? t.id.to_s}
-        end
-        tox.select!{|t,nps| nps.size > 50}
-        tox.collect do |t,nps|
-          find_or_create_by(:prediction_feature_id => t.id, :training_particle_ids => nps.collect{|np| np.id})
-        end
-      end
-
-      def predict nanoparticle
-        training = training_particle_ids.collect{|id| Nanoparticle.find id}
-        training_features = training.collect{|t| t.physchem_descriptors.keys}.flatten.uniq
-        query_features = nanoparticle.physchem_descriptors.keys
-        common_features = (training_features & query_features)
-        #p common_features
-      end
-
-    end
-
   end
 
 end
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
index 83b97a9..dda4a9f 100644
--- a/lib/nanoparticle.rb
+++ b/lib/nanoparticle.rb
@@ -8,7 +8,7 @@ module OpenTox
     field :bundles, type: Array, default: []
 
     def nanoparticle_neighbors params
-      Dataset.find(params[:training_dataset_id]).nanoparticles.collect{|np| {"_id" => np.id, "tanimoto" => 1}}
+      Dataset.find(params[:training_dataset_id]).nanoparticles.collect{|np| np["tanimoto"] = 1; np}
     end
 
     def add_feature feature, value
@@ -19,7 +19,19 @@ module OpenTox
         physchem_descriptors[feature.id.to_s].uniq!
       when "TOX"
         toxicities[feature.id.to_s] ||= []
-        toxicities[feature.id.to_s] << value
+        # TODO generic way of parsing TOX values
+        if feature.name == "7.99 Toxicity (other) ICP-AES" and feature.unit == "mL/ug(Mg)" 
+          toxicities[feature.id.to_s] << -Math.log10(value)
+        #if value.numeric?
+          #begin
+          #rescue
+            #p feature
+            #p value
+            #exit
+          #end
+        else
+          toxicities[feature.id.to_s] << value
+        end
         toxicities[feature.id.to_s].uniq!
       else
         warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted."
@@ -29,7 +41,7 @@ module OpenTox
 
     def parse_ambit_value feature, v
       v.delete "unit"
-      # TODO: mmol/log10 conversion
+      # TODO: ppm instead of weights
       if v.keys == ["textValue"]
         add_feature feature, v["textValue"]
       elsif v.keys == ["loValue"]
diff --git a/lib/regression.rb b/lib/regression.rb
index 694a2dc..d2c4e91 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -12,16 +12,15 @@ module OpenTox
           sim ||= 1 # TODO: sim f nanoparticles
           if row["toxicities"][params[:prediction_feature_id].to_s]
             row["toxicities"][params[:prediction_feature_id].to_s].each do |act|
-              weighted_sum += sim*Math.log10(act)
+              weighted_sum += sim*act
               sim_sum += sim
             end
           end
         end
-        sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
+        sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum
         {:value => prediction}
       end
 
-      # TODO explicit neighbors, also for physchem
       def self.local_fingerprint_regression  compound, params, method='pls'#, method_params="sigma=0.05"
         neighbors = params[:neighbors]
         return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
@@ -35,7 +34,7 @@ module OpenTox
           fingerprint = neighbor.fingerprint
           if row["toxicities"][params[:prediction_feature_id].to_s]
             row["toxicities"][params[:prediction_feature_id].to_s].each do |act|
-              activities << Math.log10(act)
+              activities << act
               weights << row["tanimoto"]
               fingerprint_ids.each_with_index do |id,j|
                 fingerprints[id] ||= []
@@ -67,9 +66,9 @@ module OpenTox
             prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
             return prediction
           else
-            prediction[:prediction_interval] = [10**(prediction[:value]-1.96*prediction[:rmse]), 10**(prediction[:value]+1.96*prediction[:rmse])]
-            prediction[:value] = 10**prediction[:value]
-            prediction[:rmse] = 10**prediction[:rmse]
+            prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]]
+            prediction[:value] = prediction[:value]
+            prediction[:rmse] = prediction[:rmse]
             prediction
           end
         end
@@ -96,7 +95,7 @@ module OpenTox
             n["tanimoto"] ?  weights << n["tanimoto"] : weights << 1.0 # TODO cosine ?
             neighbor.physchem_descriptors.each do |pid,values| 
               values.uniq!
-              warn "More than one value for #{Feature.find(pid).name}: #{values.join(', ')}" unless values.size == 1
+              warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1
               j = pc_ids.index(pid)+1
               data_frame[j] ||= []
               data_frame[j][i] = values.for_R
@@ -121,7 +120,9 @@ module OpenTox
           result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
           return result
         else
-          query_descriptors = pc_ids.collect{|i| compound.physchem_descriptors[i].for_R if compound.physchem_descriptors[i]}.compact
+          query_descriptors = pc_ids.collect do |i|
+            compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA"
+          end
           remove_idx = []
           query_descriptors.each_with_index do |v,i|
             remove_idx << i if v == "NA"
@@ -137,7 +138,6 @@ module OpenTox
             prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
             return prediction
           else
-            prediction[:value] = 10**prediction[:value]
             prediction
           end
         end
@@ -148,6 +148,7 @@ module OpenTox
         R.assign "weights", training_weights
         r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
 rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
+=begin
         File.open("tmp.R","w+"){|f|
           f.puts "suppressPackageStartupMessages({
   library(iterators,lib=\"#{rlib}\")
@@ -170,20 +171,21 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
           f.puts "names(fingerprint) <- features" 
           f.puts "prediction <- predict(model,fingerprint)"
         }
+=end
         
         R.eval "data <- #{r_data_frame}"
         R.assign "features", training_features
         begin
           R.eval "names(data) <- append(c('activities'),features)" #
           R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)"
-        R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
-        R.eval "names(fingerprint) <- features" 
-        R.eval "prediction <- predict(model,fingerprint)"
-        {
-          :value => R.eval("prediction").to_f,
-          :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f,
-          :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f,
-        }
+          R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
+          R.eval "names(fingerprint) <- features" 
+          R.eval "prediction <- predict(model,fingerprint)"
+          {
+            :value => R.eval("prediction").to_f,
+            :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f,
+            :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f,
+          }
         rescue 
           return nil
         end
diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index c6b2a07..b7c95f6 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -63,16 +63,15 @@ module OpenTox
     end
 
     def self.regression predictions
-      # TODO: prediction intervals
       rmse = 0
       mae = 0
       x = []
       y = []
       predictions.each do |cid,pred|
         if pred[:value] and pred[:measured] #and pred[:measured] != [nil]
-          x << -Math.log10(pred[:measured].median)
-          y << -Math.log10(pred[:value])
-          error = Math.log10(pred[:value])-Math.log10(pred[:measured].median)
+          x << pred[:measured].median
+          y << pred[:value]
+          error = pred[:value]-pred[:measured].median
           rmse += error**2
           mae += error.abs
         else
-- 
cgit v1.2.3