improved handling of duplicates in validations

author: Christoph Helma <helma@in-silico.ch> 2016-02-13 13:15:29 +0100
committer: Christoph Helma <helma@in-silico.ch> 2016-02-13 13:15:29 +0100
commit: e778475c578f13f30af4437845716d7e781c2609 (patch)
tree: 82c14dabc4cf29df1f097a9f8c5c4d8b0b406c4d /lib/regression.rb
parent: f61b7d3c65d084747dc1bf87214e5ec0c57326be (diff)
1 file changed, 37 insertions, 25 deletions
diff --git a/lib/regression.rb b/lib/regression.rb
index 7c64d8f..2b41851 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -4,23 +4,19 @@ module OpenTox
     class Regression
 
       def self.weighted_average compound, params
-        #p params.keys
         weighted_sum = 0.0
         sim_sum = 0.0
         confidence = 0.0
         neighbors = params[:neighbors]
-        #activities = []
         neighbors.each do |row|
-          #if row["dataset_ids"].include? params[:training_dataset_id]
-            sim = row["tanimoto"]
-            confidence = sim if sim > confidence # distance to nearest neighbor
-            # TODO add LOO errors
-            row["features"][params[:prediction_feature_id].to_s].each do |act|
-              weighted_sum += sim*Math.log10(act)
-              #activities << act # TODO: Transformation??
-              sim_sum += sim
-            end
-          #end
+          sim = row["tanimoto"]
+          confidence = sim if sim > confidence # distance to nearest neighbor
+          # TODO add LOO errors
+          row["features"][params[:prediction_feature_id].to_s].each do |act|
+            weighted_sum += sim*Math.log10(act)
+            #activities << act # TODO: Transformation??
+            sim_sum += sim
+          end
         end
         #R.assign "activities", activities
         #R.eval "cv = cv(activities)"
@@ -35,7 +31,7 @@ module OpenTox
 
       def self.local_pls_regression  compound, params
         neighbors = params[:neighbors]
-        return {:value => nil, :confidence => nil} unless neighbors.size > 0
+        return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
         activities = []
         fingerprints = {}
         weights = []
@@ -62,21 +58,37 @@ module OpenTox
         fingerprints.each do |k,v| 
           unless v.uniq.size == 1
             data_frame << "factor(c(#{v.collect{|m| m ? "T" : "F"}.join ","}))"
-            variables << "'#{k}'"
+            variables << k
           end
         end
-        begin
+        if variables.empty?
+            result = weighted_average(compound, params)
+            result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
+            return result
+          return {:value => nil, :confidence => nil} # TODO confidence
+        else
           R.eval "data <- data.frame(#{data_frame.join ","})"
-          R.eval "names(data) <- c('activities',#{variables.join ','})"
-          R.eval "model <- plsr(activities ~ .,data = data, ncomp = 3, weights = weights)"
-          compound_features = fingerprint_ids.collect{|f| compound.fingerprint.include? f }
-          R.eval "fingerprint <- rbind(c(#{compound_features.collect{|f| f ? "T" : "F"}.join ','}))"
-          R.eval "names(fingerprint) <- c(#{variables.join ','})"
-          R.eval "prediction <- predict(model,fingerprint)"
-          prediction = 10**R.eval("prediction").to_f
-          {:value => prediction, :confidence => 1} # TODO confidence
-        rescue
-          {:value => nil, :confidence => nil} # TODO confidence
+          R.assign "features", variables
+          R.eval "names(data) <- append(c('activities'),features)" #
+          begin
+            R.eval "model <- plsr(activities ~ .,data = data, ncomp = 4, weights = weights)"
+          rescue # fall back to weighted average
+            result = weighted_average(compound, params)
+            result[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
+            return result
+          end
+          #begin
+            #compound_features = fingerprint_ids.collect{|f| compound.fingerprint.include? f } # FIX
+            compound_features = variables.collect{|f| compound.fingerprint.include? f } 
+            R.eval "fingerprint <- rbind(c(#{compound_features.collect{|f| f ? "T" : "F"}.join ','}))"
+            R.eval "names(fingerprint) <- features" #
+            R.eval "prediction <- predict(model,fingerprint)"
+            prediction = 10**R.eval("prediction").to_f
+            return {:value => prediction, :confidence => 1} # TODO confidence
+          #rescue
+            #p "Prediction failed"
+            #return {:value => nil, :confidence => nil} # TODO confidence
+          #end
         end
       
       end
author	Christoph Helma <helma@in-silico.ch>	2016-02-13 13:15:29 +0100
committer	Christoph Helma <helma@in-silico.ch>	2016-02-13 13:15:29 +0100
commit	e778475c578f13f30af4437845716d7e781c2609 (patch)
tree	82c14dabc4cf29df1f097a9f8c5c4d8b0b406c4d /lib/regression.rb
parent	f61b7d3c65d084747dc1bf87214e5ec0c57326be (diff)