From dc4ab1f4e64d738d6c0b70f0b690a2359685080f Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Wed, 12 Oct 2016 21:32:27 +0200
Subject: physchem regression, correlation_filter for fingerprints

---
 lib/feature_selection.rb | 60 +++++++++++++++++++++++-------------------------
 1 file changed, 29 insertions(+), 31 deletions(-)

(limited to 'lib/feature_selection.rb')

diff --git a/lib/feature_selection.rb b/lib/feature_selection.rb
index 43e3bea..f599539 100644
--- a/lib/feature_selection.rb
+++ b/lib/feature_selection.rb
@@ -3,41 +3,39 @@ module OpenTox
     
     class FeatureSelection
 
-      def self.correlation_filter dataset:, prediction_feature:, types:nil
-        # TODO: speedup, single assignment of all features to R+ parallel computation of significance?
+      def self.correlation_filter model # keeps only descriptors whose Pearson correlation test against the dependent variables gives p <= 0.05; mutates and returns model
         relevant_features = {}
-        measurements = []
-        substances = []
-        dataset.substances.each do |s|
-          dataset.values(s,prediction_feature).each do |act|
-            measurements << act
-            substances << s
-          end
-        end
-        R.assign "tox", measurements
-        feature_ids = dataset.substances.collect{ |s| s["properties"].keys}.flatten.uniq
-        feature_ids.select!{|fid| types.include? Feature.find(fid).category} if types
-        feature_ids.each do |feature_id|
-          feature_values = substances.collect{|s| s["properties"][feature_id].first if s["properties"][feature_id]}
-          unless feature_values.uniq.size == 1
-            R.assign "feature", feature_values
-            begin
-              R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')"
-              pvalue = R.eval("cor$p.value").to_ruby
-              if pvalue <= 0.05
-                r = R.eval("cor$estimate").to_ruby
-                relevant_features[feature_id] = {}
-                relevant_features[feature_id]["pvalue"] = pvalue
-                relevant_features[feature_id]["r"] = r
-                relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby
-                relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby
-              end
-            rescue
-              warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed."
+        R.assign "dependent", model.dependent_variables.collect{|v| to_r(v)} # to_r-converted dependent variables go into the R variable "dependent"
+        model.descriptor_weights = [] # reset here; refilled below with one weight per selected descriptor
+        selected_variables = [] # NOTE(review): relevant_features (assigned above) is not used by the new code
+        selected_descriptor_ids = [] # filled in step with selected_variables
+        model.independent_variables.each_with_index do |v,i| # v: values of one descriptor; i is also used to index model.descriptor_ids (assumes parallel arrays - confirm)
+          R.assign "independent", v.collect{|n| to_r(n)} # overwrites the R variable "independent" on every iteration
+          begin
+            R.eval "cor <- cor.test(dependent,independent,method = 'pearson',use='pairwise')" # test result is kept in the R variable "cor"
+            pvalue = R.eval("cor$p.value").to_ruby
+            if pvalue <= 0.05 # select the descriptor only when the test p-value is at most 0.05
+              model.descriptor_weights << R.eval("cor$estimate").to_ruby**2 # weight is the squared correlation estimate
+              selected_variables << v
+              selected_descriptor_ids << model.descriptor_ids[i]
             end
+          rescue # bare rescue: any StandardError raised in the begin block is reported via warn and the descriptor is not selected
+            #warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with '#{Feature.find(model.descriptor_ids[i]).name}' (#{v}) failed."
+            warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with (#{v}) failed."
           end
         end
-        relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h
+
+        model.independent_variables = selected_variables # replace the model's variables with the selected subset
+        model.descriptor_ids = selected_descriptor_ids # ids are kept aligned with the selected variables
+        model # the same object that was passed in, modified in place
+      end
+
+      def self.to_r v # converts a single Ruby value before it is handed to R.assign: booleans become 0/1, nil and NaN become "NA"
+        return 0 if v == false
+        return 1 if v == true
+        return "NA" if v.nil? # missing value
+        return "NA" if v.is_a? Float and v.nan? # NOTE(review): "NA" is a Ruby String, not a numeric NA - confirm how R.assign treats an array mixing numbers and "NA"
+        v # every other value is passed through unchanged
       end
 
     end
-- 
cgit v1.2.3