From 753fcc204d93d86c76860bee6e2f7d0468c3c940 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 14 Apr 2016 19:43:24 +0200
Subject: features/toxicities fixed

---
 lib/regression.rb | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index 5021fb3..cb17f25 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -9,8 +9,8 @@ module OpenTox
         neighbors = params[:neighbors]
         neighbors.each do |row|
           sim = row["tanimoto"]
-          if row["features"][params[:prediction_feature_id].to_s]
-            row["features"][params[:prediction_feature_id].to_s].each do |act|
+          if row["toxicities"][params[:prediction_feature_id].to_s]
+            row["toxicities"][params[:prediction_feature_id].to_s].each do |act|
               weighted_sum += sim*Math.log10(act)
               sim_sum += sim
             end
@@ -32,8 +32,8 @@ module OpenTox
         neighbors.each_with_index do |row,i|
           neighbor = Compound.find row["_id"]
           fingerprint = neighbor.fingerprint
-          if row["features"][params[:prediction_feature_id].to_s]
-            row["features"][params[:prediction_feature_id].to_s].each do |act|
+          if row["toxicities"][params[:prediction_feature_id].to_s]
+            row["toxicities"][params[:prediction_feature_id].to_s].each do |act|
               activities << Math.log10(act)
               weights << row["tanimoto"]
               fingerprint_ids.each_with_index do |id,j|
@@ -79,21 +79,24 @@ module OpenTox
 
         neighbors = params[:neighbors]
         return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
-        return {:value => neighbors.first["features"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
+        return {:value => neighbors.first["toxicities"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
 
         activities = []
         weights = []
         physchem = {}
         
-        neighbors.each_with_index do |row,i|
-          neighbor = Compound.find row["_id"]
-          if row["features"][params[:prediction_feature_id].to_s]
-            row["features"][params[:prediction_feature_id].to_s].each do |act|
-              activities << Math.log10(act)
-              weights << row["tanimoto"] # TODO cosine ?
-              neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity
+        neighbors.each_with_index do |n,i|
+          if n["toxicities"][params[:prediction_feature_id].to_s]
+            n["toxicities"][params[:prediction_feature_id].to_s].each do |act|
+              # TODO fix!!!!
+              activities << -Math.log10(act)
+              #if act.numeric?
+              #activities << act
+              n["tanimoto"] ?  weights << n["tanimoto"] : weights << 1.0 # TODO cosine ?
+              neighbor = Substance.find(n["_id"])
+              neighbor.physchem_descriptors.each do |pid,v| # insert physchem only if there is an activity
                 physchem[pid] ||= []
-                physchem[pid] <<  v
+                physchem[pid] +=  v
               end
             end
           end
@@ -110,8 +113,8 @@ module OpenTox
           return result
 
         else
-          data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] }
-          prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]}
+          data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid].collect{|v| "\"#{v.sub('[','').sub(']','')}\"" if v.is_a? String }}
+          prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem_descriptors[pid]}
           if prediction.nil?
             prediction = local_weighted_average(compound, params)
             prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
@@ -127,6 +130,8 @@ module OpenTox
       def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values
         R.assign "weights", training_weights
         r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
+        #p r_data_frame
+        File.open("tmp.R","w+"){|f| f.puts "data <- #{r_data_frame}\n"}
         R.eval "data <- #{r_data_frame}"
         R.assign "features", training_features
         R.eval "names(data) <- append(c('activities'),features)" #
-- 
cgit v1.2.3


From cfc64a2966ab38698e499f0b44f41208ee77a07f Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Tue, 26 Apr 2016 17:38:15 +0200
Subject: first nanomaterial prediction

---
 lib/regression.rb | 99 ++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 69 insertions(+), 30 deletions(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index cb17f25..5610a77 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -75,46 +75,62 @@ module OpenTox
       
       end
 
-      def self.local_physchem_regression  compound, params, method="plsr"#, method_params="ncomp = 4"
+      def self.local_physchem_regression  compound, params, method="pls"#, method_params="ncomp = 4"
+
+        neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s]} # use only neighbors with measured activities
 
-        neighbors = params[:neighbors]
         return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
         return {:value => neighbors.first["toxicities"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
 
         activities = []
         weights = []
-        physchem = {}
+        pc_ids = neighbors.collect{|n| n.physchem_descriptors.keys}.flatten.uniq
+        data_frame = []
+        data_frame[0] = []
         
         neighbors.each_with_index do |n,i|
-          if n["toxicities"][params[:prediction_feature_id].to_s]
-            n["toxicities"][params[:prediction_feature_id].to_s].each do |act|
-              # TODO fix!!!!
-              activities << -Math.log10(act)
-              #if act.numeric?
-              #activities << act
-              n["tanimoto"] ?  weights << n["tanimoto"] : weights << 1.0 # TODO cosine ?
-              neighbor = Substance.find(n["_id"])
-              neighbor.physchem_descriptors.each do |pid,v| # insert physchem only if there is an activity
-                physchem[pid] ||= []
-                physchem[pid] +=  v
-              end
+          neighbor = Substance.find(n["_id"])
+          n["toxicities"][params[:prediction_feature_id].to_s].each do |act|
+            data_frame[0][i] = act
+            n["tanimoto"] ?  weights << n["tanimoto"] : weights << 1.0 # TODO cosine ?
+            neighbor.physchem_descriptors.each do |pid,values| 
+              values.uniq!
+              warn "More than one value for #{Feature.find(pid).name}: #{values.join(', ')}" unless values.size == 1
+              j = pc_ids.index(pid)+1
+              data_frame[j] ||= []
+              data_frame[j][i] = values.for_R
             end
           end
+          (0..pc_ids.size+1).each do |j| # for R: fill empty values with NA
+            data_frame[j] ||= []
+            data_frame[j][i] ||= "NA"
+          end
         end
-
-        # remove properties with a single value
-        physchem.each do |pid,v|
-          physchem.delete(pid) if v.uniq.size <= 1
+        remove_idx = []
+        data_frame.each_with_index do |r,i|
+          remove_idx << i if r.uniq.size == 1 # remove properties with a single value
+        end
+        remove_idx.reverse.each do |i|
+          data_frame.delete_at i
+          pc_ids.delete_at i
         end
 
-        if physchem.empty?
+        if pc_ids.empty?
           result = local_weighted_average(compound, params)
           result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
           return result
-
         else
-          data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid].collect{|v| "\"#{v.sub('[','').sub(']','')}\"" if v.is_a? String }}
-          prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem_descriptors[pid]}
+          query_descriptors = pc_ids.collect{|i| compound.physchem_descriptors[i].for_R}
+          remove_idx = []
+          query_descriptors.each_with_index do |v,i|
+            remove_idx << i if v == "NA"
+          end
+          remove_idx.reverse.each do |i|
+            data_frame.delete_at i
+            pc_ids.delete_at i
+            query_descriptors.delete_at i
+          end
+          prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors
           if prediction.nil?
             prediction = local_weighted_average(compound, params)
             prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
@@ -130,16 +146,39 @@ module OpenTox
       def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values
         R.assign "weights", training_weights
         r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
-        #p r_data_frame
-        File.open("tmp.R","w+"){|f| f.puts "data <- #{r_data_frame}\n"}
+rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
+        File.open("tmp.R","w+"){|f|
+          f.puts "suppressPackageStartupMessages({
+  library(iterators,lib=\"#{rlib}\")
+  library(foreach,lib=\"#{rlib}\")
+  library(ggplot2,lib=\"#{rlib}\")
+  library(grid,lib=\"#{rlib}\")
+  library(gridExtra,lib=\"#{rlib}\")
+  library(pls,lib=\"#{rlib}\")
+  library(caret,lib=\"#{rlib}\")
+  library(doMC,lib=\"#{rlib}\")
+  registerDoMC(#{NR_CORES})
+})"
+
+          f.puts "data <- #{r_data_frame}\n"
+          f.puts "weights <- c(#{training_weights.join(', ')})"
+          f.puts "features <- c(#{training_features.join(', ')})"
+          f.puts "names(data) <- append(c('activities'),features)" #
+          f.puts "model <- train(activities ~ ., data = data, method = '#{method}')"
+          f.puts "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
+          f.puts "names(fingerprint) <- features" 
+          f.puts "prediction <- predict(model,fingerprint)"
+        }
+        
         R.eval "data <- #{r_data_frame}"
         R.assign "features", training_features
         R.eval "names(data) <- append(c('activities'),features)" #
-        begin
-          R.eval "model <- train(activities ~ ., data = data, method = '#{method}')"
-        rescue 
-          return nil
-        end
+        #begin
+          R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)"
+        #rescue 
+          #return nil
+        #end
+        p query_feature_values
         R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
         R.eval "names(fingerprint) <- features" 
         R.eval "prediction <- predict(model,fingerprint)"
-- 
cgit v1.2.3


From 32d767ee7cfcc19337892551906950621f348174 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 28 Apr 2016 08:11:12 +0200
Subject: nanoparticle crossvalidation technically working

---
 lib/regression.rb | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index 5610a77..3a59c14 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -9,6 +9,7 @@ module OpenTox
         neighbors = params[:neighbors]
         neighbors.each do |row|
           sim = row["tanimoto"]
+          sim ||= 1 # TODO: sim f nanoparticles
           if row["toxicities"][params[:prediction_feature_id].to_s]
             row["toxicities"][params[:prediction_feature_id].to_s].each do |act|
               weighted_sum += sim*Math.log10(act)
@@ -120,7 +121,7 @@ module OpenTox
           result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
           return result
         else
-          query_descriptors = pc_ids.collect{|i| compound.physchem_descriptors[i].for_R}
+          query_descriptors = pc_ids.collect{|i| compound.physchem_descriptors[i].for_R if compound.physchem_descriptors[i]}.compact
           remove_idx = []
           query_descriptors.each_with_index do |v,i|
             remove_idx << i if v == "NA"
@@ -172,13 +173,9 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
         
         R.eval "data <- #{r_data_frame}"
         R.assign "features", training_features
-        R.eval "names(data) <- append(c('activities'),features)" #
-        #begin
+        begin
+          R.eval "names(data) <- append(c('activities'),features)" #
           R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)"
-        #rescue 
-          #return nil
-        #end
-        p query_feature_values
         R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
         R.eval "names(fingerprint) <- features" 
         R.eval "prediction <- predict(model,fingerprint)"
@@ -187,6 +184,9 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
           :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f,
           :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f,
         }
+        rescue 
+          return nil
+        end
       end
 
     end
-- 
cgit v1.2.3


From 79238bddb59607aa9f759caa9e3c8db176709703 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 28 Apr 2016 12:19:48 +0200
Subject: compound validations fixed

---
 lib/regression.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index 3a59c14..694a2dc 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -85,7 +85,7 @@ module OpenTox
 
         activities = []
         weights = []
-        pc_ids = neighbors.collect{|n| n.physchem_descriptors.keys}.flatten.uniq
+        pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq
         data_frame = []
         data_frame[0] = []
         
-- 
cgit v1.2.3


From 05386e748270c337c66f6f379317ea4b25905236 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Wed, 4 May 2016 19:24:42 +0200
Subject: first reasonable results for nanoparticle crossvalidation

---
 lib/regression.rb | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index 694a2dc..d2c4e91 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -12,16 +12,15 @@ module OpenTox
           sim ||= 1 # TODO: sim f nanoparticles
           if row["toxicities"][params[:prediction_feature_id].to_s]
             row["toxicities"][params[:prediction_feature_id].to_s].each do |act|
-              weighted_sum += sim*Math.log10(act)
+              weighted_sum += sim*act
               sim_sum += sim
             end
           end
         end
-        sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
+        sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum
         {:value => prediction}
       end
 
-      # TODO explicit neighbors, also for physchem
       def self.local_fingerprint_regression  compound, params, method='pls'#, method_params="sigma=0.05"
         neighbors = params[:neighbors]
         return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
@@ -35,7 +34,7 @@ module OpenTox
           fingerprint = neighbor.fingerprint
           if row["toxicities"][params[:prediction_feature_id].to_s]
             row["toxicities"][params[:prediction_feature_id].to_s].each do |act|
-              activities << Math.log10(act)
+              activities << act
               weights << row["tanimoto"]
               fingerprint_ids.each_with_index do |id,j|
                 fingerprints[id] ||= []
@@ -67,9 +66,9 @@ module OpenTox
             prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
             return prediction
           else
-            prediction[:prediction_interval] = [10**(prediction[:value]-1.96*prediction[:rmse]), 10**(prediction[:value]+1.96*prediction[:rmse])]
-            prediction[:value] = 10**prediction[:value]
-            prediction[:rmse] = 10**prediction[:rmse]
+            prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]]
+            prediction[:value] = prediction[:value]
+            prediction[:rmse] = prediction[:rmse]
             prediction
           end
         end
@@ -96,7 +95,7 @@ module OpenTox
             n["tanimoto"] ?  weights << n["tanimoto"] : weights << 1.0 # TODO cosine ?
             neighbor.physchem_descriptors.each do |pid,values| 
               values.uniq!
-              warn "More than one value for #{Feature.find(pid).name}: #{values.join(', ')}" unless values.size == 1
+              warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1
               j = pc_ids.index(pid)+1
               data_frame[j] ||= []
               data_frame[j][i] = values.for_R
@@ -121,7 +120,9 @@ module OpenTox
           result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
           return result
         else
-          query_descriptors = pc_ids.collect{|i| compound.physchem_descriptors[i].for_R if compound.physchem_descriptors[i]}.compact
+          query_descriptors = pc_ids.collect do |i|
+            compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA"
+          end
           remove_idx = []
           query_descriptors.each_with_index do |v,i|
             remove_idx << i if v == "NA"
@@ -137,7 +138,6 @@ module OpenTox
             prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
             return prediction
           else
-            prediction[:value] = 10**prediction[:value]
             prediction
           end
         end
@@ -148,6 +148,7 @@ module OpenTox
         R.assign "weights", training_weights
         r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
 rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
+=begin
         File.open("tmp.R","w+"){|f|
           f.puts "suppressPackageStartupMessages({
   library(iterators,lib=\"#{rlib}\")
@@ -170,20 +171,21 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
           f.puts "names(fingerprint) <- features" 
           f.puts "prediction <- predict(model,fingerprint)"
         }
+=end
         
         R.eval "data <- #{r_data_frame}"
         R.assign "features", training_features
         begin
           R.eval "names(data) <- append(c('activities'),features)" #
           R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)"
-        R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
-        R.eval "names(fingerprint) <- features" 
-        R.eval "prediction <- predict(model,fingerprint)"
-        {
-          :value => R.eval("prediction").to_f,
-          :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f,
-          :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f,
-        }
+          R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
+          R.eval "names(fingerprint) <- features" 
+          R.eval "prediction <- predict(model,fingerprint)"
+          {
+            :value => R.eval("prediction").to_f,
+            :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f,
+            :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f,
+          }
         rescue 
           return nil
         end
-- 
cgit v1.2.3


From ab7b37541b4f8a762be737009631d3eefd898b4a Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 5 May 2016 16:14:02 +0200
Subject: ambit mirror, import from mirrored json, proteomics import

---
 lib/regression.rb | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index d2c4e91..fe45f99 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -84,7 +84,7 @@ module OpenTox
 
         activities = []
         weights = []
-        pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq
+        pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem.keys}.flatten.uniq
         data_frame = []
         data_frame[0] = []
         
@@ -93,7 +93,7 @@ module OpenTox
           n["toxicities"][params[:prediction_feature_id].to_s].each do |act|
             data_frame[0][i] = act
             n["tanimoto"] ?  weights << n["tanimoto"] : weights << 1.0 # TODO cosine ?
-            neighbor.physchem_descriptors.each do |pid,values| 
+            neighbor.physchem.each do |pid,values| 
               values.uniq!
               warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1
               j = pc_ids.index(pid)+1
@@ -121,7 +121,7 @@ module OpenTox
           return result
         else
           query_descriptors = pc_ids.collect do |i|
-            compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA"
+            compound.physchem[i] ? compound.physchem_descriptors[i].for_R : "NA"
           end
           remove_idx = []
           query_descriptors.each_with_index do |v,i|
-- 
cgit v1.2.3


From 51f57e2858b60bed74ebcc97189b2188c900c283 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Fri, 6 May 2016 12:49:28 +0200
Subject: dataset tests cleanup

---
 lib/regression.rb | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index fe45f99..d2c4e91 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -84,7 +84,7 @@ module OpenTox
 
         activities = []
         weights = []
-        pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem.keys}.flatten.uniq
+        pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq
         data_frame = []
         data_frame[0] = []
         
@@ -93,7 +93,7 @@ module OpenTox
           n["toxicities"][params[:prediction_feature_id].to_s].each do |act|
             data_frame[0][i] = act
             n["tanimoto"] ?  weights << n["tanimoto"] : weights << 1.0 # TODO cosine ?
-            neighbor.physchem.each do |pid,values| 
+            neighbor.physchem_descriptors.each do |pid,values| 
               values.uniq!
               warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1
               j = pc_ids.index(pid)+1
@@ -121,7 +121,7 @@ module OpenTox
           return result
         else
           query_descriptors = pc_ids.collect do |i|
-            compound.physchem[i] ? compound.physchem_descriptors[i].for_R : "NA"
+            compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA"
           end
           remove_idx = []
           query_descriptors.each_with_index do |v,i|
-- 
cgit v1.2.3


From 06fc914653face2c58fd4e6c47161cb03e217582 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Sun, 8 May 2016 12:22:58 +0200
Subject: default validations fixed

---
 lib/regression.rb | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index d2c4e91..13e1380 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -11,7 +11,7 @@ module OpenTox
           sim = row["tanimoto"]
           sim ||= 1 # TODO: sim f nanoparticles
           if row["toxicities"][params[:prediction_feature_id].to_s]
-            row["toxicities"][params[:prediction_feature_id].to_s].each do |act|
+            row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act|
               weighted_sum += sim*act
               sim_sum += sim
             end
@@ -33,7 +33,7 @@ module OpenTox
           neighbor = Compound.find row["_id"]
           fingerprint = neighbor.fingerprint
           if row["toxicities"][params[:prediction_feature_id].to_s]
-            row["toxicities"][params[:prediction_feature_id].to_s].each do |act|
+            row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act|
               activities << act
               weights << row["tanimoto"]
               fingerprint_ids.each_with_index do |id,j|
@@ -77,10 +77,10 @@ module OpenTox
 
       def self.local_physchem_regression  compound, params, method="pls"#, method_params="ncomp = 4"
 
-        neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s]} # use only neighbors with measured activities
+        neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s] and n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s]} # use only neighbors with measured activities
 
         return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
-        return {:value => neighbors.first["toxicities"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
+        return {:value => neighbors.first["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
 
         activities = []
         weights = []
@@ -90,7 +90,7 @@ module OpenTox
         
         neighbors.each_with_index do |n,i|
           neighbor = Substance.find(n["_id"])
-          n["toxicities"][params[:prediction_feature_id].to_s].each do |act|
+          n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act|
             data_frame[0][i] = act
             n["tanimoto"] ?  weights << n["tanimoto"] : weights << 1.0 # TODO cosine ?
             neighbor.physchem_descriptors.each do |pid,values| 
-- 
cgit v1.2.3


From 7794086d367fb256c3673d7578b23ec2fb83e6ed Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Mon, 9 May 2016 14:05:29 +0200
Subject: physchem crossvalidation fixed

---
 lib/regression.rb | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index 13e1380..b8a7e5f 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -80,7 +80,7 @@ module OpenTox
         neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s] and n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s]} # use only neighbors with measured activities
 
         return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
-        return {:value => neighbors.first["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
+        return {:value => neighbors.first["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].median, :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
 
         activities = []
         weights = []
@@ -94,6 +94,7 @@ module OpenTox
             data_frame[0][i] = act
             n["tanimoto"] ?  weights << n["tanimoto"] : weights << 1.0 # TODO cosine ?
             neighbor.physchem_descriptors.each do |pid,values| 
+              values = [values] if values.is_a? Float
               values.uniq!
               warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1
               j = pc_ids.index(pid)+1
-- 
cgit v1.2.3


From 611bac891177f8d9185d45486dd574b6ef4d1912 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Mon, 9 May 2016 15:11:46 +0200
Subject: nanoparticle models fixed

---
 lib/regression.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index b8a7e5f..691f903 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -10,7 +10,7 @@ module OpenTox
         neighbors.each do |row|
           sim = row["tanimoto"]
           sim ||= 1 # TODO: sim f nanoparticles
-          if row["toxicities"][params[:prediction_feature_id].to_s]
+          if row["toxicities"][params[:prediction_feature_id].to_s] and row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s]
             row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act|
               weighted_sum += sim*act
               sim_sum += sim
-- 
cgit v1.2.3


From c1be8fe66f640d44dbbc9bfe5212733994bfb9c5 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Mon, 9 May 2016 15:44:29 +0200
Subject: physchem crossvalidation fixed, test_compound_descriptor_parameters
 assertions fixed

---
 lib/regression.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index 691f903..2eaae73 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -94,7 +94,7 @@ module OpenTox
             data_frame[0][i] = act
             n["tanimoto"] ?  weights << n["tanimoto"] : weights << 1.0 # TODO cosine ?
             neighbor.physchem_descriptors.each do |pid,values| 
-              values = [values] if values.is_a? Float
+              values = [values] unless values.is_a? Array
               values.uniq!
               warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1
               j = pc_ids.index(pid)+1
-- 
cgit v1.2.3


From b8bb12c8a163c238d7d4387c1914e2100bb660df Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 12 May 2016 15:23:01 +0200
Subject: enm study import fixed

---
 lib/regression.rb | 102 +++++++++++++++++++++++++-----------------------------
 1 file changed, 47 insertions(+), 55 deletions(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index 2eaae73..9d305a6 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -3,49 +3,43 @@ module OpenTox
     
     class Regression
 
-      def self.local_weighted_average compound, params
+      def self.local_weighted_average substance, neighbors
         weighted_sum = 0.0
         sim_sum = 0.0
-        neighbors = params[:neighbors]
-        neighbors.each do |row|
-          sim = row["tanimoto"]
-          sim ||= 1 # TODO: sim f nanoparticles
-          if row["toxicities"][params[:prediction_feature_id].to_s] and row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s]
-            row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act|
-              weighted_sum += sim*act
-              sim_sum += sim
-            end
-          end
+        neighbors.each do |neighbor|
+          sim = neighbor["similarity"]
+          activities = neighbor["toxicities"]
+          activities.each do |act|
+            weighted_sum += sim*act
+            sim_sum += sim
+          end if activities
         end
         sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum
         {:value => prediction}
       end
 
-      def self.local_fingerprint_regression  compound, params, method='pls'#, method_params="sigma=0.05"
-        neighbors = params[:neighbors]
-        return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
-        activities = []
+      def self.local_fingerprint_regression substance, neighbors, method='pls'#, method_params="sigma=0.05"
+        values = []
         fingerprints = {}
         weights = []
-        fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort
-        
-        neighbors.each_with_index do |row,i|
-          neighbor = Compound.find row["_id"]
-          fingerprint = neighbor.fingerprint
-          if row["toxicities"][params[:prediction_feature_id].to_s]
-            row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act|
-              activities << act
-              weights << row["tanimoto"]
-              fingerprint_ids.each_with_index do |id,j|
-                fingerprints[id] ||= []
-                fingerprints[id] << fingerprint.include?(id) 
-              end
+        fingerprint_ids = neighbors.collect{|n| Compound.find(n["_id"]).fingerprint}.flatten.uniq.sort
+
+        neighbors.each do |n|
+          fingerprint = Substance.find(n["_id"]).fingerprint
+          activities = n["toxicities"]
+          activities.each do |act|
+            values << act
+            weights << n["similarity"]
+            fingerprint_ids.each do |id|
+              fingerprints[id] ||= []
+              fingerprints[id] << fingerprint.include?(id) 
             end
-          end
+          end if activities
         end
 
         variables = []
-        data_frame = [activities]
+        data_frame = [values]
+
         fingerprints.each do |k,v| 
           unless v.uniq.size == 1
             data_frame << v.collect{|m| m ? "T" : "F"}
@@ -54,17 +48,16 @@ module OpenTox
         end
 
         if variables.empty?
-            result = local_weighted_average(compound, params)
-            result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
-            return result
-
+          prediction = local_weighted_average substance, neighbors
+          prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
+          prediction
         else
-          compound_features = variables.collect{|f| compound.fingerprint.include?(f) ? "T" : "F"} 
-          prediction = r_model_prediction method, data_frame, variables, weights, compound_features
+          substance_features = variables.collect{|f| substance.fingerprint.include?(f) ? "T" : "F"} 
+          prediction = r_model_prediction method, data_frame, variables, weights, substance_features
           if prediction.nil? or prediction[:value].nil?
-            prediction = local_weighted_average(compound, params)
-            prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
-            return prediction
+            prediction = local_weighted_average substance, neighbors
+            prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances."
+            prediction
           else
             prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]]
             prediction[:value] = prediction[:value]
@@ -75,13 +68,10 @@ module OpenTox
       
       end
 
-      def self.local_physchem_regression  compound, params, method="pls"#, method_params="ncomp = 4"
-
-        neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s] and n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s]} # use only neighbors with measured activities
-
-        return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
-        return {:value => neighbors.first["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].median, :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
+      #def self.local_physchem_regression(substance:, neighbors:, feature_id:, dataset_id:, method: 'pls')#, method_params="ncomp = 4"
+      def self.local_physchem_regression substance, neighbors, method='pls' #, method_params="ncomp = 4"
 
+        #dataset = Dataset.find dataset_id
         activities = []
         weights = []
         pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq
@@ -90,9 +80,11 @@ module OpenTox
         
         neighbors.each_with_index do |n,i|
           neighbor = Substance.find(n["_id"])
-          n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act|
+          activities = neighbor["toxicities"]
+          activities.each do |act|
             data_frame[0][i] = act
-            n["tanimoto"] ?  weights << n["tanimoto"] : weights << 1.0 # TODO cosine ?
+            # TODO: update with cosine similarity for physchem
+            weights << n["similarity"]
             neighbor.physchem_descriptors.each do |pid,values| 
               values = [values] unless values.is_a? Array
               values.uniq!
@@ -101,7 +93,7 @@ module OpenTox
               data_frame[j] ||= []
               data_frame[j][i] = values.for_R
             end
-          end
+          end if activities
           (0..pc_ids.size+1).each do |j| # for R: fill empty values with NA
             data_frame[j] ||= []
             data_frame[j][i] ||= "NA"
@@ -117,12 +109,12 @@ module OpenTox
         end
 
         if pc_ids.empty?
-          result = local_weighted_average(compound, params)
-          result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
-          return result
+          prediction = local_weighted_average substance, neighbors
+          prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
+          prediction
         else
           query_descriptors = pc_ids.collect do |i|
-            compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA"
+            substance.physchem_descriptors[i] ? substance.physchem_descriptors[i].for_R : "NA"
           end
           remove_idx = []
           query_descriptors.each_with_index do |v,i|
@@ -135,9 +127,9 @@ module OpenTox
           end
           prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors
           if prediction.nil?
-            prediction = local_weighted_average(compound, params)
-            prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
-            return prediction
+            prediction = local_weighted_average substance, neighbors
+            prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances."
+            prediction
           else
             prediction
           end
-- 
cgit v1.2.3


From f46ba3b7262f5b551c81fc9396c5b7f0cac7f030 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Fri, 27 May 2016 19:16:16 +0200
Subject: first correlation of nanoparticle predictions

---
 lib/regression.rb | 2 --
 1 file changed, 2 deletions(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index 9d305a6..6487557 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -71,7 +71,6 @@ module OpenTox
       #def self.local_physchem_regression(substance:, neighbors:, feature_id:, dataset_id:, method: 'pls')#, method_params="ncomp = 4"
       def self.local_physchem_regression substance, neighbors, method='pls' #, method_params="ncomp = 4"
 
-        #dataset = Dataset.find dataset_id
         activities = []
         weights = []
         pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq
@@ -83,7 +82,6 @@ module OpenTox
           activities = neighbor["toxicities"]
           activities.each do |act|
             data_frame[0][i] = act
-            # TODO: update with cosine similarity for physchem
             weights << n["similarity"]
             neighbor.physchem_descriptors.each do |pid,values| 
               values = [values] unless values.is_a? Array
-- 
cgit v1.2.3


From b515a0cfedb887a2af753db6e4a08ae1af430cad Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Tue, 31 May 2016 18:08:08 +0200
Subject: cleanup of validation modules/classes

---
 lib/regression.rb | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index 6487557..cffcbbf 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -8,7 +8,7 @@ module OpenTox
         sim_sum = 0.0
         neighbors.each do |neighbor|
           sim = neighbor["similarity"]
-          activities = neighbor["toxicities"]
+          activities = neighbor["measurements"]
           activities.each do |act|
             weighted_sum += sim*act
             sim_sum += sim
@@ -26,7 +26,7 @@ module OpenTox
 
         neighbors.each do |n|
           fingerprint = Substance.find(n["_id"]).fingerprint
-          activities = n["toxicities"]
+          activities = n["measurements"]
           activities.each do |act|
             values << act
             weights << n["similarity"]
@@ -79,7 +79,7 @@ module OpenTox
         
         neighbors.each_with_index do |n,i|
           neighbor = Substance.find(n["_id"])
-          activities = neighbor["toxicities"]
+          activities = neighbor["measurements"]
           activities.each do |act|
             data_frame[0][i] = act
             weights << n["similarity"]
-- 
cgit v1.2.3


From 85f2308c101b4778508c2d767e08af4cfd671b7b Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 2 Jun 2016 12:22:39 +0200
Subject: local pls regression for nanoparticles

---
 lib/regression.rb | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index cffcbbf..5028c78 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -73,23 +73,19 @@ module OpenTox
 
         activities = []
         weights = []
-        pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq
+        pc_ids = neighbors.collect{|n| n["common_descriptors"].collect{|d| d[:id]}}.flatten.uniq.sort
         data_frame = []
         data_frame[0] = []
         
         neighbors.each_with_index do |n,i|
-          neighbor = Substance.find(n["_id"])
-          activities = neighbor["measurements"]
+          activities = n["measurements"]
           activities.each do |act|
             data_frame[0][i] = act
             weights << n["similarity"]
-            neighbor.physchem_descriptors.each do |pid,values| 
-              values = [values] unless values.is_a? Array
-              values.uniq!
-              warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1
-              j = pc_ids.index(pid)+1
+            n["common_descriptors"].each do |d| 
+              j = pc_ids.index(d[:id])+1
               data_frame[j] ||= []
-              data_frame[j][i] = values.for_R
+              data_frame[j][i] = d[:scaled_value]
             end
           end if activities
           (0..pc_ids.size+1).each do |j| # for R: fill empty values with NA
@@ -97,10 +93,12 @@ module OpenTox
             data_frame[j][i] ||= "NA"
           end
         end
+
         remove_idx = []
         data_frame.each_with_index do |r,i|
           remove_idx << i if r.uniq.size == 1 # remove properties with a single value
         end
+
         remove_idx.reverse.each do |i|
           data_frame.delete_at i
           pc_ids.delete_at i
@@ -112,7 +110,7 @@ module OpenTox
           prediction
         else
           query_descriptors = pc_ids.collect do |i|
-            substance.physchem_descriptors[i] ? substance.physchem_descriptors[i].for_R : "NA"
+            substance.scaled_values[i] ? substance.scaled_values[i] : "NA"
           end
           remove_idx = []
           query_descriptors.each_with_index do |v,i|
@@ -127,10 +125,9 @@ module OpenTox
           if prediction.nil?
             prediction = local_weighted_average substance, neighbors
             prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances."
-            prediction
-          else
-            prediction
           end
+          p prediction
+          prediction
         end
       
       end
@@ -172,10 +169,15 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
           R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
           R.eval "names(fingerprint) <- features" 
           R.eval "prediction <- predict(model,fingerprint)"
+          value = R.eval("prediction").to_f
+          rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f
+          r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f
+          prediction_interval = value-1.96*rmse, value+1.96*rmse
           {
-            :value => R.eval("prediction").to_f,
-            :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f,
-            :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f,
+            :value => value,
+            :rmse => rmse,
+            :r_squared => r_squared,
+            :prediction_interval => prediction_interval
           }
         rescue 
           return nil
-- 
cgit v1.2.3


From eec5bddbd35c9ecee8021128508d8718bccb4fe3 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 2 Jun 2016 17:54:48 +0200
Subject: local pls regression for nanoparticle proteomics

---
 lib/regression.rb | 41 +++++++++++++++++++++++++----------------
 1 file changed, 25 insertions(+), 16 deletions(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index 5028c78..b9067c6 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -88,35 +88,42 @@ module OpenTox
               data_frame[j][i] = d[:scaled_value]
             end
           end if activities
-          (0..pc_ids.size+1).each do |j| # for R: fill empty values with NA
+          #(0..pc_ids.size+1).each do |j| # for R: fill empty values with NA
+          (0..pc_ids.size).each do |j| # for R: fill empty values with NA
             data_frame[j] ||= []
             data_frame[j][i] ||= "NA"
           end
         end
 
-        remove_idx = []
-        data_frame.each_with_index do |r,i|
-          remove_idx << i if r.uniq.size == 1 # remove properties with a single value
-        end
+        #remove_idx = []
+        #data_frame.each_with_index do |r,i|
+          #remove_idx << i if r.uniq.size == 1 # remove properties with a single value TODO: don't break R names assignment
+        #end
 
-        remove_idx.reverse.each do |i|
-          data_frame.delete_at i
-          pc_ids.delete_at i
-        end
+        #p data_frame.size
+        #p pc_ids.size
+        #data_frame.delete_if.with_index { |_, index| remove_idx.include? index }
+        #pc_ids.delete_if.with_index { |_, index| remove_idx.include? index-1 }
+        #remove_idx.sort.reverse.each do |i|
+          #p i
+          #data_frame.delete_at i
+          #pc_ids.delete_at i
+        #end
+        #p data_frame.size
+        #p pc_ids.size
 
         if pc_ids.empty?
           prediction = local_weighted_average substance, neighbors
           prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
           prediction
         else
-          query_descriptors = pc_ids.collect do |i|
-            substance.scaled_values[i] ? substance.scaled_values[i] : "NA"
-          end
+          query_descriptors = pc_ids.collect { |i| substance.scaled_values[i] }
           remove_idx = []
           query_descriptors.each_with_index do |v,i|
-            remove_idx << i if v == "NA"
+            #remove_idx << i if v == "NA"
+            remove_idx << i unless v
           end
-          remove_idx.reverse.each do |i|
+          remove_idx.sort.reverse.each do |i|
             data_frame.delete_at i
             pc_ids.delete_at i
             query_descriptors.delete_at i
@@ -135,8 +142,9 @@ module OpenTox
       def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values
         R.assign "weights", training_weights
         r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
-rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
 =begin
+=end
+rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
         File.open("tmp.R","w+"){|f|
           f.puts "suppressPackageStartupMessages({
   library(iterators,lib=\"#{rlib}\")
@@ -159,10 +167,11 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
           f.puts "names(fingerprint) <- features" 
           f.puts "prediction <- predict(model,fingerprint)"
         }
-=end
         
         R.eval "data <- #{r_data_frame}"
         R.assign "features", training_features
+        p training_features.size
+        p R.eval("names(data)").to_ruby.size
         begin
           R.eval "names(data) <- append(c('activities'),features)" #
           R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)"
-- 
cgit v1.2.3


From 290c7f86950c4051d018b8019ff4e72ec406c58c Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Fri, 3 Jun 2016 19:15:36 +0200
Subject: random forest regression

---
 lib/regression.rb | 63 +++++++++++++++++++++++++------------------------------
 1 file changed, 28 insertions(+), 35 deletions(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index b9067c6..c4c83d2 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -3,7 +3,7 @@ module OpenTox
     
     class Regression
 
-      def self.local_weighted_average substance, neighbors
+      def self.local_weighted_average substance:, neighbors:
         weighted_sum = 0.0
         sim_sum = 0.0
         neighbors.each do |neighbor|
@@ -18,7 +18,7 @@ module OpenTox
         {:value => prediction}
       end
 
-      def self.local_fingerprint_regression substance, neighbors, method='pls'#, method_params="sigma=0.05"
+      def self.local_fingerprint_regression substance:, neighbors:, method: pls#, method_params="sigma=0.05"
         values = []
         fingerprints = {}
         weights = []
@@ -68,8 +68,7 @@ module OpenTox
       
       end
 
-      #def self.local_physchem_regression(substance:, neighbors:, feature_id:, dataset_id:, method: 'pls')#, method_params="ncomp = 4"
-      def self.local_physchem_regression substance, neighbors, method='pls' #, method_params="ncomp = 4"
+      def self.local_physchem_regression substance:, neighbors:, method: pls
 
         activities = []
         weights = []
@@ -88,46 +87,39 @@ module OpenTox
               data_frame[j][i] = d[:scaled_value]
             end
           end if activities
-          #(0..pc_ids.size+1).each do |j| # for R: fill empty values with NA
           (0..pc_ids.size).each do |j| # for R: fill empty values with NA
             data_frame[j] ||= []
             data_frame[j][i] ||= "NA"
           end
         end
 
-        #remove_idx = []
-        #data_frame.each_with_index do |r,i|
-          #remove_idx << i if r.uniq.size == 1 # remove properties with a single value TODO: don't break R names assignment
-        #end
-
-        #p data_frame.size
-        #p pc_ids.size
-        #data_frame.delete_if.with_index { |_, index| remove_idx.include? index }
-        #pc_ids.delete_if.with_index { |_, index| remove_idx.include? index-1 }
-        #remove_idx.sort.reverse.each do |i|
-          #p i
-          #data_frame.delete_at i
-          #pc_ids.delete_at i
-        #end
-        #p data_frame.size
-        #p pc_ids.size
+        data_frame = data_frame.each_with_index.collect do |r,i|
+          if r.uniq.size == 1 # remove properties with a single value 
+            r = nil
+            pc_ids[i-1] = nil # data_frame frame has additional activity entry
+          end
+          r
+        end
+        data_frame.compact!
+        pc_ids.compact!
 
         if pc_ids.empty?
           prediction = local_weighted_average substance, neighbors
-          prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
+          prediction[:warning] = "No relevant variables for regression model. Using weighted average of similar substances."
           prediction
         else
           query_descriptors = pc_ids.collect { |i| substance.scaled_values[i] }
-          remove_idx = []
-          query_descriptors.each_with_index do |v,i|
-            #remove_idx << i if v == "NA"
-            remove_idx << i unless v
-          end
-          remove_idx.sort.reverse.each do |i|
-            data_frame.delete_at i
-            pc_ids.delete_at i
-            query_descriptors.delete_at i
+          query_descriptors = query_descriptors.each_with_index.collect do |v,i|
+            unless v
+              v = nil
+              data_frame[i] = nil
+              pc_ids[i] = nil
+            end
+            v
           end
+          query_descriptors.compact!
+          data_frame.compact!
+          pc_ids.compact!
           prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors
           if prediction.nil?
             prediction = local_weighted_average substance, neighbors
@@ -143,7 +135,6 @@ module OpenTox
         R.assign "weights", training_weights
         r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
 =begin
-=end
 rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
         File.open("tmp.R","w+"){|f|
           f.puts "suppressPackageStartupMessages({
@@ -162,19 +153,21 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
           f.puts "weights <- c(#{training_weights.join(', ')})"
           f.puts "features <- c(#{training_features.join(', ')})"
           f.puts "names(data) <- append(c('activities'),features)" #
+          f.puts "ctrl <- rfeControl(functions = #{method}, method = 'repeatedcv', repeats = 5, verbose = T)"
+          f.puts "lmProfile <- rfe(activities ~ ., data = data, rfeControl = ctrl)"
+
           f.puts "model <- train(activities ~ ., data = data, method = '#{method}')"
           f.puts "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
           f.puts "names(fingerprint) <- features" 
           f.puts "prediction <- predict(model,fingerprint)"
         }
+=end
         
         R.eval "data <- #{r_data_frame}"
         R.assign "features", training_features
-        p training_features.size
-        p R.eval("names(data)").to_ruby.size
         begin
           R.eval "names(data) <- append(c('activities'),features)" #
-          R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)"
+          R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)"
           R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
           R.eval "names(fingerprint) <- features" 
           R.eval "prediction <- predict(model,fingerprint)"
-- 
cgit v1.2.3


From f7e87b45f15083e5fcdea64821f06ed93ece4c4e Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Tue, 7 Jun 2016 18:07:28 +0200
Subject: (repeated)crossvalidation plots

---
 lib/regression.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index c4c83d2..51317ac 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -122,7 +122,7 @@ module OpenTox
           pc_ids.compact!
           prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors
           if prediction.nil?
-            prediction = local_weighted_average substance, neighbors
+            prediction = local_weighted_average(substance: substance, neighbors: neighbors)
             prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances."
           end
           p prediction
-- 
cgit v1.2.3


From 0f31c884d1bcfa448a1bf43a41d8fd6cf88bfc52 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Wed, 8 Jun 2016 18:26:07 +0200
Subject: compound tests fixed

---
 lib/regression.rb | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index 51317ac..d034d0b 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -18,7 +18,7 @@ module OpenTox
         {:value => prediction}
       end
 
-      def self.local_fingerprint_regression substance:, neighbors:, method: pls#, method_params="sigma=0.05"
+      def self.local_fingerprint_regression substance:, neighbors:, method: "pls" #, method_params="sigma=0.05"
         values = []
         fingerprints = {}
         weights = []
@@ -55,7 +55,7 @@ module OpenTox
           substance_features = variables.collect{|f| substance.fingerprint.include?(f) ? "T" : "F"} 
           prediction = r_model_prediction method, data_frame, variables, weights, substance_features
           if prediction.nil? or prediction[:value].nil?
-            prediction = local_weighted_average substance, neighbors
+            prediction = local_weighted_average(substance: substance, neighbors: neighbors)
             prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances."
             prediction
           else
@@ -68,7 +68,7 @@ module OpenTox
       
       end
 
-      def self.local_physchem_regression substance:, neighbors:, method: pls
+      def self.local_physchem_regression substance:, neighbors:, method: "pls"
 
         activities = []
         weights = []
-- 
cgit v1.2.3


From 46c628f1757ce8274a0b277b3ec3306609b38c14 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Mon, 25 Jul 2016 15:53:22 +0200
Subject: local_weighted_average fallback fixed, cv predictions pulled from
 validations to avoid mongo document size errors

---
 lib/regression.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index d034d0b..269a743 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -48,7 +48,7 @@ module OpenTox
         end
 
         if variables.empty?
-          prediction = local_weighted_average substance, neighbors
+          prediction = local_weighted_average(substance: substance, neighbors: neighbors)
           prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
           prediction
         else
@@ -104,7 +104,7 @@ module OpenTox
         pc_ids.compact!
 
         if pc_ids.empty?
-          prediction = local_weighted_average substance, neighbors
+          prediction = local_weighted_average(substance: substance, neighbors: neighbors)
           prediction[:warning] = "No relevant variables for regression model. Using weighted average of similar substances."
           prediction
         else
-- 
cgit v1.2.3


From 5d4e5e463c2b87241bbb56e4658e1e26c0ed084f Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Wed, 5 Oct 2016 13:22:12 +0200
Subject: substance and nanoparticle model creation and predictions

---
 lib/regression.rb | 67 +++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 60 insertions(+), 7 deletions(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index 269a743..396c9e4 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -3,7 +3,8 @@ module OpenTox
     
     class Regression
 
-      def self.local_weighted_average substance:, neighbors:
+      def self.weighted_average descriptors:nil, neighbors:, parameters:nil
+        # TODO: prediction_interval
         weighted_sum = 0.0
         sim_sum = 0.0
         neighbors.each do |neighbor|
@@ -18,7 +19,57 @@ module OpenTox
         {:value => prediction}
       end
 
-      def self.local_fingerprint_regression substance:, neighbors:, method: "pls" #, method_params="sigma=0.05"
+      def self.caret descriptors:, neighbors:, method: "pls", parameters:nil
+        values = []
+        descriptors = {}
+        weights = []
+        descriptor_ids = neighbors.collect{|n| n["descriptors"]}.flatten.uniq.sort
+
+        neighbors.each do |n|
+          activities = n["measurements"]
+          activities.each do |act|
+            values << act
+            weights << n["similarity"]
+            descriptor_ids.each do |id|
+              descriptors[id] ||= []
+              descriptors[id] << n["descriptors"].include?(id) 
+            end
+          end if activities
+        end
+
+        variables = []
+        data_frame = [values]
+
+        descriptors.each do |k,v| 
+          unless v.uniq.size == 1
+            data_frame << v.collect{|m| m ? "T" : "F"}
+            variables << k
+          end
+        end
+
+        if variables.empty?
+          prediction = weighted_average(descriptors: descriptors, neighbors: neighbors)
+          prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
+          prediction
+        else
+          substance_features = variables.collect{|f| descriptors.include?(f) ? "T" : "F"} 
+          #puts data_frame.to_yaml
+          prediction = r_model_prediction method, data_frame, variables, weights, substance_features
+          if prediction.nil? or prediction[:value].nil?
+            prediction = weighted_average(descriptors: descriptors, neighbors: neighbors)
+            prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances."
+            prediction
+          else
+            prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]]
+            prediction[:value] = prediction[:value]
+            prediction[:rmse] = prediction[:rmse]
+            prediction
+          end
+        end
+      
+      end
+
+      def self.fingerprint_regression substance:, neighbors:, method: "pls" #, method_params="sigma=0.05"
         values = []
         fingerprints = {}
         weights = []
@@ -48,14 +99,14 @@ module OpenTox
         end
 
         if variables.empty?
-          prediction = local_weighted_average(substance: substance, neighbors: neighbors)
+          prediction = weighted_average(substance: substance, neighbors: neighbors)
           prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
           prediction
         else
           substance_features = variables.collect{|f| substance.fingerprint.include?(f) ? "T" : "F"} 
           prediction = r_model_prediction method, data_frame, variables, weights, substance_features
           if prediction.nil? or prediction[:value].nil?
-            prediction = local_weighted_average(substance: substance, neighbors: neighbors)
+            prediction = weighted_average(substance: substance, neighbors: neighbors)
             prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances."
             prediction
           else
@@ -68,7 +119,8 @@ module OpenTox
       
       end
 
-      def self.local_physchem_regression substance:, neighbors:, method: "pls"
+=begin
+      def self.physchem_regression substance:, neighbors:, method: "pls"
 
         activities = []
         weights = []
@@ -104,7 +156,7 @@ module OpenTox
         pc_ids.compact!
 
         if pc_ids.empty?
-          prediction = local_weighted_average(substance: substance, neighbors: neighbors)
+          prediction = weighted_average(substance: substance, neighbors: neighbors)
           prediction[:warning] = "No relevant variables for regression model. Using weighted average of similar substances."
           prediction
         else
@@ -122,7 +174,7 @@ module OpenTox
           pc_ids.compact!
           prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors
           if prediction.nil?
-            prediction = local_weighted_average(substance: substance, neighbors: neighbors)
+            prediction = weighted_average(substance: substance, neighbors: neighbors)
             prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances."
           end
           p prediction
@@ -130,6 +182,7 @@ module OpenTox
         end
       
       end
+=end
 
       def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values
         R.assign "weights", training_weights
-- 
cgit v1.2.3


From 4348eec89033e6677c9f628646fc67bd03c73fe6 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 6 Oct 2016 19:14:10 +0200
Subject: nano caret regression fixed

---
 lib/regression.rb | 220 ------------------------------------------------------
 1 file changed, 220 deletions(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index 396c9e4..cf6d9cb 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -19,226 +19,6 @@ module OpenTox
         {:value => prediction}
       end
 
-      def self.caret descriptors:, neighbors:, method: "pls", parameters:nil
-        values = []
-        descriptors = {}
-        weights = []
-        descriptor_ids = neighbors.collect{|n| n["descriptors"]}.flatten.uniq.sort
-
-        neighbors.each do |n|
-          activities = n["measurements"]
-          activities.each do |act|
-            values << act
-            weights << n["similarity"]
-            descriptor_ids.each do |id|
-              descriptors[id] ||= []
-              descriptors[id] << n["descriptors"].include?(id) 
-            end
-          end if activities
-        end
-
-        variables = []
-        data_frame = [values]
-
-        descriptors.each do |k,v| 
-          unless v.uniq.size == 1
-            data_frame << v.collect{|m| m ? "T" : "F"}
-            variables << k
-          end
-        end
-
-        if variables.empty?
-          prediction = weighted_average(descriptors: descriptors, neighbors: neighbors)
-          prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
-          prediction
-        else
-          substance_features = variables.collect{|f| descriptors.include?(f) ? "T" : "F"} 
-          #puts data_frame.to_yaml
-          prediction = r_model_prediction method, data_frame, variables, weights, substance_features
-          if prediction.nil? or prediction[:value].nil?
-            prediction = weighted_average(descriptors: descriptors, neighbors: neighbors)
-            prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances."
-            prediction
-          else
-            prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]]
-            prediction[:value] = prediction[:value]
-            prediction[:rmse] = prediction[:rmse]
-            prediction
-          end
-        end
-      
-      end
-
-      def self.fingerprint_regression substance:, neighbors:, method: "pls" #, method_params="sigma=0.05"
-        values = []
-        fingerprints = {}
-        weights = []
-        fingerprint_ids = neighbors.collect{|n| Compound.find(n["_id"]).fingerprint}.flatten.uniq.sort
-
-        neighbors.each do |n|
-          fingerprint = Substance.find(n["_id"]).fingerprint
-          activities = n["measurements"]
-          activities.each do |act|
-            values << act
-            weights << n["similarity"]
-            fingerprint_ids.each do |id|
-              fingerprints[id] ||= []
-              fingerprints[id] << fingerprint.include?(id) 
-            end
-          end if activities
-        end
-
-        variables = []
-        data_frame = [values]
-
-        fingerprints.each do |k,v| 
-          unless v.uniq.size == 1
-            data_frame << v.collect{|m| m ? "T" : "F"}
-            variables << k
-          end
-        end
-
-        if variables.empty?
-          prediction = weighted_average(substance: substance, neighbors: neighbors)
-          prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
-          prediction
-        else
-          substance_features = variables.collect{|f| substance.fingerprint.include?(f) ? "T" : "F"} 
-          prediction = r_model_prediction method, data_frame, variables, weights, substance_features
-          if prediction.nil? or prediction[:value].nil?
-            prediction = weighted_average(substance: substance, neighbors: neighbors)
-            prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances."
-            prediction
-          else
-            prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]]
-            prediction[:value] = prediction[:value]
-            prediction[:rmse] = prediction[:rmse]
-            prediction
-          end
-        end
-      
-      end
-
-=begin
-      def self.physchem_regression substance:, neighbors:, method: "pls"
-
-        activities = []
-        weights = []
-        pc_ids = neighbors.collect{|n| n["common_descriptors"].collect{|d| d[:id]}}.flatten.uniq.sort
-        data_frame = []
-        data_frame[0] = []
-        
-        neighbors.each_with_index do |n,i|
-          activities = n["measurements"]
-          activities.each do |act|
-            data_frame[0][i] = act
-            weights << n["similarity"]
-            n["common_descriptors"].each do |d| 
-              j = pc_ids.index(d[:id])+1
-              data_frame[j] ||= []
-              data_frame[j][i] = d[:scaled_value]
-            end
-          end if activities
-          (0..pc_ids.size).each do |j| # for R: fill empty values with NA
-            data_frame[j] ||= []
-            data_frame[j][i] ||= "NA"
-          end
-        end
-
-        data_frame = data_frame.each_with_index.collect do |r,i|
-          if r.uniq.size == 1 # remove properties with a single value 
-            r = nil
-            pc_ids[i-1] = nil # data_frame frame has additional activity entry
-          end
-          r
-        end
-        data_frame.compact!
-        pc_ids.compact!
-
-        if pc_ids.empty?
-          prediction = weighted_average(substance: substance, neighbors: neighbors)
-          prediction[:warning] = "No relevant variables for regression model. Using weighted average of similar substances."
-          prediction
-        else
-          query_descriptors = pc_ids.collect { |i| substance.scaled_values[i] }
-          query_descriptors = query_descriptors.each_with_index.collect do |v,i|
-            unless v
-              v = nil
-              data_frame[i] = nil
-              pc_ids[i] = nil
-            end
-            v
-          end
-          query_descriptors.compact!
-          data_frame.compact!
-          pc_ids.compact!
-          prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors
-          if prediction.nil?
-            prediction = weighted_average(substance: substance, neighbors: neighbors)
-            prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances."
-          end
-          p prediction
-          prediction
-        end
-      
-      end
-=end
-
-      def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values
-        R.assign "weights", training_weights
-        r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
-=begin
-rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
-        File.open("tmp.R","w+"){|f|
-          f.puts "suppressPackageStartupMessages({
-  library(iterators,lib=\"#{rlib}\")
-  library(foreach,lib=\"#{rlib}\")
-  library(ggplot2,lib=\"#{rlib}\")
-  library(grid,lib=\"#{rlib}\")
-  library(gridExtra,lib=\"#{rlib}\")
-  library(pls,lib=\"#{rlib}\")
-  library(caret,lib=\"#{rlib}\")
-  library(doMC,lib=\"#{rlib}\")
-  registerDoMC(#{NR_CORES})
-})"
-
-          f.puts "data <- #{r_data_frame}\n"
-          f.puts "weights <- c(#{training_weights.join(', ')})"
-          f.puts "features <- c(#{training_features.join(', ')})"
-          f.puts "names(data) <- append(c('activities'),features)" #
-          f.puts "ctrl <- rfeControl(functions = #{method}, method = 'repeatedcv', repeats = 5, verbose = T)"
-          f.puts "lmProfile <- rfe(activities ~ ., data = data, rfeControl = ctrl)"
-
-          f.puts "model <- train(activities ~ ., data = data, method = '#{method}')"
-          f.puts "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
-          f.puts "names(fingerprint) <- features" 
-          f.puts "prediction <- predict(model,fingerprint)"
-        }
-=end
-        
-        R.eval "data <- #{r_data_frame}"
-        R.assign "features", training_features
-        begin
-          R.eval "names(data) <- append(c('activities'),features)" #
-          R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)"
-          R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
-          R.eval "names(fingerprint) <- features" 
-          R.eval "prediction <- predict(model,fingerprint)"
-          value = R.eval("prediction").to_f
-          rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f
-          r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f
-          prediction_interval = value-1.96*rmse, value+1.96*rmse
-          {
-            :value => value,
-            :rmse => rmse,
-            :r_squared => r_squared,
-            :prediction_interval => prediction_interval
-          }
-        rescue 
-          return nil
-        end
-      end
-
     end
   end
 end
-- 
cgit v1.2.3


From 91787edb3682900bc5a2feeca66e5142f387fcc6 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Fri, 7 Oct 2016 10:25:58 +0200
Subject: unified interface for prediction algorithms

---
 lib/regression.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index cf6d9cb..0e5e06b 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -3,7 +3,7 @@ module OpenTox
     
     class Regression
 
-      def self.weighted_average descriptors:nil, neighbors:, parameters:nil
+      def self.weighted_average descriptors:nil, neighbors:, parameters:nil, method:nil, relevant_features:nil
         # TODO: prediction_interval
         weighted_sum = 0.0
         sim_sum = 0.0
-- 
cgit v1.2.3


From dc4ab1f4e64d738d6c0b70f0b690a2359685080f Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Wed, 12 Oct 2016 21:32:27 +0200
Subject: physchem regression, correlation_filter for fingerprints

---
 lib/regression.rb | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index 0e5e06b..bed6df8 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -3,18 +3,15 @@ module OpenTox
     
     class Regression
 
-      def self.weighted_average descriptors:nil, neighbors:, parameters:nil, method:nil, relevant_features:nil
+      def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables:
+      #def self.weighted_average descriptors:nil, neighbors:, parameters:nil, method:nil, relevant_features:nil
         # TODO: prediction_interval
         weighted_sum = 0.0
         sim_sum = 0.0
-        neighbors.each do |neighbor|
-          sim = neighbor["similarity"]
-          activities = neighbor["measurements"]
-          activities.each do |act|
-            weighted_sum += sim*act
-            sim_sum += sim
-          end if activities
-        end
+        dependent_variables.each_with_index do |v,i|
+          weighted_sum += weights[i]*dependent_variables[i]
+          sim_sum += weights[i]
+        end if dependent_variables
         sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum
         {:value => prediction}
       end
-- 
cgit v1.2.3


From 8d325866dd7cacdd04bd2306a9144a5e7300c7c8 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 13 Oct 2016 10:11:09 +0200
Subject: molecular_weight fixed

---
 lib/regression.rb | 1 -
 1 file changed, 1 deletion(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index bed6df8..d1724fd 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -4,7 +4,6 @@ module OpenTox
     class Regression
 
       def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables:
-      #def self.weighted_average descriptors:nil, neighbors:, parameters:nil, method:nil, relevant_features:nil
         # TODO: prediction_interval
         weighted_sum = 0.0
         sim_sum = 0.0
-- 
cgit v1.2.3


From 09452bba5c407c27721223d126e3f45c12b20a0c Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 13 Oct 2016 22:59:45 +0200
Subject: tests pass

---
 lib/regression.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index d1724fd..3890987 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -3,7 +3,7 @@ module OpenTox
     
     class Regression
 
-      def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables:
+      def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables:nil
         # TODO: prediction_interval
         weighted_sum = 0.0
         sim_sum = 0.0
-- 
cgit v1.2.3