From 4348eec89033e6677c9f628646fc67bd03c73fe6 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 6 Oct 2016 19:14:10 +0200
Subject: nano caret regression fixed

---
 lib/regression.rb | 220 ------------------------------------------------------
 1 file changed, 220 deletions(-)

(limited to 'lib/regression.rb')

diff --git a/lib/regression.rb b/lib/regression.rb
index 396c9e4..cf6d9cb 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -19,226 +19,6 @@ module OpenTox
         {:value => prediction}
       end
 
-      def self.caret descriptors:, neighbors:, method: "pls", parameters:nil
-        values = []
-        descriptors = {}
-        weights = []
-        descriptor_ids = neighbors.collect{|n| n["descriptors"]}.flatten.uniq.sort
-
-        neighbors.each do |n|
-          activities = n["measurements"]
-          activities.each do |act|
-            values << act
-            weights << n["similarity"]
-            descriptor_ids.each do |id|
-              descriptors[id] ||= []
-              descriptors[id] << n["descriptors"].include?(id) 
-            end
-          end if activities
-        end
-
-        variables = []
-        data_frame = [values]
-
-        descriptors.each do |k,v| 
-          unless v.uniq.size == 1
-            data_frame << v.collect{|m| m ? "T" : "F"}
-            variables << k
-          end
-        end
-
-        if variables.empty?
-          prediction = weighted_average(descriptors: descriptors, neighbors: neighbors)
-          prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
-          prediction
-        else
-          substance_features = variables.collect{|f| descriptors.include?(f) ? "T" : "F"} 
-          #puts data_frame.to_yaml
-          prediction = r_model_prediction method, data_frame, variables, weights, substance_features
-          if prediction.nil? or prediction[:value].nil?
-            prediction = weighted_average(descriptors: descriptors, neighbors: neighbors)
-            prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances."
-            prediction
-          else
-            prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]]
-            prediction[:value] = prediction[:value]
-            prediction[:rmse] = prediction[:rmse]
-            prediction
-          end
-        end
-      
-      end
-
-      def self.fingerprint_regression substance:, neighbors:, method: "pls" #, method_params="sigma=0.05"
-        values = []
-        fingerprints = {}
-        weights = []
-        fingerprint_ids = neighbors.collect{|n| Compound.find(n["_id"]).fingerprint}.flatten.uniq.sort
-
-        neighbors.each do |n|
-          fingerprint = Substance.find(n["_id"]).fingerprint
-          activities = n["measurements"]
-          activities.each do |act|
-            values << act
-            weights << n["similarity"]
-            fingerprint_ids.each do |id|
-              fingerprints[id] ||= []
-              fingerprints[id] << fingerprint.include?(id) 
-            end
-          end if activities
-        end
-
-        variables = []
-        data_frame = [values]
-
-        fingerprints.each do |k,v| 
-          unless v.uniq.size == 1
-            data_frame << v.collect{|m| m ? "T" : "F"}
-            variables << k
-          end
-        end
-
-        if variables.empty?
-          prediction = weighted_average(substance: substance, neighbors: neighbors)
-          prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
-          prediction
-        else
-          substance_features = variables.collect{|f| substance.fingerprint.include?(f) ? "T" : "F"} 
-          prediction = r_model_prediction method, data_frame, variables, weights, substance_features
-          if prediction.nil? or prediction[:value].nil?
-            prediction = weighted_average(substance: substance, neighbors: neighbors)
-            prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances."
-            prediction
-          else
-            prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]]
-            prediction[:value] = prediction[:value]
-            prediction[:rmse] = prediction[:rmse]
-            prediction
-          end
-        end
-      
-      end
-
-=begin
-      def self.physchem_regression substance:, neighbors:, method: "pls"
-
-        activities = []
-        weights = []
-        pc_ids = neighbors.collect{|n| n["common_descriptors"].collect{|d| d[:id]}}.flatten.uniq.sort
-        data_frame = []
-        data_frame[0] = []
-        
-        neighbors.each_with_index do |n,i|
-          activities = n["measurements"]
-          activities.each do |act|
-            data_frame[0][i] = act
-            weights << n["similarity"]
-            n["common_descriptors"].each do |d| 
-              j = pc_ids.index(d[:id])+1
-              data_frame[j] ||= []
-              data_frame[j][i] = d[:scaled_value]
-            end
-          end if activities
-          (0..pc_ids.size).each do |j| # for R: fill empty values with NA
-            data_frame[j] ||= []
-            data_frame[j][i] ||= "NA"
-          end
-        end
-
-        data_frame = data_frame.each_with_index.collect do |r,i|
-          if r.uniq.size == 1 # remove properties with a single value 
-            r = nil
-            pc_ids[i-1] = nil # data_frame frame has additional activity entry
-          end
-          r
-        end
-        data_frame.compact!
-        pc_ids.compact!
-
-        if pc_ids.empty?
-          prediction = weighted_average(substance: substance, neighbors: neighbors)
-          prediction[:warning] = "No relevant variables for regression model. Using weighted average of similar substances."
-          prediction
-        else
-          query_descriptors = pc_ids.collect { |i| substance.scaled_values[i] }
-          query_descriptors = query_descriptors.each_with_index.collect do |v,i|
-            unless v
-              v = nil
-              data_frame[i] = nil
-              pc_ids[i] = nil
-            end
-            v
-          end
-          query_descriptors.compact!
-          data_frame.compact!
-          pc_ids.compact!
-          prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors
-          if prediction.nil?
-            prediction = weighted_average(substance: substance, neighbors: neighbors)
-            prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances."
-          end
-          p prediction
-          prediction
-        end
-      
-      end
-=end
-
-      def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values
-        R.assign "weights", training_weights
-        r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
-=begin
-rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
-        File.open("tmp.R","w+"){|f|
-          f.puts "suppressPackageStartupMessages({
-  library(iterators,lib=\"#{rlib}\")
-  library(foreach,lib=\"#{rlib}\")
-  library(ggplot2,lib=\"#{rlib}\")
-  library(grid,lib=\"#{rlib}\")
-  library(gridExtra,lib=\"#{rlib}\")
-  library(pls,lib=\"#{rlib}\")
-  library(caret,lib=\"#{rlib}\")
-  library(doMC,lib=\"#{rlib}\")
-  registerDoMC(#{NR_CORES})
-})"
-
-          f.puts "data <- #{r_data_frame}\n"
-          f.puts "weights <- c(#{training_weights.join(', ')})"
-          f.puts "features <- c(#{training_features.join(', ')})"
-          f.puts "names(data) <- append(c('activities'),features)" #
-          f.puts "ctrl <- rfeControl(functions = #{method}, method = 'repeatedcv', repeats = 5, verbose = T)"
-          f.puts "lmProfile <- rfe(activities ~ ., data = data, rfeControl = ctrl)"
-
-          f.puts "model <- train(activities ~ ., data = data, method = '#{method}')"
-          f.puts "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
-          f.puts "names(fingerprint) <- features" 
-          f.puts "prediction <- predict(model,fingerprint)"
-        }
-=end
-        
-        R.eval "data <- #{r_data_frame}"
-        R.assign "features", training_features
-        begin
-          R.eval "names(data) <- append(c('activities'),features)" #
-          R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)"
-          R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
-          R.eval "names(fingerprint) <- features" 
-          R.eval "prediction <- predict(model,fingerprint)"
-          value = R.eval("prediction").to_f
-          rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f
-          r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f
-          prediction_interval = value-1.96*rmse, value+1.96*rmse
-          {
-            :value => value,
-            :rmse => rmse,
-            :r_squared => r_squared,
-            :prediction_interval => prediction_interval
-          }
-        rescue 
-          return nil
-        end
-      end
-
     end
   end
 end
-- 
cgit v1.2.3