From 290c7f86950c4051d018b8019ff4e72ec406c58c Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 3 Jun 2016 19:15:36 +0200 Subject: random forest regression --- lib/lazar.rb | 2 ++ lib/model.rb | 29 +++++++++++++++---------- lib/regression.rb | 63 +++++++++++++++++++++++++------------------------------ 3 files changed, 48 insertions(+), 46 deletions(-) (limited to 'lib') diff --git a/lib/lazar.rb b/lib/lazar.rb index 1853aba..46605d3 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -57,6 +57,8 @@ suppressPackageStartupMessages({ library(pls,lib=\"#{rlib}\") library(caret,lib=\"#{rlib}\") library(doMC,lib=\"#{rlib}\") + library(randomForest,lib=\"#{rlib}\") + library(plyr,lib=\"#{rlib}\") registerDoMC(#{NR_CORES}) }) " diff --git a/lib/model.rb b/lib/model.rb index 277bca3..0432c56 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -3,6 +3,7 @@ module OpenTox module Model class Lazar + include OpenTox include Mongoid::Document include Mongoid::Timestamps @@ -11,11 +12,15 @@ module OpenTox field :name, type: String field :creator, type: String, default: __FILE__ field :training_dataset_id, type: BSON::ObjectId - field :prediction_algorithm, type: String field :prediction_feature_id, type: BSON::ObjectId + + field :prediction_algorithm, type: String + field :prediction_algorithm_parameters, type: Hash, default: {} + field :neighbor_algorithm, type: String field :neighbor_algorithm_parameters, type: Hash, default: {} field :feature_selection_algorithm, type: String + field :feature_selection_algorithm_parameters, type: Hash, default: {} field :relevant_features, type: Hash # Create a lazar model from a training_dataset and a feature_dataset @@ -35,7 +40,8 @@ module OpenTox save end - def correlation_filter + def correlation_filter + # TODO: speedup, single assignment of all features to R+ parallel computation of significance? self.relevant_features = {} measurements = [] substances = [] @@ -47,6 +53,7 @@ module OpenTox end R.assign "tox", measurements feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq + feature_ids.select!{|fid| Feature.find(fid).category == feature_selection_algorithm_parameters[:category]} if feature_selection_algorithm_parameters[:category] feature_ids.each do |feature_id| feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]} unless feature_values.uniq.size == 1 @@ -68,7 +75,6 @@ module OpenTox end end self.relevant_features = self.relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h - p self.relevant_features end def predict_substance substance @@ -90,14 +96,14 @@ module OpenTox prediction.merge!({:value => nil,:probabilities => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) elsif neighbors.size == 1 value = nil - tox = neighbors.first["measurements"] - if tox.size == 1 # single measurement - value = tox.first + m = neighbors.first["measurements"] + if m.size == 1 # single measurement + value = m.first else # multiple measurement - if tox.collect{|t| t.numeric?}.uniq == [true] # numeric - value = tox.median - elsif tox.uniq.size == 1 # single value - value = tox.first + if m.collect{|t| t.numeric?}.uniq == [true] # numeric + value = m.median + elsif m.uniq.size == 1 # single value + value = m.first else # contradictory results # TODO add majority vote?? end @@ -106,7 +112,8 @@ module OpenTox else # call prediction algorithm klass,method = prediction_algorithm.split('.') - result = Object.const_get(klass).send(method,substance,neighbors) + params = prediction_algorithm_parameters.merge({:substance => substance, :neighbors => neighbors}) + result = Object.const_get(klass).send(method,params) prediction.merge! result prediction[:neighbors] = neighbors prediction[:neighbors] ||= [] diff --git a/lib/regression.rb b/lib/regression.rb index b9067c6..c4c83d2 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -3,7 +3,7 @@ module OpenTox class Regression - def self.local_weighted_average substance, neighbors + def self.local_weighted_average substance:, neighbors: weighted_sum = 0.0 sim_sum = 0.0 neighbors.each do |neighbor| @@ -18,7 +18,7 @@ module OpenTox {:value => prediction} end - def self.local_fingerprint_regression substance, neighbors, method='pls'#, method_params="sigma=0.05" + def self.local_fingerprint_regression substance:, neighbors:, method: pls#, method_params="sigma=0.05" values = [] fingerprints = {} weights = [] @@ -68,8 +68,7 @@ module OpenTox end - #def self.local_physchem_regression(substance:, neighbors:, feature_id:, dataset_id:, method: 'pls')#, method_params="ncomp = 4" - def self.local_physchem_regression substance, neighbors, method='pls' #, method_params="ncomp = 4" + def self.local_physchem_regression substance:, neighbors:, method: pls activities = [] weights = [] @@ -88,46 +87,39 @@ module OpenTox data_frame[j][i] = d[:scaled_value] end end if activities - #(0..pc_ids.size+1).each do |j| # for R: fill empty values with NA (0..pc_ids.size).each do |j| # for R: fill empty values with NA data_frame[j] ||= [] data_frame[j][i] ||= "NA" end end - #remove_idx = [] - #data_frame.each_with_index do |r,i| - #remove_idx << i if r.uniq.size == 1 # remove properties with a single value TODO: don't break R names assignment - #end - - #p data_frame.size - #p pc_ids.size - #data_frame.delete_if.with_index { |_, index| remove_idx.include? index } - #pc_ids.delete_if.with_index { |_, index| remove_idx.include? index-1 } - #remove_idx.sort.reverse.each do |i| - #p i - #data_frame.delete_at i - #pc_ids.delete_at i - #end - #p data_frame.size - #p pc_ids.size + data_frame = data_frame.each_with_index.collect do |r,i| + if r.uniq.size == 1 # remove properties with a single value + r = nil + pc_ids[i-1] = nil # data_frame frame has additional activity entry + end + r + end + data_frame.compact! + pc_ids.compact! if pc_ids.empty? prediction = local_weighted_average substance, neighbors - prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." + prediction[:warning] = "No relevant variables for regression model. Using weighted average of similar substances." prediction else query_descriptors = pc_ids.collect { |i| substance.scaled_values[i] } - remove_idx = [] - query_descriptors.each_with_index do |v,i| - #remove_idx << i if v == "NA" - remove_idx << i unless v - end - remove_idx.sort.reverse.each do |i| - data_frame.delete_at i - pc_ids.delete_at i - query_descriptors.delete_at i + query_descriptors = query_descriptors.each_with_index.collect do |v,i| + unless v + v = nil + data_frame[i] = nil + pc_ids[i] = nil + end + v end + query_descriptors.compact! + data_frame.compact! + pc_ids.compact! prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors if prediction.nil? prediction = local_weighted_average substance, neighbors @@ -143,7 +135,6 @@ module OpenTox R.assign "weights", training_weights r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" =begin -=end rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) File.open("tmp.R","w+"){|f| f.puts "suppressPackageStartupMessages({ @@ -162,19 +153,21 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) f.puts "weights <- c(#{training_weights.join(', ')})" f.puts "features <- c(#{training_features.join(', ')})" f.puts "names(data) <- append(c('activities'),features)" # + f.puts "ctrl <- rfeControl(functions = #{method}, method = 'repeatedcv', repeats = 5, verbose = T)" + f.puts "lmProfile <- rfe(activities ~ ., data = data, rfeControl = ctrl)" + f.puts "model <- train(activities ~ ., data = data, method = '#{method}')" f.puts "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" f.puts "names(fingerprint) <- features" f.puts "prediction <- predict(model,fingerprint)" } +=end R.eval "data <- #{r_data_frame}" R.assign "features", training_features - p training_features.size - p R.eval("names(data)").to_ruby.size begin R.eval "names(data) <- append(c('activities'),features)" # - R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)" + R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)" R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" R.eval "names(fingerprint) <- features" R.eval "prediction <- predict(model,fingerprint)" -- cgit v1.2.3