summary refs log tree commit diff
path: root/lib
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2016-06-03 19:15:36 +0200
committer Christoph Helma <helma@in-silico.ch> 2016-06-03 19:15:36 +0200
commit 290c7f86950c4051d018b8019ff4e72ec406c58c (patch)
tree 561e1d6f5739d21bb43945a56f524a5192bdfd7c /lib
parent 128fd36b2531756c15a93776871e80eb44e524f1 (diff)
random forest regression
Diffstat (limited to 'lib')
-rw-r--r-- lib/lazar.rb 2
-rw-r--r-- lib/model.rb 29
-rw-r--r-- lib/regression.rb 63
3 files changed, 48 insertions, 46 deletions
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 1853aba..46605d3 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -57,6 +57,8 @@ suppressPackageStartupMessages({
library(pls,lib=\"#{rlib}\")
library(caret,lib=\"#{rlib}\")
library(doMC,lib=\"#{rlib}\")
+ library(randomForest,lib=\"#{rlib}\")
+ library(plyr,lib=\"#{rlib}\")
registerDoMC(#{NR_CORES})
})
"
diff --git a/lib/model.rb b/lib/model.rb
index 277bca3..0432c56 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -3,6 +3,7 @@ module OpenTox
module Model
class Lazar
+
include OpenTox
include Mongoid::Document
include Mongoid::Timestamps
@@ -11,11 +12,15 @@ module OpenTox
field :name, type: String
field :creator, type: String, default: __FILE__
field :training_dataset_id, type: BSON::ObjectId
- field :prediction_algorithm, type: String
field :prediction_feature_id, type: BSON::ObjectId
+
+ field :prediction_algorithm, type: String
+ field :prediction_algorithm_parameters, type: Hash, default: {}
+
field :neighbor_algorithm, type: String
field :neighbor_algorithm_parameters, type: Hash, default: {}
field :feature_selection_algorithm, type: String
+ field :feature_selection_algorithm_parameters, type: Hash, default: {}
field :relevant_features, type: Hash
# Create a lazar model from a training_dataset and a feature_dataset
@@ -35,7 +40,8 @@ module OpenTox
save
end
- def correlation_filter
+ def correlation_filter
+ # TODO: speedup, single assignment of all features to R+ parallel computation of significance?
self.relevant_features = {}
measurements = []
substances = []
@@ -47,6 +53,7 @@ module OpenTox
end
R.assign "tox", measurements
feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq
+ feature_ids.select!{|fid| Feature.find(fid).category == feature_selection_algorithm_parameters[:category]} if feature_selection_algorithm_parameters[:category]
feature_ids.each do |feature_id|
feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]}
unless feature_values.uniq.size == 1
@@ -68,7 +75,6 @@ module OpenTox
end
end
self.relevant_features = self.relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h
- p self.relevant_features
end
def predict_substance substance
@@ -90,14 +96,14 @@ module OpenTox
prediction.merge!({:value => nil,:probabilities => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []})
elsif neighbors.size == 1
value = nil
- tox = neighbors.first["measurements"]
- if tox.size == 1 # single measurement
- value = tox.first
+ m = neighbors.first["measurements"]
+ if m.size == 1 # single measurement
+ value = m.first
else # multiple measurement
- if tox.collect{|t| t.numeric?}.uniq == [true] # numeric
- value = tox.median
- elsif tox.uniq.size == 1 # single value
- value = tox.first
+ if m.collect{|t| t.numeric?}.uniq == [true] # numeric
+ value = m.median
+ elsif m.uniq.size == 1 # single value
+ value = m.first
else # contradictory results
# TODO add majority vote??
end
@@ -106,7 +112,8 @@ module OpenTox
else
# call prediction algorithm
klass,method = prediction_algorithm.split('.')
- result = Object.const_get(klass).send(method,substance,neighbors)
+ params = prediction_algorithm_parameters.merge({:substance => substance, :neighbors => neighbors})
+ result = Object.const_get(klass).send(method,params)
prediction.merge! result
prediction[:neighbors] = neighbors
prediction[:neighbors] ||= []
diff --git a/lib/regression.rb b/lib/regression.rb
index b9067c6..c4c83d2 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -3,7 +3,7 @@ module OpenTox
class Regression
- def self.local_weighted_average substance, neighbors
+ def self.local_weighted_average substance:, neighbors:
weighted_sum = 0.0
sim_sum = 0.0
neighbors.each do |neighbor|
@@ -18,7 +18,7 @@ module OpenTox
{:value => prediction}
end
- def self.local_fingerprint_regression substance, neighbors, method='pls'#, method_params="sigma=0.05"
+ def self.local_fingerprint_regression substance:, neighbors:, method: pls#, method_params="sigma=0.05"
values = []
fingerprints = {}
weights = []
@@ -68,8 +68,7 @@ module OpenTox
end
- #def self.local_physchem_regression(substance:, neighbors:, feature_id:, dataset_id:, method: 'pls')#, method_params="ncomp = 4"
- def self.local_physchem_regression substance, neighbors, method='pls' #, method_params="ncomp = 4"
+ def self.local_physchem_regression substance:, neighbors:, method: pls
activities = []
weights = []
@@ -88,46 +87,39 @@ module OpenTox
data_frame[j][i] = d[:scaled_value]
end
end if activities
- #(0..pc_ids.size+1).each do |j| # for R: fill empty values with NA
(0..pc_ids.size).each do |j| # for R: fill empty values with NA
data_frame[j] ||= []
data_frame[j][i] ||= "NA"
end
end
- #remove_idx = []
- #data_frame.each_with_index do |r,i|
- #remove_idx << i if r.uniq.size == 1 # remove properties with a single value TODO: don't break R names assignment
- #end
-
- #p data_frame.size
- #p pc_ids.size
- #data_frame.delete_if.with_index { |_, index| remove_idx.include? index }
- #pc_ids.delete_if.with_index { |_, index| remove_idx.include? index-1 }
- #remove_idx.sort.reverse.each do |i|
- #p i
- #data_frame.delete_at i
- #pc_ids.delete_at i
- #end
- #p data_frame.size
- #p pc_ids.size
+ data_frame = data_frame.each_with_index.collect do |r,i|
+ if r.uniq.size == 1 # remove properties with a single value
+ r = nil
+ pc_ids[i-1] = nil # data_frame frame has additional activity entry
+ end
+ r
+ end
+ data_frame.compact!
+ pc_ids.compact!
if pc_ids.empty?
prediction = local_weighted_average substance, neighbors
- prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
+ prediction[:warning] = "No relevant variables for regression model. Using weighted average of similar substances."
prediction
else
query_descriptors = pc_ids.collect { |i| substance.scaled_values[i] }
- remove_idx = []
- query_descriptors.each_with_index do |v,i|
- #remove_idx << i if v == "NA"
- remove_idx << i unless v
- end
- remove_idx.sort.reverse.each do |i|
- data_frame.delete_at i
- pc_ids.delete_at i
- query_descriptors.delete_at i
+ query_descriptors = query_descriptors.each_with_index.collect do |v,i|
+ unless v
+ v = nil
+ data_frame[i] = nil
+ pc_ids[i] = nil
+ end
+ v
end
+ query_descriptors.compact!
+ data_frame.compact!
+ pc_ids.compact!
prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors
if prediction.nil?
prediction = local_weighted_average substance, neighbors
@@ -143,7 +135,6 @@ module OpenTox
R.assign "weights", training_weights
r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
=begin
-=end
rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
File.open("tmp.R","w+"){|f|
f.puts "suppressPackageStartupMessages({
@@ -162,19 +153,21 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
f.puts "weights <- c(#{training_weights.join(', ')})"
f.puts "features <- c(#{training_features.join(', ')})"
f.puts "names(data) <- append(c('activities'),features)" #
+ f.puts "ctrl <- rfeControl(functions = #{method}, method = 'repeatedcv', repeats = 5, verbose = T)"
+ f.puts "lmProfile <- rfe(activities ~ ., data = data, rfeControl = ctrl)"
+
f.puts "model <- train(activities ~ ., data = data, method = '#{method}')"
f.puts "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
f.puts "names(fingerprint) <- features"
f.puts "prediction <- predict(model,fingerprint)"
}
+=end
R.eval "data <- #{r_data_frame}"
R.assign "features", training_features
- p training_features.size
- p R.eval("names(data)").to_ruby.size
begin
R.eval "names(data) <- append(c('activities'),features)" #
- R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)"
+ R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)"
R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
R.eval "names(fingerprint) <- features"
R.eval "prediction <- predict(model,fingerprint)"