summary refs log tree commit diff
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2016-06-03 19:15:36 +0200
committer Christoph Helma <helma@in-silico.ch> 2016-06-03 19:15:36 +0200
commit 290c7f86950c4051d018b8019ff4e72ec406c58c (patch)
tree 561e1d6f5739d21bb43945a56f524a5192bdfd7c
parent 128fd36b2531756c15a93776871e80eb44e524f1 (diff)
random forest regression
-rw-r--r-- ext/lazar/rinstall.R 1
-rw-r--r-- lib/lazar.rb 2
-rw-r--r-- lib/model.rb 29
-rw-r--r-- lib/regression.rb 63
-rw-r--r-- test/nanoparticles.rb 50
5 files changed, 92 insertions, 53 deletions
diff --git a/ext/lazar/rinstall.R b/ext/lazar/rinstall.R
index 52b0d45..7023f60 100644
--- a/ext/lazar/rinstall.R
+++ b/ext/lazar/rinstall.R
@@ -6,5 +6,6 @@ install.packages("foreach",lib=libdir,repos=repo,dependencies=TRUE);
install.packages("gridExtra",lib=libdir,repos=repo,dependencies=TRUE);
install.packages("ggplot2",lib=libdir,repos=repo,dependencies=TRUE);
install.packages("pls",lib=libdir,repos=repo,dependencies=TRUE);
+install.packages("randomForest",lib=libdir,repos=repo,dependencies=TRUE);
install.packages("caret",lib=libdir,repos=repo,dependencies=TRUE);
install.packages("doMC",lib=libdir,repos=repo,dependencies=TRUE);
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 1853aba..46605d3 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -57,6 +57,8 @@ suppressPackageStartupMessages({
library(pls,lib=\"#{rlib}\")
library(caret,lib=\"#{rlib}\")
library(doMC,lib=\"#{rlib}\")
+ library(randomForest,lib=\"#{rlib}\")
+ library(plyr,lib=\"#{rlib}\")
registerDoMC(#{NR_CORES})
})
"
diff --git a/lib/model.rb b/lib/model.rb
index 277bca3..0432c56 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -3,6 +3,7 @@ module OpenTox
module Model
class Lazar
+
include OpenTox
include Mongoid::Document
include Mongoid::Timestamps
@@ -11,11 +12,15 @@ module OpenTox
field :name, type: String
field :creator, type: String, default: __FILE__
field :training_dataset_id, type: BSON::ObjectId
- field :prediction_algorithm, type: String
field :prediction_feature_id, type: BSON::ObjectId
+
+ field :prediction_algorithm, type: String
+ field :prediction_algorithm_parameters, type: Hash, default: {}
+
field :neighbor_algorithm, type: String
field :neighbor_algorithm_parameters, type: Hash, default: {}
field :feature_selection_algorithm, type: String
+ field :feature_selection_algorithm_parameters, type: Hash, default: {}
field :relevant_features, type: Hash
# Create a lazar model from a training_dataset and a feature_dataset
@@ -35,7 +40,8 @@ module OpenTox
save
end
- def correlation_filter
+ def correlation_filter
+ # TODO: speedup, single assignment of all features to R+ parallel computation of significance?
self.relevant_features = {}
measurements = []
substances = []
@@ -47,6 +53,7 @@ module OpenTox
end
R.assign "tox", measurements
feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq
+ feature_ids.select!{|fid| Feature.find(fid).category == feature_selection_algorithm_parameters[:category]} if feature_selection_algorithm_parameters[:category]
feature_ids.each do |feature_id|
feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]}
unless feature_values.uniq.size == 1
@@ -68,7 +75,6 @@ module OpenTox
end
end
self.relevant_features = self.relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h
- p self.relevant_features
end
def predict_substance substance
@@ -90,14 +96,14 @@ module OpenTox
prediction.merge!({:value => nil,:probabilities => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []})
elsif neighbors.size == 1
value = nil
- tox = neighbors.first["measurements"]
- if tox.size == 1 # single measurement
- value = tox.first
+ m = neighbors.first["measurements"]
+ if m.size == 1 # single measurement
+ value = m.first
else # multiple measurement
- if tox.collect{|t| t.numeric?}.uniq == [true] # numeric
- value = tox.median
- elsif tox.uniq.size == 1 # single value
- value = tox.first
+ if m.collect{|t| t.numeric?}.uniq == [true] # numeric
+ value = m.median
+ elsif m.uniq.size == 1 # single value
+ value = m.first
else # contradictory results
# TODO add majority vote??
end
@@ -106,7 +112,8 @@ module OpenTox
else
# call prediction algorithm
klass,method = prediction_algorithm.split('.')
- result = Object.const_get(klass).send(method,substance,neighbors)
+ params = prediction_algorithm_parameters.merge({:substance => substance, :neighbors => neighbors})
+ result = Object.const_get(klass).send(method,params)
prediction.merge! result
prediction[:neighbors] = neighbors
prediction[:neighbors] ||= []
diff --git a/lib/regression.rb b/lib/regression.rb
index b9067c6..c4c83d2 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -3,7 +3,7 @@ module OpenTox
class Regression
- def self.local_weighted_average substance, neighbors
+ def self.local_weighted_average substance:, neighbors:
weighted_sum = 0.0
sim_sum = 0.0
neighbors.each do |neighbor|
@@ -18,7 +18,7 @@ module OpenTox
{:value => prediction}
end
- def self.local_fingerprint_regression substance, neighbors, method='pls'#, method_params="sigma=0.05"
+ def self.local_fingerprint_regression substance:, neighbors:, method: 'pls'#, method_params="sigma=0.05"
values = []
fingerprints = {}
weights = []
@@ -68,8 +68,7 @@ module OpenTox
end
- #def self.local_physchem_regression(substance:, neighbors:, feature_id:, dataset_id:, method: 'pls')#, method_params="ncomp = 4"
- def self.local_physchem_regression substance, neighbors, method='pls' #, method_params="ncomp = 4"
+ def self.local_physchem_regression substance:, neighbors:, method: 'pls' # method is forwarded to r_model_prediction, e.g. 'pls' or 'rf'
activities = []
weights = []
@@ -88,46 +87,39 @@ module OpenTox
data_frame[j][i] = d[:scaled_value]
end
end if activities
- #(0..pc_ids.size+1).each do |j| # for R: fill empty values with NA
(0..pc_ids.size).each do |j| # for R: fill empty values with NA
data_frame[j] ||= []
data_frame[j][i] ||= "NA"
end
end
- #remove_idx = []
- #data_frame.each_with_index do |r,i|
- #remove_idx << i if r.uniq.size == 1 # remove properties with a single value TODO: don't break R names assignment
- #end
-
- #p data_frame.size
- #p pc_ids.size
- #data_frame.delete_if.with_index { |_, index| remove_idx.include? index }
- #pc_ids.delete_if.with_index { |_, index| remove_idx.include? index-1 }
- #remove_idx.sort.reverse.each do |i|
- #p i
- #data_frame.delete_at i
- #pc_ids.delete_at i
- #end
- #p data_frame.size
- #p pc_ids.size
+ data_frame = data_frame.each_with_index.collect do |r,i|
+ if i > 0 && r.uniq.size == 1 # remove properties with a single value; row 0 (activities) has no pc_ids entry and must be kept
+ r = nil
+ pc_ids[i-1] = nil # data_frame has an additional leading activity entry, hence the -1 offset
+ end
+ r
+ end
+ data_frame.compact!
+ pc_ids.compact!
if pc_ids.empty?
- prediction = local_weighted_average substance, neighbors
- prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
+ prediction = local_weighted_average substance: substance, neighbors: neighbors # local_weighted_average takes keyword arguments only
+ prediction[:warning] = "No relevant variables for regression model. Using weighted average of similar substances."
prediction
else
query_descriptors = pc_ids.collect { |i| substance.scaled_values[i] }
- remove_idx = []
- query_descriptors.each_with_index do |v,i|
- #remove_idx << i if v == "NA"
- remove_idx << i unless v
- end
- remove_idx.sort.reverse.each do |i|
- data_frame.delete_at i
- pc_ids.delete_at i
- query_descriptors.delete_at i
+ query_descriptors = query_descriptors.each_with_index.collect do |v,i|
+ unless v # query substance has no value for this descriptor: drop it everywhere
+ v = nil
+ data_frame[i+1] = nil # +1: data_frame row 0 holds the activities, descriptor rows start at 1
+ pc_ids[i] = nil
+ end
+ v
+ end
+ query_descriptors.compact!
+ data_frame.compact!
+ pc_ids.compact!
prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors
if prediction.nil?
- prediction = local_weighted_average substance, neighbors
+ prediction = local_weighted_average substance: substance, neighbors: neighbors # local_weighted_average takes keyword arguments only
@@ -143,7 +135,6 @@ module OpenTox
R.assign "weights", training_weights
r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
=begin
-=end
rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
File.open("tmp.R","w+"){|f|
f.puts "suppressPackageStartupMessages({
@@ -162,19 +153,21 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
f.puts "weights <- c(#{training_weights.join(', ')})"
f.puts "features <- c(#{training_features.join(', ')})"
f.puts "names(data) <- append(c('activities'),features)" #
+ f.puts "ctrl <- rfeControl(functions = #{method}, method = 'repeatedcv', repeats = 5, verbose = T)"
+ f.puts "lmProfile <- rfe(activities ~ ., data = data, rfeControl = ctrl)"
+
f.puts "model <- train(activities ~ ., data = data, method = '#{method}')"
f.puts "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
f.puts "names(fingerprint) <- features"
f.puts "prediction <- predict(model,fingerprint)"
}
+=end
R.eval "data <- #{r_data_frame}"
R.assign "features", training_features
- p training_features.size
- p R.eval("names(data)").to_ruby.size
begin
R.eval "names(data) <- append(c('activities'),features)" #
- R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)"
+ R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)"
R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
R.eval "names(fingerprint) <- features"
R.eval "prediction <- predict(model,fingerprint)"
diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb
index e5d1973..3e0316f 100644
--- a/test/nanoparticles.rb
+++ b/test/nanoparticles.rb
@@ -23,12 +23,20 @@ class NanoparticleTest < MiniTest::Test
def test_inspect_cv
cv = CrossValidation.all.sort_by{|cv| cv.created_at}.last
- p cv
- p cv.id
- cv.correlation_plot_id = nil
+ #p cv
+ #p cv.id
+ #cv.correlation_plot_id = nil
File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot}
- p cv.statistics
+ #p cv.statistics
#p cv.model.training_dataset.substances.first.physchem_descriptors.keys.collect{|d| Feature.find(d).name}
+ CrossValidation.all.sort_by{|cv| cv.created_at}.reverse.each do |cv|
+ p cv.name
+ p cv.created_at
+ begin
+ p cv.r_squared
+ rescue
+ end
+ end
end
def test_inspect_worst_prediction
@@ -37,12 +45,12 @@ class NanoparticleTest < MiniTest::Test
assert_equal 3, worst_predictions.size
assert_kind_of Integer, worst_predictions.first[:neighbors]
worst_predictions = cv.worst_predictions
- #puts worst_predictions.to_yaml
assert_equal 5, worst_predictions.size
assert_kind_of Array, worst_predictions.first[:neighbors]
assert_kind_of Integer, worst_predictions.first[:neighbors].first[:common_descriptors]
- worst_predictions = cv.worst_predictions(n: 2, show_common_descriptors: true)
puts worst_predictions.to_yaml
+ worst_predictions = cv.worst_predictions(n: 2, show_common_descriptors: true)
+ #puts worst_predictions.to_yaml
assert_equal 2, worst_predictions.size
assert_kind_of Array, worst_predictions.first[:neighbors]
refute_nil worst_predictions.first[:neighbors].first[:common_descriptors]
@@ -67,7 +75,35 @@ class NanoparticleTest < MiniTest::Test
training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
feature = Feature.find_or_create_by(name: "Log2 transformed", category: "TOX")
- model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :feature_selection_algorithm => :correlation_filter, :neighbor_algorithm => "physchem_neighbors", :neighbor_algorithm_parameters => {:min_sim => 0.5}})
+ model = Model::LazarRegression.create(feature, training_dataset, {
+ :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression",
+ :feature_selection_algorithm => :correlation_filter,
+ :prediction_algorithm_parameters => {:method => 'pls'},
+ #:feature_selection_algorithm_parameters => {:category => "P-CHEM"},
+ #:feature_selection_algorithm_parameters => {:category => "Proteomics"},
+ :neighbor_algorithm => "physchem_neighbors",
+ :neighbor_algorithm_parameters => {:min_sim => 0.5}
+ })
+ cv = RegressionCrossValidation.create model
+ p cv.rmse
+ p cv.r_squared
+ refute_nil cv.r_squared
+ refute_nil cv.rmse
+ end
+
+ def test_validate_random_forest_model
+ training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
+ feature = Feature.find_or_create_by(name: "Log2 transformed", category: "TOX")
+
+ model = Model::LazarRegression.create(feature, training_dataset, {
+ :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression",
+ :prediction_algorithm_parameters => {:method => 'rf'},
+ :feature_selection_algorithm => :correlation_filter,
+ #:feature_selection_algorithm_parameters => {:category => "P-CHEM"},
+ #:feature_selection_algorithm_parameters => {:category => "Proteomics"},
+ :neighbor_algorithm => "physchem_neighbors",
+ :neighbor_algorithm_parameters => {:min_sim => 0.5}
+ })
cv = RegressionCrossValidation.create model
p cv.rmse
p cv.r_squared