summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2016-10-06 19:14:10 +0200
committerChristoph Helma <helma@in-silico.ch>2016-10-06 19:14:10 +0200
commit4348eec89033e6677c9f628646fc67bd03c73fe6 (patch)
tree5b7465c3c286ba343268ce8e29b10cc1b03a667b /lib
parentec87f7e079f3a7ef8ea6a0fa57f3b40e81ecaed0 (diff)
nano caret regression fixed
Diffstat (limited to 'lib')
-rw-r--r--lib/lazar.rb1
-rw-r--r--lib/model.rb64
-rw-r--r--lib/regression.rb220
-rw-r--r--lib/train-test-validation.rb5
4 files changed, 31 insertions, 259 deletions
diff --git a/lib/lazar.rb b/lib/lazar.rb
index d0f05c0..f251379 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -83,6 +83,7 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation","
"model.rb",
"classification.rb",
"regression.rb",
+ "caret.rb",
"validation-statistics.rb",
"validation.rb",
"train-test-validation.rb",
diff --git a/lib/model.rb b/lib/model.rb
index a272580..290309a 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -23,10 +23,12 @@ module OpenTox
# explicit prediction algorithm
if algorithms[:prediction] and algorithms[:prediction][:method]
case algorithms[:prediction][:method]
- when /Classifiction/
+ when /Classification/i
model = LazarClassification.new
- when /Regression/
+ when /Regression/i
model = LazarRegression.new
+ else
+ bad_request_error "Prediction method '#{algorithms[:prediction][:method]}' not implemented."
end
# guess model type
@@ -36,6 +38,10 @@ module OpenTox
model = LazarClassification.new
end
+ model.prediction_feature_id = prediction_feature.id
+ model.training_dataset_id = training_dataset.id
+ model.name = "#{training_dataset.name} #{prediction_feature.name}"
+
# set defaults
substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1
@@ -60,7 +66,7 @@ module OpenTox
}
elsif model.class == LazarRegression
model.algorithms[:prediction] = {
- :method => "Algorithm::Regression.caret",
+ :method => "Algorithm::Caret.regression",
:parameters => "pls",
}
end
@@ -77,7 +83,7 @@ module OpenTox
:min => 0.5
},
:prediction => {
- :method => "Algorithm::Regression.caret",
+ :method => "Algorithm::Caret.regression",
:parameters => "rf",
},
:feature_selection => {
@@ -100,10 +106,6 @@ module OpenTox
end
end
- model.prediction_feature_id = prediction_feature.id
- model.training_dataset_id = training_dataset.id
- model.name = "#{training_dataset.name} #{prediction_feature.name}"
-
if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method]
model.relevant_features = Algorithm.run model.algorithms[:feature_selection][:method], dataset: training_dataset, prediction_feature: prediction_feature, types: model.algorithms[:descriptors][:types]
end
@@ -151,8 +153,12 @@ module OpenTox
else
bad_request_error "Descriptor method '#{algorithms[:descriptors][:method]}' not available."
end
- params = algorithms[:prediction].merge({:descriptors => descriptors, :neighbors => neighbors})
- params.delete :method
+ params = {
+ :method => algorithms[:prediction][:parameters],
+ :descriptors => descriptors,
+ :neighbors => neighbors,
+ :relevant_features => relevant_features
+ }
result = Algorithm.run algorithms[:prediction][:method], params
prediction.merge! result
prediction[:neighbors] = neighbors
@@ -218,11 +224,9 @@ module OpenTox
end
class LazarClassification < Lazar
-
end
class LazarRegression < Lazar
-
end
class Prediction
@@ -240,7 +244,7 @@ module OpenTox
field :leave_one_out_validation_id, type: BSON::ObjectId
def predict object
- Lazar.find(model_id).predict object
+ model.predict object
end
def training_dataset
@@ -251,6 +255,10 @@ module OpenTox
Lazar.find model_id
end
+ def prediction_feature
+ model.prediction_feature
+ end
+
def repeated_crossvalidation
Validation::RepeatedCrossValidation.find repeated_crossvalidation_id
end
@@ -276,15 +284,8 @@ module OpenTox
bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
prediction_model = self.new JSON.parse(File.read(metadata_file))
training_dataset = Dataset.from_csv_file file
- prediction_feature = training_dataset.features.first
- model = nil
- if prediction_feature.nominal?
- model = LazarClassification.create prediction_feature, training_dataset
- elsif prediction_feature.numeric?
- model = LazarRegression.create prediction_feature, training_dataset
- end
+ model = Lazar.create training_dataset: training_dataset
prediction_model[:model_id] = model.id
- prediction_model[:prediction_feature_id] = prediction_feature.id
prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id
#prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id
prediction_model.save
@@ -297,26 +298,19 @@ module OpenTox
def self.from_json_dump dir, category
Import::Enanomapper.import dir
-
+ training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
+ unless training_dataset
+ Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm")
+ training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
+ end
prediction_model = self.new(
:endpoint => "log2(Net cell association)",
:source => "https://data.enanomapper.net/",
:species => "A549 human lung epithelial carcinoma cells",
:unit => "log2(ug/Mg)"
)
- params = {
- :feature_selection_algorithm => :correlation_filter,
- :feature_selection_algorithm_parameters => {:category => category},
- :neighbor_algorithm => "physchem_neighbors",
- :neighbor_algorithm_parameters => {:min_sim => 0.5},
- :prediction_algorithm => "OpenTox::Algorithm::Regression.physchem_regression",
- :prediction_algorithm_parameters => {:method => 'rf'}, # random forests
- }
- training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
- prediction_feature = Feature.find_or_create_by(name: "log2(Net cell association)", category: "TOX")
- #prediction_feature = Feature.find("579621b84de73e267b414e55")
- prediction_model[:prediction_feature_id] = prediction_feature.id
- model = Model::LazarRegression.create(prediction_feature, training_dataset, params)
+ prediction_feature = Feature.where(name: "log2(Net cell association)", category: "TOX").first
+ model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset)
prediction_model[:model_id] = model.id
repeated_cv = Validation::RepeatedCrossValidation.create model
prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id
diff --git a/lib/regression.rb b/lib/regression.rb
index 396c9e4..cf6d9cb 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -19,226 +19,6 @@ module OpenTox
{:value => prediction}
end
- def self.caret descriptors:, neighbors:, method: "pls", parameters:nil
- values = []
- descriptors = {}
- weights = []
- descriptor_ids = neighbors.collect{|n| n["descriptors"]}.flatten.uniq.sort
-
- neighbors.each do |n|
- activities = n["measurements"]
- activities.each do |act|
- values << act
- weights << n["similarity"]
- descriptor_ids.each do |id|
- descriptors[id] ||= []
- descriptors[id] << n["descriptors"].include?(id)
- end
- end if activities
- end
-
- variables = []
- data_frame = [values]
-
- descriptors.each do |k,v|
- unless v.uniq.size == 1
- data_frame << v.collect{|m| m ? "T" : "F"}
- variables << k
- end
- end
-
- if variables.empty?
- prediction = weighted_average(descriptors: descriptors, neighbors: neighbors)
- prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
- prediction
- else
- substance_features = variables.collect{|f| descriptors.include?(f) ? "T" : "F"}
- #puts data_frame.to_yaml
- prediction = r_model_prediction method, data_frame, variables, weights, substance_features
- if prediction.nil? or prediction[:value].nil?
- prediction = weighted_average(descriptors: descriptors, neighbors: neighbors)
- prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances."
- prediction
- else
- prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]]
- prediction[:value] = prediction[:value]
- prediction[:rmse] = prediction[:rmse]
- prediction
- end
- end
-
- end
-
- def self.fingerprint_regression substance:, neighbors:, method: "pls" #, method_params="sigma=0.05"
- values = []
- fingerprints = {}
- weights = []
- fingerprint_ids = neighbors.collect{|n| Compound.find(n["_id"]).fingerprint}.flatten.uniq.sort
-
- neighbors.each do |n|
- fingerprint = Substance.find(n["_id"]).fingerprint
- activities = n["measurements"]
- activities.each do |act|
- values << act
- weights << n["similarity"]
- fingerprint_ids.each do |id|
- fingerprints[id] ||= []
- fingerprints[id] << fingerprint.include?(id)
- end
- end if activities
- end
-
- variables = []
- data_frame = [values]
-
- fingerprints.each do |k,v|
- unless v.uniq.size == 1
- data_frame << v.collect{|m| m ? "T" : "F"}
- variables << k
- end
- end
-
- if variables.empty?
- prediction = weighted_average(substance: substance, neighbors: neighbors)
- prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
- prediction
- else
- substance_features = variables.collect{|f| substance.fingerprint.include?(f) ? "T" : "F"}
- prediction = r_model_prediction method, data_frame, variables, weights, substance_features
- if prediction.nil? or prediction[:value].nil?
- prediction = weighted_average(substance: substance, neighbors: neighbors)
- prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances."
- prediction
- else
- prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]]
- prediction[:value] = prediction[:value]
- prediction[:rmse] = prediction[:rmse]
- prediction
- end
- end
-
- end
-
-=begin
- def self.physchem_regression substance:, neighbors:, method: "pls"
-
- activities = []
- weights = []
- pc_ids = neighbors.collect{|n| n["common_descriptors"].collect{|d| d[:id]}}.flatten.uniq.sort
- data_frame = []
- data_frame[0] = []
-
- neighbors.each_with_index do |n,i|
- activities = n["measurements"]
- activities.each do |act|
- data_frame[0][i] = act
- weights << n["similarity"]
- n["common_descriptors"].each do |d|
- j = pc_ids.index(d[:id])+1
- data_frame[j] ||= []
- data_frame[j][i] = d[:scaled_value]
- end
- end if activities
- (0..pc_ids.size).each do |j| # for R: fill empty values with NA
- data_frame[j] ||= []
- data_frame[j][i] ||= "NA"
- end
- end
-
- data_frame = data_frame.each_with_index.collect do |r,i|
- if r.uniq.size == 1 # remove properties with a single value
- r = nil
- pc_ids[i-1] = nil # data_frame frame has additional activity entry
- end
- r
- end
- data_frame.compact!
- pc_ids.compact!
-
- if pc_ids.empty?
- prediction = weighted_average(substance: substance, neighbors: neighbors)
- prediction[:warning] = "No relevant variables for regression model. Using weighted average of similar substances."
- prediction
- else
- query_descriptors = pc_ids.collect { |i| substance.scaled_values[i] }
- query_descriptors = query_descriptors.each_with_index.collect do |v,i|
- unless v
- v = nil
- data_frame[i] = nil
- pc_ids[i] = nil
- end
- v
- end
- query_descriptors.compact!
- data_frame.compact!
- pc_ids.compact!
- prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors
- if prediction.nil?
- prediction = weighted_average(substance: substance, neighbors: neighbors)
- prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances."
- end
- p prediction
- prediction
- end
-
- end
-=end
-
- def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values
- R.assign "weights", training_weights
- r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
-=begin
-rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
- File.open("tmp.R","w+"){|f|
- f.puts "suppressPackageStartupMessages({
- library(iterators,lib=\"#{rlib}\")
- library(foreach,lib=\"#{rlib}\")
- library(ggplot2,lib=\"#{rlib}\")
- library(grid,lib=\"#{rlib}\")
- library(gridExtra,lib=\"#{rlib}\")
- library(pls,lib=\"#{rlib}\")
- library(caret,lib=\"#{rlib}\")
- library(doMC,lib=\"#{rlib}\")
- registerDoMC(#{NR_CORES})
-})"
-
- f.puts "data <- #{r_data_frame}\n"
- f.puts "weights <- c(#{training_weights.join(', ')})"
- f.puts "features <- c(#{training_features.join(', ')})"
- f.puts "names(data) <- append(c('activities'),features)" #
- f.puts "ctrl <- rfeControl(functions = #{method}, method = 'repeatedcv', repeats = 5, verbose = T)"
- f.puts "lmProfile <- rfe(activities ~ ., data = data, rfeControl = ctrl)"
-
- f.puts "model <- train(activities ~ ., data = data, method = '#{method}')"
- f.puts "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
- f.puts "names(fingerprint) <- features"
- f.puts "prediction <- predict(model,fingerprint)"
- }
-=end
-
- R.eval "data <- #{r_data_frame}"
- R.assign "features", training_features
- begin
- R.eval "names(data) <- append(c('activities'),features)" #
- R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)"
- R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
- R.eval "names(fingerprint) <- features"
- R.eval "prediction <- predict(model,fingerprint)"
- value = R.eval("prediction").to_f
- rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f
- r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f
- prediction_interval = value-1.96*rmse, value+1.96*rmse
- {
- :value => value,
- :rmse => rmse,
- :r_squared => r_squared,
- :prediction_interval => prediction_interval
- }
- rescue
- return nil
- end
- end
-
end
end
end
diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb
index 286614a..e3f5905 100644
--- a/lib/train-test-validation.rb
+++ b/lib/train-test-validation.rb
@@ -9,10 +9,7 @@ module OpenTox
def self.create model, training_set, test_set
- atts = model.attributes.dup # do not modify attributes of the original model
- atts["_id"] = BSON::ObjectId.new
- atts[:training_dataset_id] = training_set.id
- validation_model = model.class.create model.prediction_feature, training_set, atts
+ validation_model = model.class.create prediction_feature: model.prediction_feature, training_dataset: training_set, algorithms: model.algorithms
validation_model.save
predictions = validation_model.predict test_set.substances
nr_unpredicted = 0