summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/dataset.rb43
-rw-r--r--lib/lazar.rb14
-rw-r--r--lib/model.rb15
-rw-r--r--lib/overwrite.rb8
-rw-r--r--lib/regression.rb38
-rw-r--r--test/regression.rb4
6 files changed, 49 insertions, 73 deletions
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 59a68e5..b9c2187 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -52,46 +52,6 @@ module OpenTox
# Split a dataset into n folds
# @param [Integer] number of folds
# @return [Array] Array with folds [training_dataset,test_dataset]
-=begin
- def folds n
- # TODO fix splits for duplicates
- unique_compound_ids = compound_ids.uniq
- len = unique_compond_ids.size
- indices = (0..len-1).to_a.shuffle
- mid = (len/n)
- chunks = []
- start = 0
- 1.upto(n) do |i|
- last = start+mid
- last = last-1 unless len%n >= i
- test_idxs = indices[start..last] || []
- test_cids = test_idxs.collect{|i| unique_compond_ids[i]}
- test_data_entries = test_idxs.collect{|i| self.data_entries[i]}
- test_dataset = self.class.new(:compound_ids => test_cids, :feature_ids => self.feature_ids, :data_entries => test_data_entries)
- test_dataset.compounds.each do |compound|
- compound.dataset_ids << test_dataset.id
- compound.save
- end
- training_idxs = indices-test_idxs
- training_cids = training_idxs.collect{|i| unique_compond_ids[i]}
- training_data_entries = training_idxs.collect{|i| self.data_entries[i]}
- training_dataset = self.class.new(:compound_ids => training_cids, :feature_ids => self.feature_ids, :data_entries => training_data_entries)
- training_dataset.compounds.each do |compound|
- compound.dataset_ids << training_dataset.id
- compound.save
- end
- test_dataset.save
- training_dataset.save
- chunks << [training_dataset,test_dataset]
- start = last+1
- end
- chunks
- end
-=end
-
- # Split a dataset into n folds
- # @param [Integer] number of folds
- # @return [Array] Array with folds [training_dataset,test_dataset]
def folds n
unique_compound_data = {}
compound_ids.each_with_index do |cid,i|
@@ -121,18 +81,15 @@ module OpenTox
end
end
dataset = self.class.new(:compound_ids => cids, :feature_ids => self.feature_ids, :data_entries => data_entries, :source => self.id )
-=begin
dataset.compounds.each do |compound|
compound.dataset_ids << dataset.id
compound.save
end
-=end
dataset
end
start = last+1
chunks << chunk
end
- puts chunks.inspect
chunks
end
diff --git a/lib/lazar.rb b/lib/lazar.rb
index c43dae7..bcae96f 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -42,10 +42,16 @@ end
# R setup
R = Rserve::Connection.new
-R.eval "library(ggplot2)"
-R.eval "library(grid)"
-R.eval "library(gridExtra)"
-R.eval "library(pls)"
+R.eval "
+suppressPackageStartupMessages({
+ library(ggplot2)
+ library(grid)
+ library(gridExtra)
+ library(caret)
+ library(doMC)
+ registerDoMC(4)
+})
+"
# Require sub-Repositories
require_relative '../libfminer/libbbrc/bbrc' # include before openbabel
diff --git a/lib/model.rb b/lib/model.rb
index a53be92..8cffdfd 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -34,7 +34,6 @@ module OpenTox
def initialize training_dataset, params={}
super params
- #bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1
# TODO document convention
prediction_feature = training_dataset.features.first
@@ -82,16 +81,16 @@ module OpenTox
prediction = {}
if neighbors.collect{|n| n["_id"]}.include? compound.id
- database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s]
+ database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq
prediction[:database_activities] = database_activities
- prediction[:warning] = "#{database_activities.size} structures have been removed from neighbors, because they have the same structure as the query compound."
+ prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound."
neighbors.delete_if{|n| n["_id"] == compound.id}
end
neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] }
if neighbors.empty?
prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset."})
else
- prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset.id,:prediction_feature_id => prediction_feature.id}))
+ prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset_id,:prediction_feature_id => prediction_feature.id}))
end
predictions << prediction
end
@@ -114,14 +113,13 @@ module OpenTox
:prediction_feature_id => prediction_feature.id
)
- confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Prediction confidence" )
+ confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Model RMSE" )
# TODO move into warnings field
warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings")
prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ]
prediction_dataset.compounds = compounds
- #prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:database_activities] ? "measured" : p[:confidence] , p[:warning]]}
# TODO fix dataset measurements
- prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence] , p[:dataset_activities].to_s, p[:warning]]}
+ prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:rmse] , p[:dataset_activities].to_s, p[:warning]]}
prediction_dataset.save
return prediction_dataset
end
@@ -159,14 +157,13 @@ module OpenTox
def self.create training_dataset, params={}
model = self.new training_dataset, params
model.neighbor_algorithm ||= "fingerprint_neighbors"
- model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_pls_regression"
+ model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression"
model.neighbor_algorithm_parameters ||= {}
{
:type => "MP2D",
:training_dataset_id => training_dataset.id,
:min_sim => 0.1
#:type => "FP4",
- #:training_dataset_id => training_dataset.id,
#:min_sim => 0.7
}.each do |key,value|
model.neighbor_algorithm_parameters[key] ||= value
diff --git a/lib/overwrite.rb b/lib/overwrite.rb
index c92ad2b..2287a92 100644
--- a/lib/overwrite.rb
+++ b/lib/overwrite.rb
@@ -22,6 +22,14 @@ class Numeric
end
end
+class Float
+ # round to significant digits
+ # http://stackoverflow.com/questions/8382619/how-to-round-a-float-to-a-specified-number-of-significant-digits-in-ruby
+ def signif(signs)
+ Float("%.#{signs}g" % self)
+ end
+end
+
module Enumerable
# @return [Array] only the duplicates of an enumerable
def duplicates
diff --git a/lib/regression.rb b/lib/regression.rb
index c988542..2bf8915 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -4,7 +4,7 @@ module OpenTox
# TODO add LOO errors
class Regression
- def self.weighted_average compound, params
+ def self.local_weighted_average compound, params
weighted_sum = 0.0
sim_sum = 0.0
confidence = 0.0
@@ -23,7 +23,8 @@ module OpenTox
end
# TODO explicit neighbors, also for physchem
- def self.local_fingerprint_regression compound, params, algorithm="plsr", algorithm_params="ncomp = 4"
+ #def self.local_fingerprint_regression compound, params, method="pls", method_params="ncomp = 4"
+ def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05"
neighbors = params[:neighbors]
return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
activities = []
@@ -54,25 +55,27 @@ module OpenTox
end
if variables.empty?
- result = weighted_average(compound, params)
+ result = local_weighted_average(compound, params)
result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
return result
else
compound_features = variables.collect{|f| compound.fingerprint.include?(f) ? "T" : "F"}
- prediction = r_model_prediction algorithm, algorithm_params, data_frame, variables, weights, compound_features
+ prediction = r_model_prediction method, data_frame, variables, weights, compound_features
if prediction.nil?
- prediction = weighted_average(compound, params)
+ prediction = local_weighted_average(compound, params)
prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
return prediction
else
- return {:value => 10**prediction, :confidence => 1} # TODO confidence
+ prediction[:value] = 10**prediction[:value]
+ prediction[:rmse] = 10**prediction[:rmse]
+ prediction
end
end
end
- def self.local_physchem_regression compound, params, algorithm="plsr", algorithm_params="ncomp = 4"
+ def self.local_physchem_regression compound, params, method="plsr"#, method_params="ncomp = 4"
neighbors = params[:neighbors]
return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
@@ -100,39 +103,44 @@ module OpenTox
end
if physchem.empty?
- result = weighted_average(compound, params)
+ result = local_weighted_average(compound, params)
result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
return result
else
data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] }
- prediction = r_model_prediction algorithm, algorithm_params, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]}
+ prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]}
if prediction.nil?
- prediction = weighted_average(compound, params)
+ prediction = local_weighted_average(compound, params)
prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
return prediction
else
- return {:value => 10**prediction, :confidence => 1} # TODO confidence
+ prediction[:value] = 10**prediction[:value]
+ prediction
end
end
end
- def self.r_model_prediction algorithm, params, training_data, training_features, training_weights, query_feature_values
+ def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values
R.assign "weights", training_weights
r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
R.eval "data <- #{r_data_frame}"
R.assign "features", training_features
R.eval "names(data) <- append(c('activities'),features)" #
begin
- R.eval "model <- #{algorithm}(activities ~ .,data = data, weights = weights, #{params})"
+ R.eval "model <- train(activities ~ ., data = data, method = '#{method}')"#, #{params}"
rescue
return nil
end
- R.eval "fingerprint <- rbind(c(#{query_feature_values.join ','}))"
+ R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
R.eval "names(fingerprint) <- features"
R.eval "prediction <- predict(model,fingerprint)"
- R.eval("prediction").to_f
+ {
+ :value => R.eval("prediction").to_f,
+ :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f,
+ :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f,
+ }
end
end
diff --git a/test/regression.rb b/test/regression.rb
index fa3b7fb..c25ed2b 100644
--- a/test/regression.rb
+++ b/test/regression.rb
@@ -26,7 +26,7 @@ class LazarRegressionTest < MiniTest::Test
model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_fingerprint_regression")
compound = Compound.from_smiles "NC(=O)OCCC"
prediction = model.predict compound
- p prediction[:value]
+ p prediction
refute_nil prediction[:value]
end
@@ -35,7 +35,7 @@ class LazarRegressionTest < MiniTest::Test
model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
compound = Compound.from_smiles "NC(=O)OCCC"
prediction = model.predict compound
- p prediction[:value]
+ p prediction
refute_nil prediction[:value]
end