summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2016-02-27 16:47:48 +0100
committer Christoph Helma <helma@in-silico.ch> 2016-02-27 16:47:48 +0100
commit b90720cc26d789a96fa6f7a054fe06fc8b4ef33d (patch)
tree 3c5e28e4576b31eb66d1bc29a7c2abea38802bb3
parent e778475c578f13f30af4437845716d7e781c2609 (diff)
local pls regression as default regression algorithm (fminer)
-rw-r--r-- lib/compound.rb 1
-rw-r--r-- lib/crossvalidation.rb 16
-rw-r--r-- lib/lazar.rb 2
-rw-r--r-- lib/model.rb 4
-rw-r--r-- lib/regression.rb 64
-rw-r--r-- lib/validation.rb 19
-rw-r--r-- test/descriptor.rb 1
-rw-r--r-- test/lazar-regression.rb 15
8 files changed, 100 insertions, 22 deletions
diff --git a/lib/compound.rb b/lib/compound.rb
index 8f37247..d5d6aa9 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -23,6 +23,7 @@ module OpenTox
field :sdf_id, type: BSON::ObjectId
field :molecular_weight, type: Float
field :fingerprints, type: Hash, default: {}
+ field :physchem, type: Hash, default: {}
field :default_fingerprint_size, type: Integer
field :dataset_ids, type: Array, default: []
field :features, type: Hash, default: {}
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 0c5f0be..362842e 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -55,7 +55,7 @@ module OpenTox
predictions: predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
)
$logger.debug "Nr unpredicted: #{nr_unpredicted}"
- cv.statistics
+ #cv.statistics
cv
end
end
@@ -179,12 +179,14 @@ module OpenTox
predictions.each do |pred|
compound_id,activity,prediction,confidence = pred
if activity and prediction
- error = Math.log10(prediction)-Math.log10(activity)
- rmse += error**2
- weighted_rmse += confidence*error**2
- mae += error.abs
- weighted_mae += confidence*error.abs
- confidence_sum += confidence
+ activity.each do |act|
+ error = Math.log10(prediction)-Math.log10(act)
+ rmse += error**2
+ weighted_rmse += confidence*error**2
+ mae += error.abs
+ weighted_mae += confidence*error.abs
+ confidence_sum += confidence
+ end
else
warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
$logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
diff --git a/lib/lazar.rb b/lib/lazar.rb
index ae42d42..e5c1609 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -45,7 +45,7 @@ R = Rserve::Connection.new
R.eval "library(ggplot2)"
R.eval "library(grid)"
R.eval "library(gridExtra)"
-R.eval "library('pls')"
+R.eval "library(pls)"
# Require sub-Repositories
require_relative '../libfminer/libbbrc/bbrc' # include before openbabel
diff --git a/lib/model.rb b/lib/model.rb
index 0d2354f..41b3217 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -34,7 +34,7 @@ module OpenTox
def initialize training_dataset, params={}
super params
- bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1
+ #bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1
# TODO document convention
prediction_feature = training_dataset.features.first
@@ -159,7 +159,7 @@ module OpenTox
def self.create training_dataset, params={}
model = self.new training_dataset, params
model.neighbor_algorithm ||= "fingerprint_neighbors"
- model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.weighted_average"
+ model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_pls_regression"
model.neighbor_algorithm_parameters ||= {}
{
:type => "MP2D",
diff --git a/lib/regression.rb b/lib/regression.rb
index 2b41851..10a1861 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -93,6 +93,70 @@ module OpenTox
end
+ def self.local_physchem_regression compound, params # Predicts an activity for compound from a local PLS model (R plsr) fitted on the fingerprints of params[:neighbors]; returns a Hash with :value and :confidence (plus :warning on the fallback paths)
+ neighbors = params[:neighbors]
+ return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
+ activities = []
+ fingerprints = {}
+ weights = []
+ fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort # sorted, de-duplicated fingerprint entries over all neighbors
+ # Build one observation per neighbor activity: log10 activity, similarity weight, and a presence flag for every fingerprint id.
+ neighbors.each_with_index do |row,i| # NOTE(review): index i is not used inside the block
+ neighbor = Compound.find row["_id"]
+ fingerprint = neighbor.fingerprint
+ row["features"][params[:prediction_feature_id].to_s].each do |act|
+ activities << Math.log10(act) # the model is fitted on log10 activities (transformed back before returning)
+ weights << row["tanimoto"] # observation weight is the neighbor's "tanimoto" entry
+ fingerprint_ids.each_with_index do |id,j| # NOTE(review): index j is not used inside the block
+ fingerprints[id] ||= []
+ fingerprints[id] << fingerprint.include?(id) # true/false: does this neighbor carry fingerprint entry id
+ end
+ end
+ end
+ # Hand the collected data to R; fingerprint columns with a single distinct value are left out of the data frame.
+ name = Feature.find(params[:prediction_feature_id]).name # NOTE(review): name is not referenced again in this method
+ R.assign "activities", activities
+ R.assign "weights", weights
+ variables = []
+ data_frame = ["c(#{activities.join ","})"]
+ fingerprints.each do |k,v|
+ unless v.uniq.size == 1 # skip columns that are constant across all observations
+ data_frame << "factor(c(#{v.collect{|m| m ? "T" : "F"}.join ","}))"
+ variables << k
+ end
+ end
+ if variables.empty? # no non-constant fingerprint column left to regress on
+ result = weighted_average(compound, params)
+ result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
+ return result
+ return {:value => nil, :confidence => nil} # NOTE(review): unreachable, it follows an unconditional return; TODO confidence
+ else
+ R.eval "data <- data.frame(#{data_frame.join ","})"
+ R.assign "features", variables
+ R.eval "names(data) <- append(c('activities'),features)" # column names: 'activities' first, then the retained fingerprint ids
+ begin
+ R.eval "model <- plsr(activities ~ .,data = data, ncomp = 4, weights = weights)" # weights is the R variable assigned above; ncomp is fixed at 4
+ rescue # fall back to weighted average (bare rescue: any StandardError raised by the R call)
+ result = weighted_average(compound, params)
+ result[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
+ return result
+ end
+ #begin
+ #compound_features = fingerprint_ids.collect{|f| compound.fingerprint.include? f } # FIX
+ compound_features = variables.collect{|f| compound.fingerprint.include? f } # query compound flags for the retained variables only
+ R.eval "fingerprint <- rbind(c(#{compound_features.collect{|f| f ? "T" : "F"}.join ','}))"
+ R.eval "names(fingerprint) <- features" # label the query values with the same feature names as the training columns
+ R.eval "prediction <- predict(model,fingerprint)"
+ prediction = 10**R.eval("prediction").to_f # undo the log10 transform applied to the training activities
+ return {:value => prediction, :confidence => 1} # confidence is hard-coded to 1; TODO confidence
+ #rescue
+ #p "Prediction failed"
+ #return {:value => nil, :confidence => nil} # TODO confidence
+ #end
+ end
+
+ end
+
def self.weighted_average_with_relevant_fingerprints neighbors
weighted_sum = 0.0
sim_sum = 0.0
diff --git a/lib/validation.rb b/lib/validation.rb
index 651860e..9c19cde 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -62,6 +62,13 @@ module OpenTox
validation
end
+ end
+
+ class ClassificationValidation < Validation # Validation subclass; no members are defined in this class body
+ end
+
+ class RegressionValidation < Validation
+
def statistics
rmse = 0
weighted_rmse = 0
@@ -105,18 +112,8 @@ module OpenTox
finished_at: Time.now
)
=end
- puts "R^2 #{r**2}"
- puts "RMSE #{rmse}"
- puts "MAE #{mae}"
- return { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae }
+ { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae }
end
-
- end
-
- class ClassificationValidation < Validation
- end
-
- class RegressionValidation < Validation
end
end
diff --git a/test/descriptor.rb b/test/descriptor.rb
index 58149a7..28be79e 100644
--- a/test/descriptor.rb
+++ b/test/descriptor.rb
@@ -62,6 +62,7 @@ class DescriptorTest < MiniTest::Test
assert_equal 330, result.size
assert_equal 30.8723, result[2]
assert_equal 5, result[328]
+ p result
end
def test_compound_descriptor_parameters
diff --git a/test/lazar-regression.rb b/test/lazar-regression.rb
index 9ade6d5..932b91c 100644
--- a/test/lazar-regression.rb
+++ b/test/lazar-regression.rb
@@ -4,7 +4,7 @@ class LazarRegressionTest < MiniTest::Test
def test_weighted_average
training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
- model = Model::LazarRegression.create training_dataset, {:neighbor_algorithm_parameters => {:min_sim => 0}}
+ model = Model::LazarRegression.create training_dataset, {:neighbor_algorithm_parameters => {:min_sim => 0}, :prediction_algorithm => "OpenTox::Algorithm::Regression.weighted_average"}
compound = Compound.from_smiles "CC(C)(C)CN"
prediction = model.predict compound
assert_equal 7.2, prediction[:value].round(1)
@@ -35,4 +35,17 @@ class LazarRegressionTest < MiniTest::Test
#assert_equal 1, prediction[:neighbors].size
end
+ def test_local_physchem_regression # Runs one prediction with local_physchem_regression and one with local_pls_regression on the same model; no assertions are active
+ training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
+ model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
+ compound = Compound.from_smiles "NC(=O)OCCC"
+ prediction = model.predict compound # NOTE(review): this first result is overwritten below without being checked
+ model.update(:prediction_algorithm => "OpenTox::Algorithm::Regression.local_pls_regression") # switch the same model to the other algorithm
+ prediction = model.predict compound
+ p prediction # prints the second prediction; the assertions below are commented out
+ #assert_equal 13.6, prediction[:value].round(1)
+ #assert_equal 0.83, prediction[:confidence].round(2)
+ #assert_equal 1, prediction[:neighbors].size
+ end
+
end