summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/compound.rb6
-rw-r--r--lib/crossvalidation.rb9
-rw-r--r--lib/lazar.rb1
-rw-r--r--lib/regression.rb67
-rw-r--r--test/lazar-regression.rb7
-rw-r--r--test/validation.rb23
6 files changed, 82 insertions, 31 deletions
diff --git a/lib/compound.rb b/lib/compound.rb
index 040fd6f..8f37247 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -38,7 +38,7 @@ module OpenTox
compound
end
- def fingerprint type="MP2D"
+ def fingerprint type=DEFAULT_FINGERPRINT
unless fingerprints[type]
return [] unless self.smiles
#http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format
@@ -337,12 +337,14 @@ module OpenTox
end
- # Get mg from mmol
+ # Convert mmol to mg
# @return [Float] value in mg
def mmol_to_mg mmol
mmol.to_f*molecular_weight
end
+ # Convert mg to mmol
+ # @return [Float] value in mmol
def mg_to_mmol mg
mg.to_f/molecular_weight
end
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 9b5c4e2..9789882 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -35,14 +35,14 @@ module OpenTox
predictions = []
training_dataset = Dataset.find model.training_dataset_id
training_dataset.folds(n).each_with_index do |fold,fold_nr|
- fork do # parallel execution of validations
+ #fork do # parallel execution of validations
$logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
t = Time.now
validation = Validation.create(model, fold[0], fold[1],cv)
$logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
- end
+ #end
end
- Process.waitall
+ #Process.waitall
cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id)
cv.validations.each do |validation|
nr_instances += validation.nr_instances
@@ -176,6 +176,7 @@ module OpenTox
mae = 0
weighted_mae = 0
confidence_sum = 0
+ p predictions
predictions.each do |pred|
compound_id,activity,prediction,confidence = pred
if activity and prediction
@@ -194,6 +195,8 @@ module OpenTox
y = predictions.collect{|p| p[2]}
R.assign "measurement", x
R.assign "prediction", y
+ p x
+ p y
R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
r = R.eval("r").to_ruby
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 5d9bc19..ae42d42 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -45,6 +45,7 @@ R = Rserve::Connection.new
R.eval "library(ggplot2)"
R.eval "library(grid)"
R.eval "library(gridExtra)"
+R.eval "library('pls')"
# Require sub-Repositories
require_relative '../libfminer/libbbrc/bbrc' # include before openbabel
diff --git a/lib/regression.rb b/lib/regression.rb
index 575a1ef..7c64d8f 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -9,7 +9,7 @@ module OpenTox
sim_sum = 0.0
confidence = 0.0
neighbors = params[:neighbors]
- activities = []
+ #activities = []
neighbors.each do |row|
#if row["dataset_ids"].include? params[:training_dataset_id]
sim = row["tanimoto"]
@@ -17,7 +17,7 @@ module OpenTox
# TODO add LOO errors
row["features"][params[:prediction_feature_id].to_s].each do |act|
weighted_sum += sim*Math.log10(act)
- activities << act
+ #activities << act # TODO: Transformation??
sim_sum += sim
end
#end
@@ -33,28 +33,51 @@ module OpenTox
{:value => prediction,:confidence => confidence}
end
- def self.local_linear_regression compound, neighbors
- return nil unless neighbors.size > 0
- features = neighbors.collect{|n| Compound.find(n.first).fp4}.flatten.uniq
- training_data = Array.new(neighbors.size){Array.new(features.size,0)}
- neighbors.each_with_index do |n,i|
- #p n.first
- neighbor = Compound.find n.first
- features.each_with_index do |f,j|
- training_data[i][j] = 1 if neighbor.fp4.include? f
+ def self.local_pls_regression compound, params
+ neighbors = params[:neighbors]
+ return {:value => nil, :confidence => nil} unless neighbors.size > 0
+ activities = []
+ fingerprints = {}
+ weights = []
+ fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort
+
+ neighbors.each_with_index do |row,i|
+ neighbor = Compound.find row["_id"]
+ fingerprint = neighbor.fingerprint
+ row["features"][params[:prediction_feature_id].to_s].each do |act|
+ activities << Math.log10(act)
+ weights << row["tanimoto"]
+ fingerprint_ids.each_with_index do |id,j|
+ fingerprints[id] ||= []
+ fingerprints[id] << fingerprint.include?(id)
+ end
+ end
+ end
+
+ name = Feature.find(params[:prediction_feature_id]).name
+ R.assign "activities", activities
+ R.assign "weights", weights
+ variables = []
+ data_frame = ["c(#{activities.join ","})"]
+ fingerprints.each do |k,v|
+ unless v.uniq.size == 1
+ data_frame << "factor(c(#{v.collect{|m| m ? "T" : "F"}.join ","}))"
+ variables << "'#{k}'"
end
end
- p training_data
-
- R.assign "activities", neighbors.collect{|n| n[2].median}
- R.assign "features", training_data
- R.eval "model <- lm(activities ~ features)"
- R.eval "summary <- summary(model)"
- p R.summary
- compound_features = features.collect{|f| compound.fp4.include? f ? 1 : 0}
- R.assign "compound_features", compound_features
- R.eval "prediction <- predict(model,compound_features)"
- p R.prediction
+ begin
+ R.eval "data <- data.frame(#{data_frame.join ","})"
+ R.eval "names(data) <- c('activities',#{variables.join ','})"
+ R.eval "model <- plsr(activities ~ .,data = data, ncomp = 3, weights = weights)"
+ compound_features = fingerprint_ids.collect{|f| compound.fingerprint.include? f }
+ R.eval "fingerprint <- rbind(c(#{compound_features.collect{|f| f ? "T" : "F"}.join ','}))"
+ R.eval "names(fingerprint) <- c(#{variables.join ','})"
+ R.eval "prediction <- predict(model,fingerprint)"
+ prediction = 10**R.eval("prediction").to_f
+ {:value => prediction, :confidence => 1} # TODO confidence
+ rescue
+ {:value => nil, :confidence => nil} # TODO confidence
+ end
end
diff --git a/test/lazar-regression.rb b/test/lazar-regression.rb
index c1dc9b9..9ade6d5 100644
--- a/test/lazar-regression.rb
+++ b/test/lazar-regression.rb
@@ -21,14 +21,15 @@ class LazarRegressionTest < MiniTest::Test
assert_equal 3, prediction[:neighbors].size
end
- def test_local_linear_regression
- skip
+ def test_local_pls_regression
training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
model = Model::LazarRegression.create training_dataset
- model.update(:prediction_algorithm => "OpenTox::Algorithm::Regression.local_linear_regression")
compound = Compound.from_smiles "NC(=O)OCCC"
prediction = model.predict compound
p prediction
+ model.update(:prediction_algorithm => "OpenTox::Algorithm::Regression.local_pls_regression")
+ prediction = model.predict compound
+ p prediction
#assert_equal 13.6, prediction[:value].round(1)
#assert_equal 0.83, prediction[:confidence].round(2)
#assert_equal 1, prediction[:neighbors].size
diff --git a/test/validation.rb b/test/validation.rb
index 95f9bc0..066ec95 100644
--- a/test/validation.rb
+++ b/test/validation.rb
@@ -30,7 +30,7 @@ class ValidationTest < MiniTest::Test
model = Model::LazarRegression.create dataset
cv = RegressionCrossValidation.create model
#cv = RegressionCrossValidation.find '561503262b72ed54fd000001'
- #p cv.id
+ p cv
#File.open("tmp.svg","w+"){|f| f.puts cv.correlation_plot}
#`inkview tmp.svg`
#File.open("tmp.svg","w+"){|f| f.puts cv.confidence_plot}
@@ -71,6 +71,27 @@ class ValidationTest < MiniTest::Test
assert cv.mae < 1
end
+ def test_pls_regression_crossvalidation
+ dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
+ #dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.csv"
+ params = {
+ :prediction_algorithm => "OpenTox::Algorithm::Regression.local_pls_regression",
+ }
+ model = Model::LazarRegression.create dataset, params
+ cv = RegressionCrossValidation.create model
+ #p cv
+ cv.validation_ids.each do |vid|
+ model = Model::Lazar.find(Validation.find(vid).model_id)
+ p model
+ #assert_equal params[:neighbor_algorithm_parameters][:type], model[:neighbor_algorithm_parameters][:type]
+ #assert_equal params[:neighbor_algorithm_parameters][:min_sim], model[:neighbor_algorithm_parameters][:min_sim]
+ #refute_equal params[:neighbor_algorithm_parameters][:training_dataset_id], model[:neighbor_algorithm_parameters][:training_dataset_id]
+ end
+
+ assert cv.rmse < 1.5, "RMSE > 1.5"
+ assert cv.mae < 1
+ end
+
def test_repeated_crossvalidation
dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
model = Model::LazarClassification.create dataset