From f61b7d3c65d084747dc1bf87214e5ec0c57326be Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 9 Feb 2016 11:04:00 +0100 Subject: pls regression --- lib/compound.rb | 6 +++-- lib/crossvalidation.rb | 9 ++++--- lib/lazar.rb | 1 + lib/regression.rb | 67 ++++++++++++++++++++++++++++++++---------------- test/lazar-regression.rb | 7 ++--- test/validation.rb | 23 ++++++++++++++++- 6 files changed, 82 insertions(+), 31 deletions(-) diff --git a/lib/compound.rb b/lib/compound.rb index 040fd6f..8f37247 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -38,7 +38,7 @@ module OpenTox compound end - def fingerprint type="MP2D" + def fingerprint type=DEFAULT_FINGERPRINT unless fingerprints[type] return [] unless self.smiles #http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format @@ -337,12 +337,14 @@ module OpenTox end - # Get mg from mmol + # Convert mg to mmol # @return [Float] value in mg def mmol_to_mg mmol mmol.to_f*molecular_weight end + # Convert mmol to mg + # @return [Float] value in mg def mg_to_mmol mg mg.to_f/molecular_weight end diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 9b5c4e2..9789882 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -35,14 +35,14 @@ module OpenTox predictions = [] training_dataset = Dataset.find model.training_dataset_id training_dataset.folds(n).each_with_index do |fold,fold_nr| - fork do # parallel execution of validations + #fork do # parallel execution of validations $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" t = Time.now validation = Validation.create(model, fold[0], fold[1],cv) $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" - end + #end end - Process.waitall + #Process.waitall cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id) cv.validations.each do |validation| nr_instances += validation.nr_instances @@ -176,6 +176,7 @@ module OpenTox mae = 0 weighted_mae = 0 confidence_sum = 0 + p predictions predictions.each do |pred| compound_id,activity,prediction,confidence = pred if activity and prediction @@ -194,6 +195,8 @@ module OpenTox y = predictions.collect{|p| p[2]} R.assign "measurement", x R.assign "prediction", y + p x + p y R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')" r = R.eval("r").to_ruby diff --git a/lib/lazar.rb b/lib/lazar.rb index 5d9bc19..ae42d42 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -45,6 +45,7 @@ R = Rserve::Connection.new R.eval "library(ggplot2)" R.eval "library(grid)" R.eval "library(gridExtra)" +R.eval "library('pls')" # Require sub-Repositories require_relative '../libfminer/libbbrc/bbrc' # include before openbabel diff --git a/lib/regression.rb b/lib/regression.rb index 575a1ef..7c64d8f 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -9,7 +9,7 @@ module OpenTox sim_sum = 0.0 confidence = 0.0 neighbors = params[:neighbors] - activities = [] + #activities = [] neighbors.each do |row| #if row["dataset_ids"].include? params[:training_dataset_id] sim = row["tanimoto"] @@ -17,7 +17,7 @@ module OpenTox # TODO add LOO errors row["features"][params[:prediction_feature_id].to_s].each do |act| weighted_sum += sim*Math.log10(act) - activities << act + #activities << act # TODO: Transformation?? sim_sum += sim end #end @@ -33,28 +33,51 @@ module OpenTox {:value => prediction,:confidence => confidence} end - def self.local_linear_regression compound, neighbors - return nil unless neighbors.size > 0 - features = neighbors.collect{|n| Compound.find(n.first).fp4}.flatten.uniq - training_data = Array.new(neighbors.size){Array.new(features.size,0)} - neighbors.each_with_index do |n,i| - #p n.first - neighbor = Compound.find n.first - features.each_with_index do |f,j| - training_data[i][j] = 1 if neighbor.fp4.include? f + def self.local_pls_regression compound, params + neighbors = params[:neighbors] + return {:value => nil, :confidence => nil} unless neighbors.size > 0 + activities = [] + fingerprints = {} + weights = [] + fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort + + neighbors.each_with_index do |row,i| + neighbor = Compound.find row["_id"] + fingerprint = neighbor.fingerprint + row["features"][params[:prediction_feature_id].to_s].each do |act| + activities << Math.log10(act) + weights << row["tanimoto"] + fingerprint_ids.each_with_index do |id,j| + fingerprints[id] ||= [] + fingerprints[id] << fingerprint.include?(id) + end + end + end + + name = Feature.find(params[:prediction_feature_id]).name + R.assign "activities", activities + R.assign "weights", weights + variables = [] + data_frame = ["c(#{activities.join ","})"] + fingerprints.each do |k,v| + unless v.uniq.size == 1 + data_frame << "factor(c(#{v.collect{|m| m ? "T" : "F"}.join ","}))" + variables << "'#{k}'" end end - p training_data - - R.assign "activities", neighbors.collect{|n| n[2].median} - R.assign "features", training_data - R.eval "model <- lm(activities ~ features)" - R.eval "summary <- summary(model)" - p R.summary - compound_features = features.collect{|f| compound.fp4.include? f ? 1 : 0} - R.assign "compound_features", compound_features - R.eval "prediction <- predict(model,compound_features)" - p R.prediction + begin + R.eval "data <- data.frame(#{data_frame.join ","})" + R.eval "names(data) <- c('activities',#{variables.join ','})" + R.eval "model <- plsr(activities ~ .,data = data, ncomp = 3, weights = weights)" + compound_features = fingerprint_ids.collect{|f| compound.fingerprint.include? f } + R.eval "fingerprint <- rbind(c(#{compound_features.collect{|f| f ? "T" : "F"}.join ','}))" + R.eval "names(fingerprint) <- c(#{variables.join ','})" + R.eval "prediction <- predict(model,fingerprint)" + prediction = 10**R.eval("prediction").to_f + {:value => prediction, :confidence => 1} # TODO confidence + rescue + {:value => nil, :confidence => nil} # TODO confidence + end end diff --git a/test/lazar-regression.rb b/test/lazar-regression.rb index c1dc9b9..9ade6d5 100644 --- a/test/lazar-regression.rb +++ b/test/lazar-regression.rb @@ -21,14 +21,15 @@ class LazarRegressionTest < MiniTest::Test assert_equal 3, prediction[:neighbors].size end - def test_local_linear_regression - skip + def test_local_pls_regression training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv" model = Model::LazarRegression.create training_dataset - model.update(:prediction_algorithm => "OpenTox::Algorithm::Regression.local_linear_regression") compound = Compound.from_smiles "NC(=O)OCCC" prediction = model.predict compound p prediction + model.update(:prediction_algorithm => "OpenTox::Algorithm::Regression.local_pls_regression") + prediction = model.predict compound + p prediction #assert_equal 13.6, prediction[:value].round(1) #assert_equal 0.83, prediction[:confidence].round(2) #assert_equal 1, prediction[:neighbors].size diff --git a/test/validation.rb b/test/validation.rb index 95f9bc0..066ec95 100644 --- a/test/validation.rb +++ b/test/validation.rb @@ -30,7 +30,7 @@ class ValidationTest < MiniTest::Test model = Model::LazarRegression.create dataset cv = RegressionCrossValidation.create model #cv = RegressionCrossValidation.find '561503262b72ed54fd000001' - #p cv.id + p cv #File.open("tmp.svg","w+"){|f| f.puts cv.correlation_plot} #`inkview tmp.svg` #File.open("tmp.svg","w+"){|f| f.puts cv.confidence_plot} @@ -71,6 +71,27 @@ class ValidationTest < MiniTest::Test assert cv.mae < 1 end + def test_pls_regression_crossvalidation + dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv" + #dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.csv" + params = { + :prediction_algorithm => "OpenTox::Algorithm::Regression.local_pls_regression", + } + model = Model::LazarRegression.create dataset, params + cv = RegressionCrossValidation.create model + #p cv + cv.validation_ids.each do |vid| + model = Model::Lazar.find(Validation.find(vid).model_id) + p model + #assert_equal params[:neighbor_algorithm_parameters][:type], model[:neighbor_algorithm_parameters][:type] + #assert_equal params[:neighbor_algorithm_parameters][:min_sim], model[:neighbor_algorithm_parameters][:min_sim] + #refute_equal params[:neighbor_algorithm_parameters][:training_dataset_id], model[:neighbor_algorithm_parameters][:training_dataset_id] + end + + assert cv.rmse < 1.5, "RMSE > 1.5" + assert cv.mae < 1 + end + def test_repeated_crossvalidation dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" model = Model::LazarClassification.create dataset -- cgit v1.2.3