From b90720cc26d789a96fa6f7a054fe06fc8b4ef33d Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sat, 27 Feb 2016 16:47:48 +0100 Subject: local pls regression as default regression algorithm --- lib/compound.rb | 1 + lib/crossvalidation.rb | 16 ++++++------ lib/lazar.rb | 2 +- lib/model.rb | 4 +-- lib/regression.rb | 64 ++++++++++++++++++++++++++++++++++++++++++++++++ lib/validation.rb | 19 ++++++-------- test/descriptor.rb | 1 + test/lazar-regression.rb | 15 +++++++++++- 8 files changed, 100 insertions(+), 22 deletions(-) diff --git a/lib/compound.rb b/lib/compound.rb index 8f37247..d5d6aa9 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -23,6 +23,7 @@ module OpenTox field :sdf_id, type: BSON::ObjectId field :molecular_weight, type: Float field :fingerprints, type: Hash, default: {} + field :physchem, type: Hash, default: {} field :default_fingerprint_size, type: Integer field :dataset_ids, type: Array, default: [] field :features, type: Hash, default: {} diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 0c5f0be..362842e 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -55,7 +55,7 @@ module OpenTox predictions: predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence ) $logger.debug "Nr unpredicted: #{nr_unpredicted}" - cv.statistics + #cv.statistics cv end end @@ -179,12 +179,14 @@ module OpenTox predictions.each do |pred| compound_id,activity,prediction,confidence = pred if activity and prediction - error = Math.log10(prediction)-Math.log10(activity) - rmse += error**2 - weighted_rmse += confidence*error**2 - mae += error.abs - weighted_mae += confidence*error.abs - confidence_sum += confidence + activity.each do |act| + error = Math.log10(prediction)-Math.log10(act) + rmse += error**2 + weighted_rmse += confidence*error**2 + mae += error.abs + weighted_mae += confidence*error.abs + confidence_sum += confidence + end else warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." diff --git a/lib/lazar.rb b/lib/lazar.rb index ae42d42..e5c1609 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -45,7 +45,7 @@ R = Rserve::Connection.new R.eval "library(ggplot2)" R.eval "library(grid)" R.eval "library(gridExtra)" -R.eval "library('pls')" +R.eval "library(pls)" # Require sub-Repositories require_relative '../libfminer/libbbrc/bbrc' # include before openbabel diff --git a/lib/model.rb b/lib/model.rb index 0d2354f..41b3217 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -34,7 +34,7 @@ module OpenTox def initialize training_dataset, params={} super params - bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 + #bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 # TODO document convention prediction_feature = training_dataset.features.first @@ -159,7 +159,7 @@ module OpenTox def self.create training_dataset, params={} model = self.new training_dataset, params model.neighbor_algorithm ||= "fingerprint_neighbors" - model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.weighted_average" + model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_pls_regression" model.neighbor_algorithm_parameters ||= {} { :type => "MP2D", diff --git a/lib/regression.rb b/lib/regression.rb index 2b41851..10a1861 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -93,6 +93,70 @@ module OpenTox end + def self.local_physchem_regression compound, params + neighbors = params[:neighbors] + return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 + activities = [] + fingerprints = {} + weights = [] + fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort + + neighbors.each_with_index do |row,i| + neighbor = Compound.find row["_id"] + fingerprint = neighbor.fingerprint + row["features"][params[:prediction_feature_id].to_s].each do |act| + activities << Math.log10(act) + weights << row["tanimoto"] + fingerprint_ids.each_with_index do |id,j| + fingerprints[id] ||= [] + fingerprints[id] << fingerprint.include?(id) + end + end + end + + name = Feature.find(params[:prediction_feature_id]).name + R.assign "activities", activities + R.assign "weights", weights + variables = [] + data_frame = ["c(#{activities.join ","})"] + fingerprints.each do |k,v| + unless v.uniq.size == 1 + data_frame << "factor(c(#{v.collect{|m| m ? "T" : "F"}.join ","}))" + variables << k + end + end + if variables.empty? + result = weighted_average(compound, params) + result[:warning] = "No variables for regression model. Using weighted average of similar compounds." + return result + return {:value => nil, :confidence => nil} # TODO confidence + else + R.eval "data <- data.frame(#{data_frame.join ","})" + R.assign "features", variables + R.eval "names(data) <- append(c('activities'),features)" # + begin + R.eval "model <- plsr(activities ~ .,data = data, ncomp = 4, weights = weights)" + rescue # fall back to weighted average + result = weighted_average(compound, params) + result[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." + return result + end + #begin + #compound_features = fingerprint_ids.collect{|f| compound.fingerprint.include? f } # FIX + compound_features = variables.collect{|f| compound.fingerprint.include? f } + R.eval "fingerprint <- rbind(c(#{compound_features.collect{|f| f ? "T" : "F"}.join ','}))" + R.eval "names(fingerprint) <- features" # + R.eval "prediction <- predict(model,fingerprint)" + prediction = 10**R.eval("prediction").to_f + return {:value => prediction, :confidence => 1} # TODO confidence + #rescue + #p "Prediction failed" + #return {:value => nil, :confidence => nil} # TODO confidence + #end + end + + end + def self.weighted_average_with_relevant_fingerprints neighbors weighted_sum = 0.0 sim_sum = 0.0 diff --git a/lib/validation.rb b/lib/validation.rb index 651860e..9c19cde 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -62,6 +62,13 @@ module OpenTox validation end + end + + class ClassificationValidation < Validation + end + + class RegressionValidation < Validation + def statistics rmse = 0 weighted_rmse = 0 @@ -105,18 +112,8 @@ module OpenTox finished_at: Time.now ) =end - puts "R^2 #{r**2}" - puts "RMSE #{rmse}" - puts "MAE #{mae}" - return { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae } + { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae } end - - end - - class ClassificationValidation < Validation - end - - class RegressionValidation < Validation end end diff --git a/test/descriptor.rb b/test/descriptor.rb index 58149a7..28be79e 100644 --- a/test/descriptor.rb +++ b/test/descriptor.rb @@ -62,6 +62,7 @@ class DescriptorTest < MiniTest::Test assert_equal 330, result.size assert_equal 30.8723, result[2] assert_equal 5, result[328] + p result end def test_compound_descriptor_parameters diff --git a/test/lazar-regression.rb b/test/lazar-regression.rb index 9ade6d5..932b91c 100644 --- a/test/lazar-regression.rb +++ b/test/lazar-regression.rb @@ -4,7 +4,7 @@ class LazarRegressionTest < MiniTest::Test def test_weighted_average training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv" - model = Model::LazarRegression.create training_dataset, {:neighbor_algorithm_parameters => {:min_sim => 0}} + model = Model::LazarRegression.create training_dataset, {:neighbor_algorithm_parameters => {:min_sim => 0}, :prediction_algorithm => "OpenTox::Algorithm::Regression.weighted_average"} compound = Compound.from_smiles "CC(C)(C)CN" prediction = model.predict compound assert_equal 7.2, prediction[:value].round(1) @@ -35,4 +35,17 @@ class LazarRegressionTest < MiniTest::Test #assert_equal 1, prediction[:neighbors].size end + def test_local_physchem_regression + training_dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv" + model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression") + compound = Compound.from_smiles "NC(=O)OCCC" + prediction = model.predict compound + model.update(:prediction_algorithm => "OpenTox::Algorithm::Regression.local_pls_regression") + prediction = model.predict compound + p prediction + #assert_equal 13.6, prediction[:value].round(1) + #assert_equal 0.83, prediction[:confidence].round(2) + #assert_equal 1, prediction[:neighbors].size + end + end -- cgit v1.2.3