From 24b1524f20eccd3bfd59171f1f7151fcc272a427 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 14 Mar 2016 10:06:22 +0100 Subject: folds split on unique compounds instead of data entries --- lib/dataset.rb | 43 ------------------------------------------- lib/lazar.rb | 14 ++++++++++---- lib/model.rb | 15 ++++++--------- lib/overwrite.rb | 8 ++++++++ lib/regression.rb | 38 +++++++++++++++++++++++--------------- test/regression.rb | 4 ++-- 6 files changed, 49 insertions(+), 73 deletions(-) diff --git a/lib/dataset.rb b/lib/dataset.rb index 59a68e5..b9c2187 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -49,46 +49,6 @@ module OpenTox # Dataset operations - # Split a dataset into n folds - # @param [Integer] number of folds - # @return [Array] Array with folds [training_dataset,test_dataset] -=begin - def folds n - # TODO fix splits for duplicates - unique_compound_ids = compound_ids.uniq - len = unique_compond_ids.size - indices = (0..len-1).to_a.shuffle - mid = (len/n) - chunks = [] - start = 0 - 1.upto(n) do |i| - last = start+mid - last = last-1 unless len%n >= i - test_idxs = indices[start..last] || [] - test_cids = test_idxs.collect{|i| unique_compond_ids[i]} - test_data_entries = test_idxs.collect{|i| self.data_entries[i]} - test_dataset = self.class.new(:compound_ids => test_cids, :feature_ids => self.feature_ids, :data_entries => test_data_entries) - test_dataset.compounds.each do |compound| - compound.dataset_ids << test_dataset.id - compound.save - end - training_idxs = indices-test_idxs - training_cids = training_idxs.collect{|i| unique_compond_ids[i]} - training_data_entries = training_idxs.collect{|i| self.data_entries[i]} - training_dataset = self.class.new(:compound_ids => training_cids, :feature_ids => self.feature_ids, :data_entries => training_data_entries) - training_dataset.compounds.each do |compound| - compound.dataset_ids << training_dataset.id - compound.save - end - test_dataset.save - training_dataset.save - chunks << [training_dataset,test_dataset] - start = last+1 - end - chunks - end -=end - # Split a dataset into n folds # @param [Integer] number of folds # @return [Array] Array with folds [training_dataset,test_dataset] @@ -121,18 +81,15 @@ module OpenTox end end dataset = self.class.new(:compound_ids => cids, :feature_ids => self.feature_ids, :data_entries => data_entries, :source => self.id ) -=begin dataset.compounds.each do |compound| compound.dataset_ids << dataset.id compound.save end -=end dataset end start = last+1 chunks << chunk end - puts chunks.inspect chunks end diff --git a/lib/lazar.rb b/lib/lazar.rb index c43dae7..bcae96f 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -42,10 +42,16 @@ end # R setup R = Rserve::Connection.new -R.eval "library(ggplot2)" -R.eval "library(grid)" -R.eval "library(gridExtra)" -R.eval "library(pls)" +R.eval " +suppressPackageStartupMessages({ + library(ggplot2) + library(grid) + library(gridExtra) + library(caret) + library(doMC) + registerDoMC(4) +}) +" # Require sub-Repositories require_relative '../libfminer/libbbrc/bbrc' # include before openbabel diff --git a/lib/model.rb b/lib/model.rb index a53be92..8cffdfd 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -34,7 +34,6 @@ module OpenTox def initialize training_dataset, params={} super params - #bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 # TODO document convention prediction_feature = training_dataset.features.first @@ -82,16 +81,16 @@ module OpenTox prediction = {} if neighbors.collect{|n| n["_id"]}.include? compound.id - database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s] + database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq prediction[:database_activities] = database_activities - prediction[:warning] = "#{database_activities.size} structures have been removed from neighbors, because they have the same structure as the query compound." + prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." neighbors.delete_if{|n| n["_id"] == compound.id} end neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] } if neighbors.empty? prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset."}) else - prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset.id,:prediction_feature_id => prediction_feature.id})) + prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset_id,:prediction_feature_id => prediction_feature.id})) end predictions << prediction end @@ -114,14 +113,13 @@ module OpenTox :prediction_feature_id => prediction_feature.id ) - confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Prediction confidence" ) + confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Model RMSE" ) # TODO move into warnings field warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings") prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ] prediction_dataset.compounds = compounds - #prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:database_activities] ? "measured" : p[:confidence] , p[:warning]]} # TODO fix dataset measurements - prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence] , p[:dataset_activities].to_s, p[:warning]]} + prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:rmse] , p[:dataset_activities].to_s, p[:warning]]} prediction_dataset.save return prediction_dataset end @@ -159,14 +157,13 @@ module OpenTox def self.create training_dataset, params={} model = self.new training_dataset, params model.neighbor_algorithm ||= "fingerprint_neighbors" - model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_pls_regression" + model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression" model.neighbor_algorithm_parameters ||= {} { :type => "MP2D", :training_dataset_id => training_dataset.id, :min_sim => 0.1 #:type => "FP4", - #:training_dataset_id => training_dataset.id, #:min_sim => 0.7 }.each do |key,value| model.neighbor_algorithm_parameters[key] ||= value diff --git a/lib/overwrite.rb b/lib/overwrite.rb index c92ad2b..2287a92 100644 --- a/lib/overwrite.rb +++ b/lib/overwrite.rb @@ -22,6 +22,14 @@ class Numeric end end +class Float + # round to significant digits + # http://stackoverflow.com/questions/8382619/how-to-round-a-float-to-a-specified-number-of-significant-digits-in-ruby + def signif(signs) + Float("%.#{signs}g" % self) + end +end + module Enumerable # @return [Array] only the duplicates of an enumerable def duplicates diff --git a/lib/regression.rb b/lib/regression.rb index c988542..2bf8915 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -4,7 +4,7 @@ module OpenTox # TODO add LOO errors class Regression - def self.weighted_average compound, params + def self.local_weighted_average compound, params weighted_sum = 0.0 sim_sum = 0.0 confidence = 0.0 @@ -23,7 +23,8 @@ module OpenTox end # TODO explicit neighbors, also for physchem - def self.local_fingerprint_regression compound, params, algorithm="plsr", algorithm_params="ncomp = 4" + #def self.local_fingerprint_regression compound, params, method="pls", method_params="ncomp = 4" + def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05" neighbors = params[:neighbors] return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 activities = [] @@ -54,25 +55,27 @@ module OpenTox end if variables.empty? - result = weighted_average(compound, params) + result = local_weighted_average(compound, params) result[:warning] = "No variables for regression model. Using weighted average of similar compounds." return result else compound_features = variables.collect{|f| compound.fingerprint.include?(f) ? "T" : "F"} - prediction = r_model_prediction algorithm, algorithm_params, data_frame, variables, weights, compound_features + prediction = r_model_prediction method, data_frame, variables, weights, compound_features if prediction.nil? - prediction = weighted_average(compound, params) + prediction = local_weighted_average(compound, params) prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." return prediction else - return {:value => 10**prediction, :confidence => 1} # TODO confidence + prediction[:value] = 10**prediction[:value] + prediction[:rmse] = 10**prediction[:rmse] + prediction end end end - def self.local_physchem_regression compound, params, algorithm="plsr", algorithm_params="ncomp = 4" + def self.local_physchem_regression compound, params, method="plsr"#, method_params="ncomp = 4" neighbors = params[:neighbors] return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 @@ -100,39 +103,44 @@ module OpenTox end if physchem.empty? - result = weighted_average(compound, params) + result = local_weighted_average(compound, params) result[:warning] = "No variables for regression model. Using weighted average of similar compounds." return result else data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] } - prediction = r_model_prediction algorithm, algorithm_params, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]} + prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]} if prediction.nil? - prediction = weighted_average(compound, params) + prediction = local_weighted_average(compound, params) prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." return prediction else - return {:value => 10**prediction, :confidence => 1} # TODO confidence + prediction[:value] = 10**prediction[:value] + prediction end end end - def self.r_model_prediction algorithm, params, training_data, training_features, training_weights, query_feature_values + def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values R.assign "weights", training_weights r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" R.eval "data <- #{r_data_frame}" R.assign "features", training_features R.eval "names(data) <- append(c('activities'),features)" # begin - R.eval "model <- #{algorithm}(activities ~ .,data = data, weights = weights, #{params})" + R.eval "model <- train(activities ~ ., data = data, method = '#{method}')"#, #{params}" rescue return nil end - R.eval "fingerprint <- rbind(c(#{query_feature_values.join ','}))" + R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" R.eval "names(fingerprint) <- features" R.eval "prediction <- predict(model,fingerprint)" - R.eval("prediction").to_f + { + :value => R.eval("prediction").to_f, + :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f, + :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f, + } end end diff --git a/test/regression.rb b/test/regression.rb index fa3b7fb..c25ed2b 100644 --- a/test/regression.rb +++ b/test/regression.rb @@ -26,7 +26,7 @@ class LazarRegressionTest < MiniTest::Test model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_fingerprint_regression") compound = Compound.from_smiles "NC(=O)OCCC" prediction = model.predict compound - p prediction[:value] + p prediction refute_nil prediction[:value] end @@ -35,7 +35,7 @@ class LazarRegressionTest < MiniTest::Test model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression") compound = Compound.from_smiles "NC(=O)OCCC" prediction = model.predict compound - p prediction[:value] + p prediction refute_nil prediction[:value] end -- cgit v1.2.3