From 85f2308c101b4778508c2d767e08af4cfd671b7b Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 2 Jun 2016 12:22:39 +0200 Subject: local pls regression for nanoparticles --- lib/nanoparticle.rb | 13 ++++++++++--- lib/regression.rb | 34 +++++++++++++++++--------------- lib/validation-statistics.rb | 6 +++++- test/nanoparticles.rb | 46 ++++++++++++-------------------------------- test/setup.rb | 4 ++-- 5 files changed, 47 insertions(+), 56 deletions(-) diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index ca79a3d..65aab23 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -6,9 +6,10 @@ module OpenTox field :core, type: Hash, default: {} field :coating, type: Array, default: [] field :proteomics, type: Hash, default: {} + + attr_accessor :scaled_values def physchem_neighbors min_sim: 0.9, dataset_id:, prediction_feature_id: - p self.name dataset = Dataset.find(dataset_id) relevant_features = {} measurements = [] @@ -52,7 +53,9 @@ module OpenTox common_descriptors = relevant_features.keys & substance.physchem_descriptors.keys # scale values query_descriptors = common_descriptors.collect{|d| (physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} + @scaled_values = common_descriptors.collect{|d| [d,(physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h neighbor_descriptors = common_descriptors.collect{|d| (substance.physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} + neighbor_scaled_values = common_descriptors.collect{|d| [d,(substance.physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h #weights = common_descriptors.collect{|d| 1-relevant_features[d]["p_value"]} weights = common_descriptors.collect{|d| relevant_features[d]["r"]**2} sim = Algorithm::Similarity.weighted_cosine(query_descriptors,neighbor_descriptors,weights) @@ -61,12 +64,16 @@ module OpenTox "measurements" => values, "similarity" => sim, "common_descriptors" => common_descriptors.collect do |id| - {:id => id, :p_value => relevant_features[id]["p_value"], :r_squared => relevant_features[id]["r"]**2} + { + :id => id, + :scaled_value => neighbor_scaled_values[id], + :p_value => relevant_features[id]["p_value"], + :r_squared => relevant_features[id]["r"]**2} end } if sim >= min_sim end end - p neighbors.size + $logger.debug "#{self.name}: #{neighbors.size} neighbors" neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} neighbors end diff --git a/lib/regression.rb b/lib/regression.rb index cffcbbf..5028c78 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -73,23 +73,19 @@ module OpenTox activities = [] weights = [] - pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq + pc_ids = neighbors.collect{|n| n["common_descriptors"].collect{|d| d[:id]}}.flatten.uniq.sort data_frame = [] data_frame[0] = [] neighbors.each_with_index do |n,i| - neighbor = Substance.find(n["_id"]) - activities = neighbor["measurements"] + activities = n["measurements"] activities.each do |act| data_frame[0][i] = act weights << n["similarity"] - neighbor.physchem_descriptors.each do |pid,values| - values = [values] unless values.is_a? Array - values.uniq! - warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1 - j = pc_ids.index(pid)+1 + n["common_descriptors"].each do |d| + j = pc_ids.index(d[:id])+1 data_frame[j] ||= [] - data_frame[j][i] = values.for_R + data_frame[j][i] = d[:scaled_value] end end if activities (0..pc_ids.size+1).each do |j| # for R: fill empty values with NA @@ -97,10 +93,12 @@ module OpenTox data_frame[j][i] ||= "NA" end end + remove_idx = [] data_frame.each_with_index do |r,i| remove_idx << i if r.uniq.size == 1 # remove properties with a single value end + remove_idx.reverse.each do |i| data_frame.delete_at i pc_ids.delete_at i @@ -112,7 +110,7 @@ module OpenTox prediction else query_descriptors = pc_ids.collect do |i| - substance.physchem_descriptors[i] ? substance.physchem_descriptors[i].for_R : "NA" + substance.scaled_values[i] ? substance.scaled_values[i] : "NA" end remove_idx = [] query_descriptors.each_with_index do |v,i| @@ -127,10 +125,9 @@ module OpenTox if prediction.nil? prediction = local_weighted_average substance, neighbors prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." - prediction - else - prediction end + p prediction + prediction end end @@ -172,10 +169,15 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" R.eval "names(fingerprint) <- features" R.eval "prediction <- predict(model,fingerprint)" + value = R.eval("prediction").to_f + rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f + r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f + prediction_interval = value-1.96*rmse, value+1.96*rmse { - :value => R.eval("prediction").to_f, - :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f, - :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f, + :value => value, + :rmse => rmse, + :r_squared => r_squared, + :prediction_interval => prediction_interval } rescue return nil diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index e42d298..6b252b1 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -100,6 +100,8 @@ module OpenTox # TODO: predictions within prediction_interval self.rmse = 0 self.mae = 0 + #self.within_prediction_interval = 0 + #self.outside_prediction_interval = 0 x = [] y = [] predictions.each do |cid,pred| @@ -109,6 +111,9 @@ module OpenTox error = pred[:value]-pred[:measurements].median self.rmse += error**2 self.mae += error.abs + #if pred[:prediction_interval] + #if pred[:measurements] + #end else warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." @@ -118,7 +123,6 @@ module OpenTox R.assign "prediction", y R.eval "r <- cor(measurement,prediction,use='pairwise')" self.r_squared = R.eval("r").to_ruby**2 - self.mae = self.mae/predictions.size self.rmse = Math.sqrt(self.rmse/predictions.size) $logger.debug "R^2 #{r_squared}" diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb index a2c77b5..b6a2f00 100644 --- a/test/nanoparticles.rb +++ b/test/nanoparticles.rb @@ -5,6 +5,7 @@ class NanoparticleTest < MiniTest::Test include OpenTox::Validation def setup + # TODO: multiple runs create duplicates #Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm") end @@ -35,39 +36,13 @@ class NanoparticleTest < MiniTest::Test # TODO move to validation-statistics def test_inspect_cv - skip cv = CrossValidation.all.sort_by{|cv| cv.created_at}.last cv.correlation_plot_id = nil File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot} - #p cv -=begin - #File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot} - cv.predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}[0,5].each do |sid,p| - s = Substance.find(sid) - puts - p s.name - p([p["value"],p["measurements"],(p["value"]-p["measured"].median).abs]) - neighbors = s.physchem_neighbors dataset_id: cv.model.training_dataset_id, prediction_feature_id: cv.model.prediction_feature_id, type: nil - neighbors.each do |n| - neighbor = Substance.find(n["_id"]) - p "==" - p neighbor.name, n["similarity"], n["measurements"] - p neighbor.core["name"] - p neighbor.coating.collect{|c| c["name"]} - n["common_descriptors"].each do |id| - f = Feature.find(id) - print "#{f.name} #{f.conditions["MEDIUM"]}" - print ", " - end - puts - end - - end -=end + p cv.statistics end def test_inspect_worst_prediction - skip -# TODO check/fix single/double neighbor prediction + cv = CrossValidation.all.sort_by{|cv| cv.created_at}.last worst_predictions = cv.worst_predictions(n: 3,show_neigbors: false) assert_equal 3, worst_predictions.size @@ -100,15 +75,18 @@ class NanoparticleTest < MiniTest::Test refute_nil cv.r_squared refute_nil cv.rmse end - def test_validate_pls_model - skip training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") - feature = Feature.find_or_create_by(name: "Net cell association", category: "TOX", unit: "mL/ug(Mg)") - model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "physchem_neighbors"}) - cv = Validation::RegressionCrossValidation.create model + #feature = Feature.find_or_create_by(name: "Net cell association", category: "TOX", unit: "mL/ug(Mg)") + feature = Feature.find_or_create_by(name: "Log2 transformed", category: "TOX") + + model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "physchem_neighbors", :neighbor_algorithm_parameters => {:min_sim => 0.5}}) + cv = RegressionCrossValidation.create model p cv - File.open("tmp.png","w+"){|f| f.puts cv.correlation_plot} + #p cv.predictions.sort_by{|sid,p| (p["value"] - p["measurements"].median).abs} + p cv.rmse + p cv.r_squared + File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot} refute_nil cv.r_squared refute_nil cv.rmse end diff --git a/test/setup.rb b/test/setup.rb index e7c32b4..6c97282 100644 --- a/test/setup.rb +++ b/test/setup.rb @@ -5,5 +5,5 @@ require_relative '../lib/lazar.rb' include OpenTox TEST_DIR ||= File.expand_path(File.dirname(__FILE__)) DATA_DIR ||= File.join(TEST_DIR,"data") -$mongo.database.drop -$gridfs = $mongo.database.fs +#$mongo.database.drop +#$gridfs = $mongo.database.fs -- cgit v1.2.3