From eec5bddbd35c9ecee8021128508d8718bccb4fe3 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 2 Jun 2016 17:54:48 +0200 Subject: local pls regression for nanoparticle proteomics --- lib/import.rb | 15 ++------------- lib/nanoparticle.rb | 12 +++++++++--- lib/regression.rb | 41 +++++++++++++++++++++++++---------------- test/nanoparticles.rb | 36 ++++++++++++++++-------------------- 4 files changed, 52 insertions(+), 52 deletions(-) diff --git a/lib/import.rb b/lib/import.rb index 80d4579..4c49e5e 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -68,17 +68,10 @@ module OpenTox effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature effect["conditions"].delete_if { |k, v| v.nil? } if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data -=begin - JSON.parse(effect["result"]["textValue"]).each do |identifier, value| - # time critical step - t = Time.now - proteomics_features[identifier] ||= klass.find_or_create_by(:name => identifier, :category => "Proteomics") - t1 += Time.now - t - t = Time.now + JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step + proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics") nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset - t2 += Time.now - t end -=end else feature = klass.find_or_create_by( :name => effect["endpoint"], @@ -90,10 +83,6 @@ module OpenTox end end nanoparticle.save - #p "Total time: #{Time.now - start_time}" - #p "Proteomics features: #{t1}" - #p "Proteomics values: #{t2}" - #p "Time2: #{t2}" end datasets.each { |u,d| d.save } end diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 65aab23..3e29ae1 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -10,6 +10,7 @@ module OpenTox attr_accessor :scaled_values def physchem_neighbors min_sim: 0.9, dataset_id:, prediction_feature_id: + p name dataset = Dataset.find(dataset_id) relevant_features = {} measurements = [] @@ -46,6 +47,7 @@ module OpenTox end end end + #p relevant_features.keys.collect{|i| Feature.find(i).name} neighbors = [] substances.each do |substance| values = dataset.values(substance,prediction_feature_id) @@ -86,9 +88,12 @@ module OpenTox physchem_descriptors[feature.id.to_s] << value physchem_descriptors[feature.id.to_s].uniq! when "Proteomics" - proteomics[feature.id.to_s] ||= [] - proteomics[feature.id.to_s] << value - proteomics[feature.id.to_s].uniq! + #proteomics[feature.id.to_s] ||= [] + #proteomics[feature.id.to_s] << value + #proteomics[feature.id.to_s].uniq! + physchem_descriptors[feature.id.to_s] ||= [] + physchem_descriptors[feature.id.to_s] << value + physchem_descriptors[feature.id.to_s].uniq! when "TOX" if feature.name == "Total protein (BCA assay)" physchem_descriptors[feature.id.to_s] ||= [] @@ -109,6 +114,7 @@ module OpenTox def parse_ambit_value feature, v, dataset #p dataset #p feature + # TODO add study id to warnings v.delete "unit" # TODO: ppm instead of weights if v.keys == ["textValue"] diff --git a/lib/regression.rb b/lib/regression.rb index 5028c78..b9067c6 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -88,35 +88,42 @@ module OpenTox data_frame[j][i] = d[:scaled_value] end end if activities - (0..pc_ids.size+1).each do |j| # for R: fill empty values with NA + #(0..pc_ids.size+1).each do |j| # for R: fill empty values with NA + (0..pc_ids.size).each do |j| # for R: fill empty values with NA data_frame[j] ||= [] data_frame[j][i] ||= "NA" end end - remove_idx = [] - data_frame.each_with_index do |r,i| - remove_idx << i if r.uniq.size == 1 # remove properties with a single value - end + #remove_idx = [] + #data_frame.each_with_index do |r,i| + #remove_idx << i if r.uniq.size == 1 # remove properties with a single value TODO: don't break R names assignment + #end - remove_idx.reverse.each do |i| - data_frame.delete_at i - pc_ids.delete_at i - end + #p data_frame.size + #p pc_ids.size + #data_frame.delete_if.with_index { |_, index| remove_idx.include? index } + #pc_ids.delete_if.with_index { |_, index| remove_idx.include? index-1 } + #remove_idx.sort.reverse.each do |i| + #p i + #data_frame.delete_at i + #pc_ids.delete_at i + #end + #p data_frame.size + #p pc_ids.size if pc_ids.empty? prediction = local_weighted_average substance, neighbors prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." prediction else - query_descriptors = pc_ids.collect do |i| - substance.scaled_values[i] ? substance.scaled_values[i] : "NA" - end + query_descriptors = pc_ids.collect { |i| substance.scaled_values[i] } remove_idx = [] query_descriptors.each_with_index do |v,i| - remove_idx << i if v == "NA" + #remove_idx << i if v == "NA" + remove_idx << i unless v end - remove_idx.reverse.each do |i| + remove_idx.sort.reverse.each do |i| data_frame.delete_at i pc_ids.delete_at i query_descriptors.delete_at i @@ -135,8 +142,9 @@ module OpenTox def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values R.assign "weights", training_weights r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" -rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) =begin +=end +rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) File.open("tmp.R","w+"){|f| f.puts "suppressPackageStartupMessages({ library(iterators,lib=\"#{rlib}\") @@ -159,10 +167,11 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) f.puts "names(fingerprint) <- features" f.puts "prediction <- predict(model,fingerprint)" } -=end R.eval "data <- #{r_data_frame}" R.assign "features", training_features + p training_features.size + p R.eval("names(data)").to_ruby.size begin R.eval "names(data) <- append(c('activities'),features)" # R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)" diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb index b6a2f00..227f7db 100644 --- a/test/nanoparticles.rb +++ b/test/nanoparticles.rb @@ -9,19 +9,6 @@ class NanoparticleTest < MiniTest::Test #Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm") end - def test_create_model_with_feature_selection - skip - training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") - feature = Feature.find_or_create_by(name: "Net cell association", category: "TOX", unit: "mL/ug(Mg)") - model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average", :neighbor_algorithm => "physchem_neighbors", :feature_selection_algorithm => "correlation_filter"}) - nanoparticle = training_dataset.nanoparticles[-34] - #p nanoparticle.neighbors - prediction = model.predict nanoparticle - p prediction - #p prediction - refute_nil prediction[:value] - end - def test_create_model skip training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") @@ -34,12 +21,14 @@ class NanoparticleTest < MiniTest::Test model.delete end - # TODO move to validation-statistics def test_inspect_cv cv = CrossValidation.all.sort_by{|cv| cv.created_at}.last + p cv + p cv.id cv.correlation_plot_id = nil File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot} p cv.statistics + #p cv.model.training_dataset.substances.first.physchem_descriptors.keys.collect{|d| Feature.find(d).name} end def test_inspect_worst_prediction @@ -67,26 +56,33 @@ class NanoparticleTest < MiniTest::Test model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_weighted_average", :neighbor_algorithm => "physchem_neighbors", :neighbor_algorithm_parameters => {:min_sim => 0.5}}) cv = RegressionCrossValidation.create model - p cv - #p cv.predictions.sort_by{|sid,p| (p["value"] - p["measurements"].median).abs} p cv.rmse p cv.r_squared #File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot} refute_nil cv.r_squared refute_nil cv.rmse end + def test_validate_pls_model training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") - #feature = Feature.find_or_create_by(name: "Net cell association", category: "TOX", unit: "mL/ug(Mg)") feature = Feature.find_or_create_by(name: "Log2 transformed", category: "TOX") model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "physchem_neighbors", :neighbor_algorithm_parameters => {:min_sim => 0.5}}) cv = RegressionCrossValidation.create model - p cv - #p cv.predictions.sort_by{|sid,p| (p["value"] - p["measurements"].median).abs} p cv.rmse p cv.r_squared - File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot} + refute_nil cv.r_squared + refute_nil cv.rmse + end + + def test_validate_proteomics_pls_model + training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") + feature = Feature.find_or_create_by(name: "Log2 transformed", category: "TOX") + + model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "proteomics_neighbors", :neighbor_algorithm_parameters => {:min_sim => 0.5}}) + cv = RegressionCrossValidation.create model + p cv.rmse + p cv.r_squared refute_nil cv.r_squared refute_nil cv.rmse end -- cgit v1.2.3