From eec5bddbd35c9ecee8021128508d8718bccb4fe3 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 2 Jun 2016 17:54:48 +0200 Subject: local pls regression for nanoparticle proteomics --- lib/import.rb | 15 ++------------- lib/nanoparticle.rb | 12 +++++++++--- lib/regression.rb | 41 +++++++++++++++++++++++++---------------- 3 files changed, 36 insertions(+), 32 deletions(-) (limited to 'lib') diff --git a/lib/import.rb b/lib/import.rb index 80d4579..4c49e5e 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -68,17 +68,10 @@ module OpenTox effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature effect["conditions"].delete_if { |k, v| v.nil? } if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data -=begin - JSON.parse(effect["result"]["textValue"]).each do |identifier, value| - # time critical step - t = Time.now - proteomics_features[identifier] ||= klass.find_or_create_by(:name => identifier, :category => "Proteomics") - t1 += Time.now - t - t = Time.now + JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step + proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics") nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset - t2 += Time.now - t end -=end else feature = klass.find_or_create_by( :name => effect["endpoint"], @@ -90,10 +83,6 @@ module OpenTox end end nanoparticle.save - #p "Total time: #{Time.now - start_time}" - #p "Proteomics features: #{t1}" - #p "Proteomics values: #{t2}" - #p "Time2: #{t2}" end datasets.each { |u,d| d.save } end diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 65aab23..3e29ae1 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -10,6 +10,7 @@ module OpenTox attr_accessor :scaled_values def physchem_neighbors min_sim: 0.9, dataset_id:, prediction_feature_id: + p name dataset = Dataset.find(dataset_id) relevant_features = {} measurements = [] @@ -46,6 +47,7 @@ module OpenTox end end end + #p relevant_features.keys.collect{|i| Feature.find(i).name} neighbors = [] substances.each do |substance| values = dataset.values(substance,prediction_feature_id) @@ -86,9 +88,12 @@ module OpenTox physchem_descriptors[feature.id.to_s] << value physchem_descriptors[feature.id.to_s].uniq! when "Proteomics" - proteomics[feature.id.to_s] ||= [] - proteomics[feature.id.to_s] << value - proteomics[feature.id.to_s].uniq! + #proteomics[feature.id.to_s] ||= [] + #proteomics[feature.id.to_s] << value + #proteomics[feature.id.to_s].uniq! + physchem_descriptors[feature.id.to_s] ||= [] + physchem_descriptors[feature.id.to_s] << value + physchem_descriptors[feature.id.to_s].uniq! when "TOX" if feature.name == "Total protein (BCA assay)" physchem_descriptors[feature.id.to_s] ||= [] @@ -109,6 +114,7 @@ module OpenTox def parse_ambit_value feature, v, dataset #p dataset #p feature + # TODO add study id to warnings v.delete "unit" # TODO: ppm instead of weights if v.keys == ["textValue"] diff --git a/lib/regression.rb b/lib/regression.rb index 5028c78..b9067c6 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -88,35 +88,42 @@ module OpenTox data_frame[j][i] = d[:scaled_value] end end if activities - (0..pc_ids.size+1).each do |j| # for R: fill empty values with NA + #(0..pc_ids.size+1).each do |j| # for R: fill empty values with NA + (0..pc_ids.size).each do |j| # for R: fill empty values with NA data_frame[j] ||= [] data_frame[j][i] ||= "NA" end end - remove_idx = [] - data_frame.each_with_index do |r,i| - remove_idx << i if r.uniq.size == 1 # remove properties with a single value - end + #remove_idx = [] + #data_frame.each_with_index do |r,i| + #remove_idx << i if r.uniq.size == 1 # remove properties with a single value TODO: don't break R names assignment + #end - remove_idx.reverse.each do |i| - data_frame.delete_at i - pc_ids.delete_at i - end + #p data_frame.size + #p pc_ids.size + #data_frame.delete_if.with_index { |_, index| remove_idx.include? index } + #pc_ids.delete_if.with_index { |_, index| remove_idx.include? index-1 } + #remove_idx.sort.reverse.each do |i| + #p i + #data_frame.delete_at i + #pc_ids.delete_at i + #end + #p data_frame.size + #p pc_ids.size if pc_ids.empty? prediction = local_weighted_average substance, neighbors prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." prediction else - query_descriptors = pc_ids.collect do |i| - substance.scaled_values[i] ? substance.scaled_values[i] : "NA" - end + query_descriptors = pc_ids.collect { |i| substance.scaled_values[i] } remove_idx = [] query_descriptors.each_with_index do |v,i| - remove_idx << i if v == "NA" + #remove_idx << i if v == "NA" + remove_idx << i unless v end - remove_idx.reverse.each do |i| + remove_idx.sort.reverse.each do |i| data_frame.delete_at i pc_ids.delete_at i query_descriptors.delete_at i @@ -135,8 +142,9 @@ module OpenTox def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values R.assign "weights", training_weights r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" -rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) =begin +=end +rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) File.open("tmp.R","w+"){|f| f.puts "suppressPackageStartupMessages({ library(iterators,lib=\"#{rlib}\") @@ -159,10 +167,11 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) f.puts "names(fingerprint) <- features" f.puts "prediction <- predict(model,fingerprint)" } -=end R.eval "data <- #{r_data_frame}" R.assign "features", training_features + p training_features.size + p R.eval("names(data)").to_ruby.size begin R.eval "names(data) <- append(c('activities'),features)" # R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)" -- cgit v1.2.3