From eec5bddbd35c9ecee8021128508d8718bccb4fe3 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 2 Jun 2016 17:54:48 +0200
Subject: local PLS regression for nanoparticle proteomics

---
 lib/import.rb       | 15 ++-------------
 lib/nanoparticle.rb | 12 +++++++++---
 lib/regression.rb   | 41 +++++++++++++++++++++++++----------------
 3 files changed, 36 insertions(+), 32 deletions(-)

(limited to 'lib')

diff --git a/lib/import.rb b/lib/import.rb
index 80d4579..4c49e5e 100644
--- a/lib/import.rb
+++ b/lib/import.rb
@@ -68,17 +68,10 @@ module OpenTox
             effect["result"]["textValue"] ?  klass = NominalFeature : klass = NumericFeature
             effect["conditions"].delete_if { |k, v| v.nil? }
             if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data
-=begin
-              JSON.parse(effect["result"]["textValue"]).each do |identifier, value|
-                # time critical step
-              t = Time.now
-                proteomics_features[identifier] ||= klass.find_or_create_by(:name => identifier, :category => "Proteomics")
-              t1 += Time.now - t
-              t = Time.now
+              JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step
+                proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics")
                 nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset
-              t2 += Time.now - t
               end
-=end
             else
               feature = klass.find_or_create_by(
                 :name => effect["endpoint"],
@@ -90,10 +83,6 @@ module OpenTox
             end
           end
           nanoparticle.save
-          #p "Total time: #{Time.now - start_time}"
-          #p "Proteomics features: #{t1}"
-          #p "Proteomics values: #{t2}"
-          #p "Time2: #{t2}"
         end
         datasets.each { |u,d| d.save }
       end
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
index 65aab23..3e29ae1 100644
--- a/lib/nanoparticle.rb
+++ b/lib/nanoparticle.rb
@@ -10,6 +10,7 @@ module OpenTox
     attr_accessor :scaled_values
  
     def physchem_neighbors min_sim: 0.9, dataset_id:, prediction_feature_id:
+      p name
       dataset = Dataset.find(dataset_id)
       relevant_features = {}
       measurements = []
@@ -46,6 +47,7 @@ module OpenTox
           end
         end
       end
+      #p relevant_features.keys.collect{|i| Feature.find(i).name}
       neighbors = []
       substances.each do |substance|
         values = dataset.values(substance,prediction_feature_id)
@@ -86,9 +88,12 @@ module OpenTox
           physchem_descriptors[feature.id.to_s] << value
           physchem_descriptors[feature.id.to_s].uniq!
         when "Proteomics"
-          proteomics[feature.id.to_s] ||= []
-          proteomics[feature.id.to_s] << value
-          proteomics[feature.id.to_s].uniq!
+          #proteomics[feature.id.to_s] ||= []
+          #proteomics[feature.id.to_s] << value
+          #proteomics[feature.id.to_s].uniq!
+          physchem_descriptors[feature.id.to_s] ||= []
+          physchem_descriptors[feature.id.to_s] << value
+          physchem_descriptors[feature.id.to_s].uniq!
         when "TOX"
           if feature.name == "Total protein (BCA assay)"
             physchem_descriptors[feature.id.to_s] ||= []
@@ -109,6 +114,7 @@ module OpenTox
     def parse_ambit_value feature, v, dataset
       #p dataset
       #p feature
+      # TODO: add study ID to warnings
       v.delete "unit"
       # TODO: ppm instead of weights
       if v.keys == ["textValue"]
diff --git a/lib/regression.rb b/lib/regression.rb
index 5028c78..b9067c6 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -88,35 +88,42 @@ module OpenTox
               data_frame[j][i] = d[:scaled_value]
             end
           end if activities
-          (0..pc_ids.size+1).each do |j| # for R: fill empty values with NA
+          #(0..pc_ids.size+1).each do |j| # for R: fill empty values with NA
+          (0..pc_ids.size).each do |j| # for R: fill empty values with NA
             data_frame[j] ||= []
             data_frame[j][i] ||= "NA"
           end
         end
 
-        remove_idx = []
-        data_frame.each_with_index do |r,i|
-          remove_idx << i if r.uniq.size == 1 # remove properties with a single value
-        end
+        #remove_idx = []
+        #data_frame.each_with_index do |r,i|
+          #remove_idx << i if r.uniq.size == 1 # remove properties with a single value TODO: don't break R names assignment
+        #end
 
-        remove_idx.reverse.each do |i|
-          data_frame.delete_at i
-          pc_ids.delete_at i
-        end
+        #p data_frame.size
+        #p pc_ids.size
+        #data_frame.delete_if.with_index { |_, index| remove_idx.include? index }
+        #pc_ids.delete_if.with_index { |_, index| remove_idx.include? index-1 }
+        #remove_idx.sort.reverse.each do |i|
+          #p i
+          #data_frame.delete_at i
+          #pc_ids.delete_at i
+        #end
+        #p data_frame.size
+        #p pc_ids.size
 
         if pc_ids.empty?
           prediction = local_weighted_average substance, neighbors
           prediction[:warning] = "No variables for regression model. Using weighted average of similar substances."
           prediction
         else
-          query_descriptors = pc_ids.collect do |i|
-            substance.scaled_values[i] ? substance.scaled_values[i] : "NA"
-          end
+          query_descriptors = pc_ids.collect { |i| substance.scaled_values[i] }
           remove_idx = []
           query_descriptors.each_with_index do |v,i|
-            remove_idx << i if v == "NA"
+            #remove_idx << i if v == "NA"
+            remove_idx << i unless v
           end
-          remove_idx.reverse.each do |i|
+          remove_idx.sort.reverse.each do |i|
             data_frame.delete_at i
             pc_ids.delete_at i
             query_descriptors.delete_at i
@@ -135,8 +142,9 @@ module OpenTox
       def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values
         R.assign "weights", training_weights
         r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
-rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
 =begin
+=end
+rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
         File.open("tmp.R","w+"){|f|
           f.puts "suppressPackageStartupMessages({
   library(iterators,lib=\"#{rlib}\")
@@ -159,10 +167,11 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
           f.puts "names(fingerprint) <- features" 
           f.puts "prediction <- predict(model,fingerprint)"
         }
-=end
         
         R.eval "data <- #{r_data_frame}"
         R.assign "features", training_features
+        p training_features.size
+        p R.eval("names(data)").to_ruby.size
         begin
           R.eval "names(data) <- append(c('activities'),features)" #
           R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)"
-- 
cgit v1.2.3