From cfc64a2966ab38698e499f0b44f41208ee77a07f Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Tue, 26 Apr 2016 17:38:15 +0200
Subject: first nanomaterial prediction

---
 data/enm-dump.rb      | 15 ++++----
 lib/import.rb         | 18 +++++++++-
 lib/model.rb          |  1 +
 lib/nanoparticle.rb   |  2 ++
 lib/overwrite.rb      |  9 +++++
 lib/regression.rb     | 99 +++++++++++++++++++++++++++++++++++----------------
 test/nanoparticles.rb | 40 ++++++++++++++++++---
 test/setup.rb         |  4 +--
 test/validation.rb    |  2 ++
 9 files changed, 146 insertions(+), 44 deletions(-)

diff --git a/data/enm-dump.rb b/data/enm-dump.rb
index c1c25e7..88667fc 100644
--- a/data/enm-dump.rb
+++ b/data/enm-dump.rb
@@ -6,11 +6,12 @@ json = JSON.parse File.read('./bundles.json')
 json["dataset"].each do |dataset|
   uri = dataset["URI"]
   id = uri.split("/").last
-  `wget --header='accept:application/json' '#{uri}' -O 'bundle#{id}'`
-  `wget --header='accept:application/json' '#{dataset["summary"]}' -O 'summary#{id}.json'`
-  `wget --header='accept:application/json' '#{dataset["compound"]}' -O 'compound#{id}.json'`
-  `wget --header='accept:application/json' '#{dataset["substance"]}' -O 'substance#{id}.json'`
-  `wget --header='accept:application/json' '#{dataset["property"]}' -O 'property#{id}.json'`
-  `wget --header='accept:application/json' '#{dataset["dataset"]}' -O 'dataset#{id}.json'`
-  `wget --header='accept:application/json' '#{dataset["matrix"]}' -O 'matrix#{id}.json'`
+  #`wget --header='accept:application/json' '#{uri}' -O 'bundle#{id}'`
+  `wget --header='accept:application/ld+json' '#{uri}/substance' -O 'study#{id}.json'`
+  #`wget --header='accept:application/json' '#{dataset["summary"]}' -O 'summary#{id}.json'`
+  #`wget --header='accept:application/json' '#{dataset["compound"]}' -O 'compound#{id}.json'`
+  #`wget --header='accept:application/json' '#{dataset["substance"]}' -O 'substance#{id}.json'`
+  #`wget --header='accept:application/json' '#{dataset["property"]}' -O 'property#{id}.json'`
+  #`wget --header='accept:application/json' '#{dataset["dataset"]}' -O 'dataset#{id}.json'`
+  #`wget --header='accept:application/json' '#{dataset["matrix"]}' -O 'matrix#{id}.json'`
 end
diff --git a/lib/import.rb b/lib/import.rb
index 9091207..3c1edfe 100644
--- a/lib/import.rb
+++ b/lib/import.rb
@@ -30,7 +30,7 @@ module OpenTox
                 $logger.debug File.join(np["compound"]["URI"],"study")
                 effect["conditions"].delete_if { |k, v| v.nil? }
                 feature = klass.find_or_create_by(
-                  :source => File.join(np["compound"]["URI"],"study"),
+                  #:source => File.join(np["compound"]["URI"],"study"),
                   :name => "#{study["protocol"]["category"]["title"]} #{study["protocol"]["endpoint"]}",
                   :unit => effect["result"]["unit"],
                   :category => study["protocol"]["topcategory"],
@@ -48,6 +48,22 @@ module OpenTox
         datasets.collect{|d| d.id}
       end
 
+=begin
+      def self.import_ld # defunct, AMBIT JSON_LD does not have substance entries
+        #get list of bundle URIs
+        bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
+        datasets = []
+        bundles.each do |bundle|
+          uri = bundle["URI"]
+          study = JSON.parse(`curl -H 'Accept:application/ld+json' '#{uri}/substance'`)
+          study["@graph"].each do |i|
+            puts i.to_yaml if i.keys.include? "sio:has-value"
+          end
+        end
+        datasets.collect{|d| d.id}
+      end
+=end
+
       def self.dump
         #get list of bundle URIs
         `wget 'https://data.enanomapper.net/bundle?media=application%2Fjson' -O bundles.json`
diff --git a/lib/model.rb b/lib/model.rb
index b82f098..45054e2 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -50,6 +50,7 @@ module OpenTox
       end
 
       def predict_compound compound
+        #p compound
         neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
         # remove neighbors without prediction_feature
         # check for database activities (neighbors may include query compound)
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
index b934bb3..b5de5b9 100644
--- a/lib/nanoparticle.rb
+++ b/lib/nanoparticle.rb
@@ -16,9 +16,11 @@ module OpenTox
       when "P-CHEM"
         physchem_descriptors[feature.id.to_s] ||= []
         physchem_descriptors[feature.id.to_s] << value
+        physchem_descriptors[feature.id.to_s].uniq!
       when "TOX"
         toxicities[feature.id.to_s] ||= []
         toxicities[feature.id.to_s] << value
+        toxicities[feature.id.to_s].uniq!
       else
         warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted."
       end
diff --git a/lib/overwrite.rb b/lib/overwrite.rb
index cef5758..4a79051 100644
--- a/lib/overwrite.rb
+++ b/lib/overwrite.rb
@@ -114,6 +114,15 @@ class Array
     Math.sqrt(self.sample_variance)
   end
 
+  # Cell value for an R data frame: "NA" for empty or non-numeric arrays, otherwise the median.
+  def for_R
+    if empty? || any? { |v| v.is_a?(String) } # non-numeric values are handed to R as NA
+      "NA"
+    else
+      median
+    end
+  end
+
 end
 
 module URI
diff --git a/lib/regression.rb b/lib/regression.rb
index cb17f25..5610a77 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -75,46 +75,62 @@ module OpenTox
       
       end
 
-      def self.local_physchem_regression  compound, params, method="plsr"#, method_params="ncomp = 4"
+      def self.local_physchem_regression  compound, params, method="pls"#, method_params="ncomp = 4"
+
+        neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s]} # use only neighbors with measured activities
 
-        neighbors = params[:neighbors]
         return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
         return {:value => neighbors.first["toxicities"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
 
         activities = []
         weights = []
-        physchem = {}
+        pc_ids = neighbors.collect{|n| n.physchem_descriptors.keys}.flatten.uniq
+        data_frame = []
+        data_frame[0] = []
         
         neighbors.each_with_index do |n,i|
-          if n["toxicities"][params[:prediction_feature_id].to_s]
-            n["toxicities"][params[:prediction_feature_id].to_s].each do |act|
-              # TODO fix!!!!
-              activities << -Math.log10(act)
-              #if act.numeric?
-              #activities << act
-              n["tanimoto"] ?  weights << n["tanimoto"] : weights << 1.0 # TODO cosine ?
-              neighbor = Substance.find(n["_id"])
-              neighbor.physchem_descriptors.each do |pid,v| # insert physchem only if there is an activity
-                physchem[pid] ||= []
-                physchem[pid] +=  v
-              end
+          neighbor = Substance.find(n["_id"])
+          n["toxicities"][params[:prediction_feature_id].to_s].each do |act|
+            data_frame[0][i] = act
+            n["tanimoto"] ?  weights << n["tanimoto"] : weights << 1.0 # TODO cosine ?
+            neighbor.physchem_descriptors.each do |pid,values| 
+              values.uniq!
+              warn "More than one value for #{Feature.find(pid).name}: #{values.join(', ')}" unless values.size == 1
+              j = pc_ids.index(pid)+1
+              data_frame[j] ||= []
+              data_frame[j][i] = values.for_R
             end
           end
+          (0..pc_ids.size).each do |j| # for R: fill empty values with NA (column 0 = activities, columns 1..pc_ids.size = descriptors)
+            data_frame[j] ||= []
+            data_frame[j][i] ||= "NA"
+          end
         end
-
-        # remove properties with a single value
-        physchem.each do |pid,v|
-          physchem.delete(pid) if v.uniq.size <= 1
+        remove_idx = []
+        data_frame.each_with_index do |r,i|
+          remove_idx << i if i > 0 && r.uniq.size == 1 # remove properties with a single value, never the activities in column 0
+        end
+        remove_idx.reverse.each do |i|
+          data_frame.delete_at i
+          pc_ids.delete_at i-1 # data_frame column i holds the descriptor pc_ids[i-1]
         end
 
-        if physchem.empty?
+        if pc_ids.empty?
           result = local_weighted_average(compound, params)
           result[:warning] = "No variables for regression model. Using weighted average of similar compounds."
           return result
-
         else
-          data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid].collect{|v| "\"#{v.sub('[','').sub(']','')}\"" if v.is_a? String }}
-          prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem_descriptors[pid]}
+          query_descriptors = pc_ids.collect{|i| v = compound.physchem_descriptors[i]; v.nil? ? "NA" : v.for_R} # descriptor missing in query => NA
+          remove_idx = []
+          query_descriptors.each_with_index do |v,i|
+            remove_idx << i if v == "NA"
+          end
+          remove_idx.reverse.each do |i|
+            data_frame.delete_at i+1 # column 0 holds the activities, descriptor i lives in column i+1
+            pc_ids.delete_at i
+            query_descriptors.delete_at i
+          end
+          prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors
           if prediction.nil?
             prediction = local_weighted_average(compound, params)
             prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
@@ -130,16 +146,39 @@ module OpenTox
       def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values
         R.assign "weights", training_weights
         r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
-        #p r_data_frame
-        File.open("tmp.R","w+"){|f| f.puts "data <- #{r_data_frame}\n"}
+rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R"))
+        File.open("tmp.R","w+"){|f|
+          f.puts "suppressPackageStartupMessages({
+  library(iterators,lib=\"#{rlib}\")
+  library(foreach,lib=\"#{rlib}\")
+  library(ggplot2,lib=\"#{rlib}\")
+  library(grid,lib=\"#{rlib}\")
+  library(gridExtra,lib=\"#{rlib}\")
+  library(pls,lib=\"#{rlib}\")
+  library(caret,lib=\"#{rlib}\")
+  library(doMC,lib=\"#{rlib}\")
+  registerDoMC(#{NR_CORES})
+})"
+
+          f.puts "data <- #{r_data_frame}\n"
+          f.puts "weights <- c(#{training_weights.join(', ')})"
+          f.puts "features <- c(#{training_features.join(', ')})"
+          f.puts "names(data) <- append(c('activities'),features)" #
+          f.puts "model <- train(activities ~ ., data = data, method = '#{method}')"
+          f.puts "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
+          f.puts "names(fingerprint) <- features" 
+          f.puts "prediction <- predict(model,fingerprint)"
+        }
+        
         R.eval "data <- #{r_data_frame}"
         R.assign "features", training_features
         R.eval "names(data) <- append(c('activities'),features)" #
-        begin
-          R.eval "model <- train(activities ~ ., data = data, method = '#{method}')"
-        rescue 
-          return nil
-        end
+        #begin
+          R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)"
+        #rescue 
+          #return nil
+        #end
+        p query_feature_values
         R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))"
         R.eval "names(fingerprint) <- features" 
         R.eval "prediction <- predict(model,fingerprint)"
diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb
index 46073a9..31bb903 100644
--- a/test/nanoparticles.rb
+++ b/test/nanoparticles.rb
@@ -11,9 +11,38 @@ class NanoparticleTest  < MiniTest::Test
     p dataset_ids.collect{|d| {d => Dataset.find(d).name}}
     dataset_ids.collect do |d|
       d = Dataset.find(d)
-      p d.name
-      puts d.to_csv
+      #p d.name
+      #puts d.to_csv
+    end
+  end
+
+  # Exploratory summary (no assertions): prints each toxicity feature
+  # that is recorded for more than 100 nanoparticles.
+  def test_summaries
+    # count nanoparticles per feature id (one count per nanoparticle,
+    # regardless of how many values it stores for that feature)
+    toxcounts = Hash.new(0)
+    pccounts = Hash.new(0)
+    Nanoparticle.all.each do |np|
+      np.toxicities.each_key do |id|
+        toxcounts[id] += 1
+      end
+      np.physchem_descriptors.each_key do |id|
+        pccounts[id] += 1
+      end
+    end
+    # physchem counts are collected for inspection only, e.g.
+    #pccounts.each{|e,n| p Feature.find(e),n if n > 100}
+    toxcounts.each do |id,n|
+      next unless n > 100
+      p Feature.find(id), n
+    end
+  end
+
+
+  def test_import_ld
+    skip # Import::Enanomapper.import_ld is currently commented out (marked defunct), so the call below must not run
+    dataset_ids = Import::Enanomapper.import_ld
+  end
 
   def test_export
@@ -24,11 +53,14 @@ class NanoparticleTest  < MiniTest::Test
 
   def test_create_model
     training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
-    model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors")
+    feature = Feature.find_or_create_by(name: "7.99 Toxicity (other) ICP-AES", category: "TOX", unit: "mL/ug(Mg)")
+    model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors"})
     nanoparticle = training_dataset.nanoparticles[-34]
+    #p nanoparticle.neighbors
     prediction = model.predict nanoparticle
     p prediction
-    refute_nil prediction[:value]
+    #p prediction
+    #refute_nil prediction[:value]
   end
 
 end
diff --git a/test/setup.rb b/test/setup.rb
index e7c32b4..6c97282 100644
--- a/test/setup.rb
+++ b/test/setup.rb
@@ -5,5 +5,5 @@ require_relative '../lib/lazar.rb'
 include OpenTox
 TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
 DATA_DIR ||= File.join(TEST_DIR,"data")
-$mongo.database.drop
-$gridfs = $mongo.database.fs
+#$mongo.database.drop
+#$gridfs = $mongo.database.fs
diff --git a/test/validation.rb b/test/validation.rb
index baee2d1..cbc7d09 100644
--- a/test/validation.rb
+++ b/test/validation.rb
@@ -9,6 +9,7 @@ class ValidationTest < MiniTest::Test
     model = Model::LazarClassification.create dataset.features.first, dataset
     cv = ClassificationCrossValidation.create model
     assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7, this may occur due to an unfavorable training/test set split"
+    assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy (#{cv.weighted_accuracy}) should be larger than accuracy (#{cv.accuracy})."
   end
 
   def test_default_regression_crossvalidation
@@ -85,6 +86,7 @@ class ValidationTest < MiniTest::Test
     assert_equal 14, loo.nr_unpredicted
     refute_empty loo.confusion_matrix
     assert loo.accuracy > 0.77
+    assert loo.weighted_accuracy > loo.accuracy, "Weighted accuracy (#{loo.weighted_accuracy}) should be larger than accuracy (#{loo.accuracy})."
   end
 
   def test_regression_loo_validation
-- 
cgit v1.2.3