From 753fcc204d93d86c76860bee6e2f7d0468c3c940 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 14 Apr 2016 19:43:24 +0200
Subject: features/toxicities fixed

---
 .gitignore             |  1 +
 lib/classification.rb  |  2 +-
 lib/compound.rb        |  6 ++----
 lib/dataset.rb         | 29 +++++++++++++++++++++--------
 lib/model.rb           | 35 ++++++++++++++++-------------------
 lib/nanoparticle.rb    | 30 +++++++++++++++++++-----------
 lib/opentox.rb         |  5 +++++
 lib/regression.rb      | 35 ++++++++++++++++++++---------------
 lib/substance.rb       |  1 +
 test/classification.rb | 14 +++++++-------
 test/nanoparticles.rb  | 23 ++++++++++++++++++-----
 test/setup.rb          |  4 ++--
 12 files changed, 113 insertions(+), 72 deletions(-)

diff --git a/.gitignore b/.gitignore
index fb51df7..6e0f374 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+R
 openbabel
 Gemfile.lock
 *.gem
diff --git a/lib/classification.rb b/lib/classification.rb
index 0202940..4a17546 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -10,7 +10,7 @@ module OpenTox
         confidence = 0.0
         neighbors.each do |row|
           sim = row["tanimoto"]
-          row["features"][params[:prediction_feature_id].to_s].each do |act|
+          row["toxicities"][params[:prediction_feature_id].to_s].each do |act|
             weighted_sum[act] ||= 0
             weighted_sum[act] += sim
           end
diff --git a/lib/compound.rb b/lib/compound.rb
index 7895619..55cd482 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -17,8 +17,6 @@ module OpenTox
     field :sdf_id, type: BSON::ObjectId
     field :fingerprints, type: Hash, default: {}
     field :default_fingerprint_size, type: Integer
-    # TODO separate between physchem, bio and tox
-    field :features, type: Hash, default: {}
 
     index({smiles: 1}, {unique: true})
 
@@ -291,7 +289,7 @@ module OpenTox
           candidate_fingerprint = compound.fingerprint params[:type]
           sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
           feature_values = training_dataset.values(compound,prediction_feature)
-          neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim]
+          neighbors << {"_id" => compound.id, "toxicities" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim]
         end
         neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]}
       end
@@ -332,7 +330,7 @@ module OpenTox
             'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]}
           }},
           '_id' => 1,
-          'features' => 1,
+          'toxicities' => 1,
           'dataset_ids' => 1
         }},
         {'$match' =>  {'tanimoto' => {'$gte' => params[:min_sim]}}},
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 25307c9..274c475 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -13,6 +13,10 @@ module OpenTox
       substances.select{|s| s.is_a? Compound}
     end
 
+    def nanoparticles
+      substances.select{|s| s.is_a? Nanoparticle}
+    end
+
     # Get all substances
     def substances
       @substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id}
@@ -21,7 +25,7 @@ module OpenTox
 
     # Get all features
     def features
-      @features ||= data_entries.collect{|cid,f| f.first}.flatten.uniq.collect{|id| OpenTox::Feature.find(id)}
+      @features ||= data_entries.collect{|cid,f| f.first}.flatten.uniq.compact.collect{|id| OpenTox::Feature.find(id)}.compact
       @features
     end
 
@@ -98,13 +102,22 @@ module OpenTox
     # @return [String]
     def to_csv(inchi=false)
       CSV.generate() do |csv| 
-        csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
+        compound = data_entries.empty? || Substance.find(data_entries.keys.first).is_a?(Compound) # empty dataset: keep SMILES/InChI header instead of failing on nil.first
+        if compound
+          csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
+        else
+          csv << ["Name"] + features.collect{|f| f.name}
+        end
         data_entries.each do |sid,f|
-          substance = Substance.find cid
+          substance = Substance.find sid
           features.each do |feature|
-            f[feature.id].each do |v|
-              csv << [inchi ? substance.inchi : substance.smiles , v]
-            end
+            f[feature.id.to_s].each do |v|
+              if compound
+                csv << [inchi ? substance.inchi : substance.smiles , v]
+              else
+                csv << [substance.name , v]
+              end
+            end if f[feature.id.to_s]
           end
         end
       end
@@ -221,8 +234,8 @@ module OpenTox
           self.data_entries[compound.id.to_s] ||= {}
           self.data_entries[compound.id.to_s][@features[j].id.to_s] ||= []
           self.data_entries[compound.id.to_s][@features[j].id.to_s] << v
-          compound.features[@features[j].id.to_s] ||= []
-          compound.features[@features[j].id.to_s] << v
+          compound.toxicities[@features[j].id.to_s] ||= []
+          compound.toxicities[@features[j].id.to_s] << v
           compound.save
         end
       end
diff --git a/lib/model.rb b/lib/model.rb
index 5140d5a..1960c10 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -36,6 +36,7 @@ module OpenTox
         super params
 
         # TODO document convention
+        #p training_dataset.features
         prediction_feature = training_dataset.features.first
         # set defaults for empty parameters
         self.prediction_feature_id ||= prediction_feature.id
@@ -56,12 +57,13 @@ module OpenTox
         prediction = {}
         if neighbors.collect{|n| n["_id"]}.include? compound.id
 
-          database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq
+          #TODO restrict to dataset features
+          database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s].uniq
           prediction[:database_activities] = database_activities
           prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound."
           neighbors.delete_if{|n| n["_id"] == compound.id}
         end
-        neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] }
+        neighbors.delete_if{|n| n['toxicities'].empty? or n['toxicities'][prediction_feature.id.to_s] == [nil] }
         if neighbors.empty?
           prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []})
         else
@@ -78,12 +80,11 @@ module OpenTox
 
         # parse data
         compounds = []
-        case object.class.to_s
-        when "OpenTox::Compound"
+        if object.is_a? Substance
           compounds = [object] 
-        when "Array"
+        elsif object.is_a? Array
           compounds = object
-        when "OpenTox::Dataset"
+        elsif object.is_a? Dataset
           compounds = object.compounds
         else 
           bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter."
@@ -97,30 +98,26 @@ module OpenTox
         end
 
         # serialize result
-        case object.class.to_s
-        when "OpenTox::Compound"
+        if object.is_a? Substance
           prediction = predictions[compounds.first.id.to_s]
           prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity
+          return prediction
+        elsif object.is_a? Array
           return predictions
-        when "Array"
-          return predictions
-        when "OpenTox::Dataset"
+        elsif object.is_a? Dataset
           predictions.each{|cid,p| p.delete(:neighbors)}
           # prepare prediction dataset
           measurement_feature = Feature.find prediction_feature_id
 
           prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
-          prediction_dataset = LazarPrediction.new(
+          prediction_dataset = LazarPrediction.create(
             :name => "Lazar prediction for #{prediction_feature.name}",
             :creator =>  __FILE__,
-            :prediction_feature_id => prediction_feature.id
-
+            :prediction_feature_id => prediction_feature.id,
+            :predictions => predictions
           )
 
-          compounds.each_with_index do |c,i|
-            prediction_dataset.predictions[c.id.to_s] = predictions[i]
-          end
-          prediction_dataset.save
+          # no explicit save here: LazarPrediction.create above is expected to persist the document (NOTE(review): confirm)
           return prediction_dataset
         end
 
@@ -264,7 +261,7 @@ module OpenTox
         training_features = training.collect{|t| t.physchem_descriptors.keys}.flatten.uniq
         query_features = nanoparticle.physchem_descriptors.keys
         common_features = (training_features & query_features)
-        p common_features
+        #p common_features
       end
 
     end
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
index 6e9b0ea..0350363 100644
--- a/lib/nanoparticle.rb
+++ b/lib/nanoparticle.rb
@@ -5,12 +5,10 @@ module OpenTox
 
     field :core, type: String
     field :coating, type: Array, default: []
-
-    field :toxicities, type: Hash, default: {}
-    #field :features, type: Hash, default: {}
     field :bundles, type: Array, default: []
 
-    def predict
+    def nanoparticle_neighbors params
+      Dataset.find(params[:training_dataset_id]).nanoparticles
     end
 
     def add_feature feature, value
@@ -21,22 +19,32 @@ module OpenTox
         toxicities[feature.id.to_s] ||= []
         toxicities[feature.id.to_s] << value
       else
-        $logger.warn "Unknown feature type '#{feature.source}'. Value '#{value}' not inserted."
-        warnings << "Unknown feature type '#{feature.source}'. Value '#{value}' not inserted."
+        warn "Unknown feature type '#{feature.source}'. Value '#{value}' not inserted."
       end
     end
 
     def parse_ambit_value feature, v
+      # TODO: units, mmol/log10 conversion
       if v.keys == ["loValue"]
-        add_feature feature, v["loValue"]
+        #if v["loValue"].numeric?
+          add_feature feature, v["loValue"]
+        #else
+          #warn "'#{v["loValue"]}' is not a numeric value, entry ignored."
+        #end
       elsif v.keys.size == 2 and v["loQualifier"] == "mean"
-        add_feature feature, {:mean => v["loValue"]}
+        #add_feature feature, {:mean => v["loValue"]}
+        add_feature feature, v["loValue"]
+        warn "'#{feature.name}' is a mean value. Original data is not available."
       elsif v.keys.size == 2 and v["loQualifier"] #== ">="
-        add_feature feature, {:min => v["loValue"],:max => Float::INFINITY}
+        #add_feature feature, {:min => v["loValue"],:max => Float::INFINITY}
+        warn "Only min value available for '#{feature.name}', entry ignored"
       elsif v.keys.size == 2 and v["upQualifier"] #== ">="
-        add_feature feature, {:max => v["upValue"],:min => -Float::INFINITY}
+        #add_feature feature, {:max => v["upValue"],:min => -Float::INFINITY}
+        warn "Only max value available for '#{feature.name}', entry ignored"
       elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] 
-        add_feature feature, {:min => v["loValue"],:max => v["upValue"]}
+        #add_feature feature, {:min => v["loValue"],:max => v["upValue"]}
+        add_feature feature, [v["loValue"],v["upValue"]].mean
+        warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available."
       elsif v == {} # do nothing
       else
         $logger.warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'."
diff --git a/lib/opentox.rb b/lib/opentox.rb
index cc18cc6..7d8a8a2 100644
--- a/lib/opentox.rb
+++ b/lib/opentox.rb
@@ -15,6 +15,11 @@ module OpenTox
       field :name,  type: String
       field :source,  type: String
       field :warnings, type: Array, default: []
+
+      def warn warning
+        $logger.warn warning
+        warnings << warning
+      end
     end
     OpenTox.const_set klass,c
   end
diff --git a/lib/regression.rb b/lib/regression.rb
index 5021fb3..cb17f25 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -9,8 +9,8 @@ module OpenTox
         neighbors = params[:neighbors]
         neighbors.each do |row|
           sim = row["tanimoto"]
-          if row["features"][params[:prediction_feature_id].to_s]
-            row["features"][params[:prediction_feature_id].to_s].each do |act|
+          if row["toxicities"][params[:prediction_feature_id].to_s]
+            row["toxicities"][params[:prediction_feature_id].to_s].each do |act|
               weighted_sum += sim*Math.log10(act)
               sim_sum += sim
             end
@@ -32,8 +32,8 @@ module OpenTox
         neighbors.each_with_index do |row,i|
           neighbor = Compound.find row["_id"]
           fingerprint = neighbor.fingerprint
-          if row["features"][params[:prediction_feature_id].to_s]
-            row["features"][params[:prediction_feature_id].to_s].each do |act|
+          if row["toxicities"][params[:prediction_feature_id].to_s]
+            row["toxicities"][params[:prediction_feature_id].to_s].each do |act|
               activities << Math.log10(act)
               weights << row["tanimoto"]
               fingerprint_ids.each_with_index do |id,j|
@@ -79,21 +79,24 @@ module OpenTox
 
         neighbors = params[:neighbors]
         return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
-        return {:value => neighbors.first["features"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
+        return {:value => neighbors.first["toxicities"][params[:prediction_feature_id].to_s], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
 
         activities = []
         weights = []
         physchem = {}
         
-        neighbors.each_with_index do |row,i|
-          neighbor = Compound.find row["_id"]
-          if row["features"][params[:prediction_feature_id].to_s]
-            row["features"][params[:prediction_feature_id].to_s].each do |act|
-              activities << Math.log10(act)
-              weights << row["tanimoto"] # TODO cosine ?
-              neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity
+        neighbors.each_with_index do |n,i|
+          if n["toxicities"][params[:prediction_feature_id].to_s]
+            neighbor = Substance.find(n["_id"]) # one database lookup per neighbor instead of one per activity
+            n["toxicities"][params[:prediction_feature_id].to_s].each do |act|
+              # TODO fix!!!! NOTE(review): log10 is negated here but not in the other regression methods of this file - confirm
+              activities << -Math.log10(act)
+              #if act.numeric?
+              #activities << act
+              weights << (n["tanimoto"] || 1.0) # neighbors without a similarity get weight 1.0; TODO cosine ?
+              neighbor.physchem_descriptors.each do |pid,v| # insert physchem only if there is an activity
                 physchem[pid] ||= []
-                physchem[pid] <<  v
+                physchem[pid] +=  v
               end
             end
           end
@@ -110,8 +113,8 @@ module OpenTox
           return result
 
         else
-          data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] }
-          prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]}
+          data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid].collect{|v| v.is_a?(String) ? "\"#{v.sub('[','').sub(']','')}\"" : v }} # quote strings for R, pass other values through instead of mapping them to nil
+          prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem_descriptors[pid]}
           if prediction.nil?
             prediction = local_weighted_average(compound, params)
             prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
@@ -127,6 +130,8 @@ module OpenTox
       def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values
         R.assign "weights", training_weights
         r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
+        #p r_data_frame
+        $logger.debug "data <- #{r_data_frame}" # log the generated R code instead of writing tmp.R into the current working directory
         R.eval "data <- #{r_data_frame}"
         R.assign "features", training_features
         R.eval "names(data) <- append(c('activities'),features)" #
diff --git a/lib/substance.rb b/lib/substance.rb
index 6768ce7..82ca65d 100644
--- a/lib/substance.rb
+++ b/lib/substance.rb
@@ -2,6 +2,7 @@ module OpenTox
 
   class Substance
     field :physchem_descriptors, type: Hash, default: {}
+    field :toxicities, type: Hash, default: {}
     field :dataset_ids, type: Array, default: []
   end
 
diff --git a/test/classification.rb b/test/classification.rb
index af23db6..7412714 100644
--- a/test/classification.rb
+++ b/test/classification.rb
@@ -30,14 +30,14 @@ class LazarClassificationTest < MiniTest::Test
 
     # make a dataset prediction
     compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
-    prediction = model.predict compound_dataset
-    assert_equal compound_dataset.compounds, prediction.compounds
+    prediction_dataset = model.predict compound_dataset
+    assert_equal compound_dataset.compounds, prediction_dataset.compounds
 
-    cid = prediction.compounds[7].id.to_s
-    assert_equal "Could not find similar compounds with experimental data in the training dataset.", prediction.predictions[cid][:warning]
-    cid = prediction.compounds[9].id.to_s
-    assert_equal "1 compounds have been removed from neighbors, because they have the same structure as the query compound.", prediction.predictions[cid][:warning]
+    cid = prediction_dataset.compounds[7].id.to_s
+    assert_equal "Could not find similar compounds with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warning]
+    cid = prediction_dataset.compounds[9].id.to_s
+    assert_equal "1 compounds have been removed from neighbors, because they have the same structure as the query compound.", prediction_dataset.predictions[cid][:warning]
     # cleanup
-    [training_dataset,model,compound_dataset].each{|o| o.delete}
+    [training_dataset,model,compound_dataset,prediction_dataset].each{|o| o.delete}
   end
 end
diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb
index 6f241ec..46073a9 100644
--- a/test/nanoparticles.rb
+++ b/test/nanoparticles.rb
@@ -6,16 +6,29 @@ class NanoparticleTest  < MiniTest::Test
     dataset_ids = Import::Enanomapper.import
     assert_operator Nanoparticle.count , :>, 570, "Only #{Nanoparticle.count} nanoparticles imported"
     assert_operator dataset_ids.size, :>, 8, "Only #{dataset_ids.size} bundles imported"
-    p dataset_ids.collect{|d| Dataset.find(d).name}
     assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("NanoWiki")
     assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
+    p dataset_ids.collect{|d| {d => Dataset.find(d).name}}
+    dataset_ids.each do |id| # each, not collect: the block is run only for its output
+      dataset = Dataset.find(id)
+      p dataset.name
+      puts dataset.to_csv
+    end
   end
 
-  def test_create_model
-    Model::NanoLazar.create_all.each do |model|
-      np = Nanoparticle.find(model.training_particle_ids.sample)
-      model.predict np
+  def test_export
+    Dataset.all.each do |d|
+      puts d.to_csv
     end
   end
 
+  def test_create_model
+    training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
+    model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors")
+    nanoparticle = training_dataset.nanoparticles[-34]
+    prediction = model.predict nanoparticle
+    p prediction
+    refute_nil prediction[:value]
+  end
+
 end
diff --git a/test/setup.rb b/test/setup.rb
index e7c32b4..6c97282 100644
--- a/test/setup.rb
+++ b/test/setup.rb
@@ -5,5 +5,5 @@ require_relative '../lib/lazar.rb'
 include OpenTox
 TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
 DATA_DIR ||= File.join(TEST_DIR,"data")
-$mongo.database.drop
-$gridfs = $mongo.database.fs
+#$mongo.database.drop
+#$gridfs = $mongo.database.fs
-- 
cgit v1.2.3