From 51f57e2858b60bed74ebcc97189b2188c900c283 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Fri, 6 May 2016 12:49:28 +0200
Subject: dataset tests cleanup

---
 lib/compound.rb     |  7 ++++---
 lib/dataset.rb      | 39 +++++++++++++++++++++++----------------
 lib/lazar.rb        |  1 -
 lib/model.rb        |  4 ++--
 lib/nanoparticle.rb |  6 +++---
 lib/regression.rb   |  6 +++---
 lib/substance.rb    |  2 +-
 7 files changed, 36 insertions(+), 29 deletions(-)

(limited to 'lib')

diff --git a/lib/compound.rb b/lib/compound.rb
index 143c4f2..6cb7f78 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -22,6 +22,7 @@ module OpenTox
 
     # Overwrites standard Mongoid method to create fingerprints before database insertion
     def self.find_or_create_by params
+      #PhysChem.descriptors # load descriptor features
       compound = self.find_or_initialize_by params
       compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT).size
       compound.save
@@ -77,7 +78,7 @@ module OpenTox
 
     def physchem descriptors=PhysChem.openbabel_descriptors
       # TODO: speedup java descriptors
-      calculated_ids = physchem.keys
+      calculated_ids = physchem_descriptors.keys
       # BSON::ObjectId instances are not allowed as keys in a BSON document.
       new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids
       descs = {}
@@ -90,11 +91,11 @@ module OpenTox
       # avoid recalculating Cdk features with multiple values
       descs.keys.uniq.each do |k|
         descs[k].send(k[0].downcase,k[1],self).each do |n,v|
-          physchem[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document.
+          physchem_descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document.
         end
       end
       save
-      physchem.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id}
+      physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id}
     end
 
     def smarts_match smarts, count=false
diff --git a/lib/dataset.rb b/lib/dataset.rb
index b51d74b..9b24440 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -30,19 +30,11 @@ module OpenTox
       @features
     end
 
-    # Find data entry values for a given compound and feature
-    # @param compound [OpenTox::Compound] OpenTox Compound object
-    # @param feature [OpenTox::Feature] OpenTox Feature object
-    # @return [Array] Data entry values
-    #def values(compound, feature)
-      #data_entries[compound.id.to_s][feature.id.to_s]
-    #end
-
     # Writers
 
     # Set compounds
     def compounds=(compounds)
-      self.substance_ids = compounds.collect{|c| c.id}
+      self.substance_ids = compounds.collect{|c| c.id}.uniq
     end
 
     # Set features
@@ -95,14 +87,27 @@ module OpenTox
           csv << ["Name"] + features.collect{|f| f.name}
         end
         substances.each do |substance|
-          features.each do |f|
-            substance.toxicities[f.id.to_s].each do |v|
-              if compound
-                csv << [inchi ? substance.inchi : substance.smiles , v]
-              else
-                csv << [substance.name , v]
+          if compound
+            name = (inchi ? substance.inchi : substance.smiles)
+          else
+            name = substance.name
+          end
+          nr_measurements = features.collect{|f| substance.toxicities[f.id.to_s].size if substance.toxicities[f.id.to_s]}.compact.uniq
+
+          if nr_measurements.size > 1
+            warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries."
+          else
+            (0..nr_measurements.first-1).each do |i|
+              row = [name]
+              features.each do |f|
+                if substance.toxicities[f.id.to_s]
+                  row << substance.toxicities[f.id.to_s][i]
+                else
+                  row << ""
+                end
               end
-            end if substance.toxicities[f.id.to_s]
+              csv << row
+            end
           end
         end
       end
@@ -224,6 +229,8 @@ module OpenTox
         compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi}
         warn "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." 
       end
+      substance_ids.uniq!
+      feature_ids.uniq!
       
       $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
       time = Time.now
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 8eb46e0..8daaaa1 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -86,4 +86,3 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","Cross
   "experiment.rb",
   "import.rb",
 ].each{ |f| require_relative f }
-OpenTox::PhysChem.descriptors # load descriptor features
diff --git a/lib/model.rb b/lib/model.rb
index 12abc6e..841ab20 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -47,9 +47,9 @@ module OpenTox
           end
         end
         R.assign "tox", toxicities
-        feature_ids = training_dataset.substances.collect{ |s| s["physchem"].keys}.flatten.uniq
+        feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq
         feature_ids.each do |feature_id|
-          feature_values = substances.collect{|s| s["physchem"][feature_id]}
+          feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id]}
           R.assign "feature", feature_values
           begin
             #R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')"
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
index c9fbb77..9bf419d 100644
--- a/lib/nanoparticle.rb
+++ b/lib/nanoparticle.rb
@@ -15,9 +15,9 @@ module OpenTox
     def add_feature feature, value
       case feature.category
       when "P-CHEM"
-        physchem[feature.id.to_s] ||= []
-        physchem[feature.id.to_s] << value
-        physchem[feature.id.to_s].uniq!
+        physchem_descriptors[feature.id.to_s] ||= []
+        physchem_descriptors[feature.id.to_s] << value
+        physchem_descriptors[feature.id.to_s].uniq!
       when "Proteomics"
         proteomics[feature.id.to_s] ||= []
         proteomics[feature.id.to_s] << value
diff --git a/lib/regression.rb b/lib/regression.rb
index fe45f99..d2c4e91 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -84,7 +84,7 @@ module OpenTox
 
         activities = []
         weights = []
-        pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem.keys}.flatten.uniq
+        pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq
         data_frame = []
         data_frame[0] = []
         
@@ -93,7 +93,7 @@ module OpenTox
           n["toxicities"][params[:prediction_feature_id].to_s].each do |act|
             data_frame[0][i] = act
             n["tanimoto"] ?  weights << n["tanimoto"] : weights << 1.0 # TODO cosine ?
-            neighbor.physchem.each do |pid,values| 
+            neighbor.physchem_descriptors.each do |pid,values| 
               values.uniq!
               warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1
               j = pc_ids.index(pid)+1
@@ -121,7 +121,7 @@ module OpenTox
           return result
         else
           query_descriptors = pc_ids.collect do |i|
-            compound.physchem[i] ? compound.physchem_descriptors[i].for_R : "NA"
+            compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA"
           end
           remove_idx = []
           query_descriptors.each_with_index do |v,i|
diff --git a/lib/substance.rb b/lib/substance.rb
index 34bc94a..82ca65d 100644
--- a/lib/substance.rb
+++ b/lib/substance.rb
@@ -1,7 +1,7 @@
 module OpenTox
 
   class Substance
-    field :physchem, type: Hash, default: {}
+    field :physchem_descriptors, type: Hash, default: {}
     field :toxicities, type: Hash, default: {}
     field :dataset_ids, type: Array, default: []
   end
-- 
cgit v1.2.3