From ab7b37541b4f8a762be737009631d3eefd898b4a Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 5 May 2016 16:14:02 +0200
Subject: ambit mirror, import from mirrored json, proteomics import

---
 lib/compound.rb       |   6 +--
 lib/import.rb         | 101 +++++++++++++++++++++++++++-----------------------
 lib/model.rb          |   4 +-
 lib/nanoparticle.rb   |  21 ++++-------
 lib/regression.rb     |   6 +--
 lib/substance.rb      |   2 +-
 test/nanoparticles.rb |  29 +++++++++------
 7 files changed, 89 insertions(+), 80 deletions(-)

diff --git a/lib/compound.rb b/lib/compound.rb
index c2ce5d0..143c4f2 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -77,7 +77,7 @@ module OpenTox
 
     def physchem descriptors=PhysChem.openbabel_descriptors
       # TODO: speedup java descriptors
-      calculated_ids = physchem_descriptors.keys
+      calculated_ids = self["physchem"].keys # read the stored field directly; a bare `physchem` call here would re-enter this method and never terminate
       # BSON::ObjectId instances are not allowed as keys in a BSON document.
       new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids
       descs = {}
@@ -90,11 +90,11 @@ module OpenTox
       # avoid recalculating Cdk features with multiple values
       descs.keys.uniq.each do |k|
         descs[k].send(k[0].downcase,k[1],self).each do |n,v|
-          physchem_descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document.
+          self["physchem"] = self["physchem"].merge(algos[n].id.to_s => v) # BSON::ObjectId instances are not allowed as keys in a BSON document; reassign the field instead of calling this method again
         end
       end
       save
-      physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id}
+      self["physchem"].select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} # return only the requested descriptors from the stored field
     end
 
     def smarts_match smarts, count=false
diff --git a/lib/import.rb b/lib/import.rb
index 3c1edfe..11cb367 100644
--- a/lib/import.rb
+++ b/lib/import.rb
@@ -5,47 +5,73 @@ module OpenTox
     class Enanomapper
       include OpenTox
 
-      def self.import
+      def self.mirror dir="."
         #get list of bundle URIs
         bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
+        File.open(File.join(dir,"bundles.json"),"w+"){|f| f.puts JSON.pretty_generate(bundles)}
         datasets = []
         bundles.each do |bundle|
-          uri = bundle["URI"]
-          dataset = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"])
           nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"]
-          features = JSON.parse(RestClientWrapper.get(bundle["property"]+"?media=application%2Fjson"))["feature"]
-          nanoparticles.each do |np|
-            nanoparticle = Nanoparticle.find_or_create_by(
-              :name => np["values"]["https://data.enanomapper.net/identifier/name"],
-              :source => np["compound"]["URI"],
-            )
-            dataset.substance_ids << nanoparticle.id
-            dataset.substance_ids.uniq!
-            studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study")))["study"]
+          nanoparticles.each do |nanoparticle|
+            uuid = nanoparticle["values"]["https://data.enanomapper.net/identifier/uuid"]
+            $logger.debug uuid
+            File.open(File.join(dir,"nanoparticle-#{uuid}.json"),"w+"){|f| f.puts JSON.pretty_generate(nanoparticle)}
+            studies = JSON.parse(RestClientWrapper.get(File.join(nanoparticle["compound"]["URI"],"study")))["study"]
             studies.each do |study|
-              study["effects"].each do |effect|
-                effect["result"]["textValue"] ?  klass = NominalFeature : klass = NumericFeature
-                # TODO parse core/coating
-                # TODO parse proteomics, they come as a large textValue
-                $logger.debug File.join(np["compound"]["URI"],"study")
-                effect["conditions"].delete_if { |k, v| v.nil? }
+              File.open(File.join(dir,"study-#{study["uuid"]}.json"),"w+"){|f| f.puts JSON.pretty_generate(study)} # one file per study: naming by the nanoparticle uuid made each study overwrite the previous one (assumes AMBIT study JSON has a top-level "uuid" - confirm)
+            end
+          end
+        end
+      end
+
+      def self.import dir="."
+        datasets = {}
+        JSON.parse(File.read(File.join(dir,"bundles.json"))).each do |bundle|
+          datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"])
+        end
+        Dir[File.join(dir,"study*.json")].each do |s|
+          study = JSON.parse(File.read(s))
+          np = JSON.parse(File.read(File.join(dir,"nanoparticle-#{study['owner']['substance']['uuid']}.json")))
+          nanoparticle = Nanoparticle.find_or_create_by(
+            :name => np["values"]["https://data.enanomapper.net/identifier/name"],
+            :source => np["compound"]["URI"],
+          )
+          np["bundles"].keys.each do |bundle_uri|
+            datasets[bundle_uri].substance_ids << nanoparticle.id
+            nanoparticle.dataset_ids |= [datasets[bundle_uri].id] # set union via the field setter: this loop runs once per study file, so a plain << stored the same dataset id repeatedly
+          end
+          study["effects"].each do |effect|
+            effect["result"]["textValue"] ?  klass = NominalFeature : klass = NumericFeature
+            # TODO parse core/coating
+            # TODO parse proteomics, they come as a large textValue
+            #$logger.debug File.join(np["compound"]["URI"],"study")
+            effect["conditions"].delete_if { |k, v| v.nil? }
+            # parse proteomics data
+            if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50
+              JSON.parse(effect["result"]["textValue"]).each do |identifier, value|
                 feature = klass.find_or_create_by(
-                  #:source => File.join(np["compound"]["URI"],"study"),
-                  :name => "#{study["protocol"]["category"]["title"]} #{study["protocol"]["endpoint"]}",
-                  :unit => effect["result"]["unit"],
-                  :category => study["protocol"]["topcategory"],
-                  :conditions => effect["conditions"]
+                  :name => identifier,
+                  :category => "Proteomics",
                 )
-                nanoparticle.parse_ambit_value feature, effect["result"]
-                dataset.feature_ids << feature.id 
-                dataset.feature_ids.uniq!
+                nanoparticle.parse_ambit_value feature, value
               end
+            else
+              feature = klass.find_or_create_by(
+                :name => "#{study["protocol"]["category"]["title"]} #{study["protocol"]["endpoint"]}",
+                :unit => effect["result"]["unit"],
+                :category => study["protocol"]["topcategory"],
+                :conditions => effect["conditions"]
+              )
+              nanoparticle.parse_ambit_value feature, effect["result"]
             end
           end
-          dataset.save
-          datasets << dataset
+          nanoparticle.save
+        end
+        datasets.each do |u,d|
+          d.feature_ids.uniq!
+          d.substance_ids.uniq!
+          d.save
         end
-        datasets.collect{|d| d.id}
       end
 
 =begin
@@ -64,23 +90,6 @@ module OpenTox
       end
 =end
 
-      def self.dump
-        #get list of bundle URIs
-        `wget 'https://data.enanomapper.net/bundle?media=application%2Fjson' -O bundles.json`
-        json = JSON.parse File.read('./bundles.json')
-        json["dataset"].each do |dataset|
-          uri = dataset["URI"]
-          id = uri.split("/").last
-          `wget --header='accept:application/json' '#{uri}' -O 'bundle#{id}'`
-          `wget --header='accept:application/json' '#{dataset["summary"]}' -O 'summary#{id}.json'`
-          `wget --header='accept:application/json' '#{dataset["compound"]}' -O 'compound#{id}.json'`
-          `wget --header='accept:application/json' '#{dataset["substance"]}' -O 'substance#{id}.json'`
-          `wget --header='accept:application/json' '#{dataset["property"]}' -O 'property#{id}.json'`
-          `wget --header='accept:application/json' '#{dataset["dataset"]}' -O 'dataset#{id}.json'`
-          `wget --header='accept:application/json' '#{dataset["matrix"]}' -O 'matrix#{id}.json'`
-        end
-      end
-
     end
 
   end
diff --git a/lib/model.rb b/lib/model.rb
index 841ab20..12abc6e 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -47,9 +47,9 @@ module OpenTox
           end
         end
         R.assign "tox", toxicities
-        feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq
+        feature_ids = training_dataset.substances.collect{ |s| s["physchem"].keys}.flatten.uniq
         feature_ids.each do |feature_id|
-          feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id]}
+          feature_values = substances.collect{|s| s["physchem"][feature_id]}
           R.assign "feature", feature_values
           begin
             #R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')"
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
index dda4a9f..c9fbb77 100644
--- a/lib/nanoparticle.rb
+++ b/lib/nanoparticle.rb
@@ -6,6 +6,7 @@ module OpenTox
     field :core, type: String
     field :coating, type: Array, default: []
     field :bundles, type: Array, default: []
+    field :proteomics, type: Hash, default: {}
 
     def nanoparticle_neighbors params
       Dataset.find(params[:training_dataset_id]).nanoparticles.collect{|np| np["tanimoto"] = 1; np}
@@ -14,21 +15,18 @@ module OpenTox
     def add_feature feature, value
       case feature.category
       when "P-CHEM"
-        physchem_descriptors[feature.id.to_s] ||= []
-        physchem_descriptors[feature.id.to_s] << value
-        physchem_descriptors[feature.id.to_s].uniq!
+        physchem[feature.id.to_s] ||= []
+        physchem[feature.id.to_s] << value
+        physchem[feature.id.to_s].uniq!
+      when "Proteomics"
+        proteomics[feature.id.to_s] ||= []
+        proteomics[feature.id.to_s] << value
+        proteomics[feature.id.to_s].uniq!
       when "TOX"
         toxicities[feature.id.to_s] ||= []
         # TODO generic way of parsing TOX values
         if feature.name == "7.99 Toxicity (other) ICP-AES" and feature.unit == "mL/ug(Mg)" 
           toxicities[feature.id.to_s] << -Math.log10(value)
-        #if value.numeric?
-          #begin
-          #rescue
-            #p feature
-            #p value
-            #exit
-          #end
         else
           toxicities[feature.id.to_s] << value
         end
@@ -36,7 +34,6 @@ module OpenTox
       else
         warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted."
       end
-      save
     end
 
     def parse_ambit_value feature, v
@@ -79,5 +76,3 @@ module OpenTox
 
   end
 end
-
-
diff --git a/lib/regression.rb b/lib/regression.rb
index d2c4e91..fe45f99 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -84,7 +84,7 @@ module OpenTox
 
         activities = []
         weights = []
-        pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq
+        pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem.keys}.flatten.uniq
         data_frame = []
         data_frame[0] = []
         
@@ -93,7 +93,7 @@ module OpenTox
           n["toxicities"][params[:prediction_feature_id].to_s].each do |act|
             data_frame[0][i] = act
             n["tanimoto"] ?  weights << n["tanimoto"] : weights << 1.0 # TODO cosine ?
-            neighbor.physchem_descriptors.each do |pid,values| 
+            neighbor.physchem.each do |pid,values| 
               values.uniq!
               warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1
               j = pc_ids.index(pid)+1
@@ -121,7 +121,7 @@ module OpenTox
           return result
         else
           query_descriptors = pc_ids.collect do |i|
-            compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA"
+            compound.physchem[i] ? compound.physchem[i].for_R : "NA"
           end
           remove_idx = []
           query_descriptors.each_with_index do |v,i|
diff --git a/lib/substance.rb b/lib/substance.rb
index 82ca65d..34bc94a 100644
--- a/lib/substance.rb
+++ b/lib/substance.rb
@@ -1,7 +1,7 @@
 module OpenTox
 
   class Substance
-    field :physchem_descriptors, type: Hash, default: {}
+    field :physchem, type: Hash, default: {}
     field :toxicities, type: Hash, default: {}
     field :dataset_ids, type: Array, default: []
   end
diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb
index 7308a83..69cfd30 100644
--- a/test/nanoparticles.rb
+++ b/test/nanoparticles.rb
@@ -4,22 +4,27 @@ require_relative "setup.rb"
 class NanoparticleTest  < MiniTest::Test
 
   def setup
-    `mongorestore --db=development #{File.join(File.dirname(__FILE__),"..","dump","production")}`
+    #`mongorestore --db=development #{File.join(File.dirname(__FILE__),"..","dump","production")}`
+  end
+
+  def test_mirror
+    Import::Enanomapper.mirror File.join(File.dirname(__FILE__),"..","data")
   end
 
   def test_import
-    skip
-    dataset_ids = Import::Enanomapper.import
-    assert_operator Nanoparticle.count , :>, 570, "Only #{Nanoparticle.count} nanoparticles imported"
-    assert_operator dataset_ids.size, :>, 8, "Only #{dataset_ids.size} bundles imported"
-    assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("NanoWiki")
-    assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
-    p dataset_ids.collect{|d| {d => Dataset.find(d).name}}
-    dataset_ids.collect do |d|
-      d = Dataset.find(d)
+    Import::Enanomapper.import File.join(File.dirname(__FILE__),"..","data")
+#    skip
+#    dataset_ids = Import::Enanomapper.import
+#    assert_operator Nanoparticle.count , :>, 570, "Only #{Nanoparticle.count} nanoparticles imported"
+#    assert_operator dataset_ids.size, :>, 8, "Only #{dataset_ids.size} bundles imported"
+#    assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("NanoWiki")
+#    assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
+#    p dataset_ids.collect{|d| {d => Dataset.find(d).name}}
+#    dataset_ids.collect do |d|
+#      d = Dataset.find(d)
       #p d.name
       #puts d.to_csv
-    end
+#    end
   end
 
   def test_summaries
@@ -35,7 +40,7 @@ class NanoparticleTest  < MiniTest::Test
         toxcounts[t] ||= 0
         toxcounts[t] += 1#v.uniq.size
       end
-      np.physchem_descriptors.each do |t,v|
+      np.physchem.each do |t,v|
         pccounts[t] ||= 0
         pccounts[t] += 1#v.uniq.size
       end
-- 
cgit v1.2.3