improved eNanoMapper (enm) import

author: Christoph Helma <helma@in-silico.ch> 2016-04-13 18:18:36 +0200
committer: Christoph Helma <helma@in-silico.ch> 2016-04-13 18:18:36 +0200
commit: 64f1f32ced77afb278bdb7c27397c5299a73675c (patch)
tree: b44cdc6c9533be8e33815fb16e83a341c35ea3d1
parent: 815cf6ba1543fc323eb7cbd1202fadbf03bcfbca (diff)
7 files changed, 67 insertions, 58 deletions
diff --git a/.gitignore b/.gitignore
index 791dc27..fb51df7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,4 @@
-last-utils
-libfminer
 openbabel
-fminer_debug.txt
-test/fminer_debug.txt
 Gemfile.lock
 *.gem
 .bundle
@@ -11,3 +7,4 @@ pkg/*
 .yardoc/
 doc/
 lazar.log
+data
diff --git a/lib/compound.rb b/lib/compound.rb
index 757ba1a..7895619 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -17,8 +17,6 @@ module OpenTox
     field :sdf_id, type: BSON::ObjectId
     field :fingerprints, type: Hash, default: {}
     field :default_fingerprint_size, type: Integer
-    field :physchem_descriptors, type: Hash, default: {}
-    field :dataset_ids, type: Array, default: []
     # TODO separate between physchem, bio and tox
     field :features, type: Hash, default: {}
 
diff --git a/lib/import.rb b/lib/import.rb
index 86c633a..cf0855e 100644
--- a/lib/import.rb
+++ b/lib/import.rb
@@ -8,64 +8,75 @@ module OpenTox
       def self.import
         #get list of bundle URIs
         bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
+        datasets = []
         bundles.each do |bundle|
           uri = bundle["URI"]
+          dataset = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"])
           nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"]
           features = JSON.parse(RestClientWrapper.get(bundle["property"]+"?media=application%2Fjson"))["feature"]
           nanoparticles.each do |np|
-              nanoparticle = Nanoparticle.find_or_create_by(
-                :name => np["values"]["https://data.enanomapper.net/identifier/name"],
-                :source => np["compound"]["URI"],
-              )
-              nanoparticle.bundles << uri
-              np["composition"].each do |comp|
-                case comp["relation"]
-                when "HAS_CORE"
-                  nanoparticle.core = comp["component"]["compound"]["URI"]
-                when "HAS_COATING"
-                  nanoparticle.coating << comp["component"]["compound"]["URI"]
-                end
-              end if np["composition"]
-              np["values"].each do |u,v|
-                if u.match(/property/)
-                  name, unit, source = nil
-                  features.each do |uri,feat|
-                    if u.match(/#{uri}/)
-                      name = feat["title"]
-                      unit = feat["units"]
-                      source = uri
-                    end
+            nanoparticle = Nanoparticle.find_or_create_by(
+              :name => np["values"]["https://data.enanomapper.net/identifier/name"],
+              :source => np["compound"]["URI"],
+            )
+            dataset.data_entries[nanoparticle.id.to_s] ||= {}
+            nanoparticle.bundles << uri
+            nanoparticle.dataset_ids << dataset.id
+            np["composition"].each do |comp|
+              case comp["relation"]
+              when "HAS_CORE"
+                nanoparticle.core = comp["component"]["compound"]["URI"]
+              when "HAS_COATING"
+                nanoparticle.coating << comp["component"]["compound"]["URI"]
+              end
+            end if np["composition"]
+            np["values"].each do |u,v|
+              if u.match(/property/)
+                name, unit, source = nil
+                features.each do |uri,feat|
+                  if u.match(/#{uri}/)
+                    name = feat["title"]
+                    unit = feat["units"]
+                    source = uri
                   end
-                  feature = Feature.find_or_create_by(
-                    :name => name,
-                    :unit => unit,
-                    :source => source
-                  )
                 end
-                v.each{|value| nanoparticle.parse_ambit_value feature, value} if v.is_a? Array
+                feature = Feature.find_or_create_by(
+                  :name => name,
+                  :unit => unit,
+                  :source => source
+                )
               end
-              nanoparticle.bundles.uniq!
-              nanoparticle.physchem_descriptors.each{|f,v| v.uniq!}
-              nanoparticle.toxicities.each{|f,v| v.uniq!}
-              nanoparticle.save!
+              v.each{|value| nanoparticle.parse_ambit_value feature, value} if v.is_a? Array
+            end
+            nanoparticle.bundles.uniq!
+            nanoparticle.physchem_descriptors.each{|f,v| v.uniq!}
+            #nanoparticle.toxicities.each{|f,v| v.uniq!}
+            nanoparticle.toxicities.each do |f,v|
+              dataset.data_entries[nanoparticle.id.to_s][f.to_s] ||= []
+              dataset.data_entries[nanoparticle.id.to_s][f.to_s] += v
+            end
+            nanoparticle.save
           end
+          dataset.save
+          datasets << dataset
         end
+        datasets.collect{|d| d.id}
+      end
 
-        def self.dump
-          #get list of bundle URIs
-          `wget 'https://data.enanomapper.net/bundle?media=application%2Fjson' -O bundles.json`
-          json = JSON.parse File.read('./bundles.json')
-          json["dataset"].each do |dataset|
-            uri = dataset["URI"]
-            id = uri.split("/").last
-            `wget --header='accept:application/json' '#{uri}' -O 'bundle#{id}'`
-            `wget --header='accept:application/json' '#{dataset["summary"]}' -O 'summary#{id}.json'`
-            `wget --header='accept:application/json' '#{dataset["compound"]}' -O 'compound#{id}.json'`
-            `wget --header='accept:application/json' '#{dataset["substance"]}' -O 'substance#{id}.json'`
-            `wget --header='accept:application/json' '#{dataset["property"]}' -O 'property#{id}.json'`
-            `wget --header='accept:application/json' '#{dataset["dataset"]}' -O 'dataset#{id}.json'`
-            `wget --header='accept:application/json' '#{dataset["matrix"]}' -O 'matrix#{id}.json'`
-          end
+      def self.dump
+        #get list of bundle URIs
+        `wget 'https://data.enanomapper.net/bundle?media=application%2Fjson' -O bundles.json`
+        json = JSON.parse File.read('./bundles.json')
+        json["dataset"].each do |dataset|
+          uri = dataset["URI"]
+          id = uri.split("/").last
+          `wget --header='accept:application/json' '#{uri}' -O 'bundle#{id}'`
+          `wget --header='accept:application/json' '#{dataset["summary"]}' -O 'summary#{id}.json'`
+          `wget --header='accept:application/json' '#{dataset["compound"]}' -O 'compound#{id}.json'`
+          `wget --header='accept:application/json' '#{dataset["substance"]}' -O 'substance#{id}.json'`
+          `wget --header='accept:application/json' '#{dataset["property"]}' -O 'property#{id}.json'`
+          `wget --header='accept:application/json' '#{dataset["dataset"]}' -O 'dataset#{id}.json'`
+          `wget --header='accept:application/json' '#{dataset["matrix"]}' -O 'matrix#{id}.json'`
         end
       end
 
diff --git a/lib/lazar.rb b/lib/lazar.rb
index a1ad551..8eb46e0 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -71,6 +71,7 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","Cross
   "opentox.rb",
   "feature.rb",
   "physchem.rb",
+  "substance.rb",
   "compound.rb",
   "nanoparticle.rb",
   "dataset.rb",
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
index c58dc8c..6e9b0ea 100644
--- a/lib/nanoparticle.rb
+++ b/lib/nanoparticle.rb
@@ -6,7 +6,6 @@ module OpenTox
     field :core, type: String
     field :coating, type: Array, default: []
 
-    field :physchem_descriptors, type: Hash, default: {}
     field :toxicities, type: Hash, default: {}
     #field :features, type: Hash, default: {}
     field :bundles, type: Array, default: []
diff --git a/lib/substance.rb b/lib/substance.rb
index a5b9825..6768ce7 100644
--- a/lib/substance.rb
+++ b/lib/substance.rb
@@ -1,9 +1,8 @@
 module OpenTox
 
   class Substance
-    include OpenTox
-    include Mongoid::Document
-    include Mongoid::Timestamps
+    field :physchem_descriptors, type: Hash, default: {}
+    field :dataset_ids, type: Array, default: []
   end
 
 end
diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb
index 8a6836c..6f241ec 100644
--- a/test/nanoparticles.rb
+++ b/test/nanoparticles.rb
@@ -3,8 +3,12 @@ require_relative "setup.rb"
 class NanoparticleTest  < MiniTest::Test
 
   def test_import
-    Import::Enanomapper.import
+    dataset_ids = Import::Enanomapper.import
     assert_operator Nanoparticle.count , :>, 570, "Only #{Nanoparticle.count} nanoparticles imported"
+    assert_operator dataset_ids.size, :>, 8, "Only #{dataset_ids.size} bundles imported"
+    p dataset_ids.collect{|d| Dataset.find(d).name}
+    assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("NanoWiki")
+    assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
   end
 
   def test_create_model
author	Christoph Helma <helma@in-silico.ch>	2016-04-13 18:18:36 +0200
committer	Christoph Helma <helma@in-silico.ch>	2016-04-13 18:18:36 +0200
commit	64f1f32ced77afb278bdb7c27397c5299a73675c (patch)
tree	b44cdc6c9533be8e33815fb16e83a341c35ea3d1
parent	815cf6ba1543fc323eb7cbd1202fadbf03bcfbca (diff)