From 64f1f32ced77afb278bdb7c27397c5299a73675c Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 13 Apr 2016 18:18:36 +0200 Subject: improved enm import --- .gitignore | 5 +-- lib/compound.rb | 2 - lib/import.rb | 105 ++++++++++++++++++++++++++++---------------------- lib/lazar.rb | 1 + lib/nanoparticle.rb | 1 - lib/substance.rb | 5 +-- test/nanoparticles.rb | 6 ++- 7 files changed, 67 insertions(+), 58 deletions(-) diff --git a/.gitignore b/.gitignore index 791dc27..fb51df7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,4 @@ -last-utils -libfminer openbabel -fminer_debug.txt -test/fminer_debug.txt Gemfile.lock *.gem .bundle @@ -11,3 +7,4 @@ pkg/* .yardoc/ doc/ lazar.log +data diff --git a/lib/compound.rb b/lib/compound.rb index 757ba1a..7895619 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -17,8 +17,6 @@ module OpenTox field :sdf_id, type: BSON::ObjectId field :fingerprints, type: Hash, default: {} field :default_fingerprint_size, type: Integer - field :physchem_descriptors, type: Hash, default: {} - field :dataset_ids, type: Array, default: [] # TODO separate between physchem, bio and tox field :features, type: Hash, default: {} diff --git a/lib/import.rb b/lib/import.rb index 86c633a..cf0855e 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -8,64 +8,75 @@ module OpenTox def self.import #get list of bundle URIs bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] + datasets = [] bundles.each do |bundle| uri = bundle["URI"] + dataset = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"]) nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"] features = JSON.parse(RestClientWrapper.get(bundle["property"]+"?media=application%2Fjson"))["feature"] nanoparticles.each do |np| - nanoparticle = Nanoparticle.find_or_create_by( - :name => np["values"]["https://data.enanomapper.net/identifier/name"], - :source => np["compound"]["URI"], - ) - nanoparticle.bundles << uri - np["composition"].each do |comp| - case comp["relation"] - when "HAS_CORE" - nanoparticle.core = comp["component"]["compound"]["URI"] - when "HAS_COATING" - nanoparticle.coating << comp["component"]["compound"]["URI"] - end - end if np["composition"] - np["values"].each do |u,v| - if u.match(/property/) - name, unit, source = nil - features.each do |uri,feat| - if u.match(/#{uri}/) - name = feat["title"] - unit = feat["units"] - source = uri - end + nanoparticle = Nanoparticle.find_or_create_by( + :name => np["values"]["https://data.enanomapper.net/identifier/name"], + :source => np["compound"]["URI"], + ) + dataset.data_entries[nanoparticle.id.to_s] ||= {} + nanoparticle.bundles << uri + nanoparticle.dataset_ids << dataset.id + np["composition"].each do |comp| + case comp["relation"] + when "HAS_CORE" + nanoparticle.core = comp["component"]["compound"]["URI"] + when "HAS_COATING" + nanoparticle.coating << comp["component"]["compound"]["URI"] + end + end if np["composition"] + np["values"].each do |u,v| + if u.match(/property/) + name, unit, source = nil + features.each do |uri,feat| + if u.match(/#{uri}/) + name = feat["title"] + unit = feat["units"] + source = uri end - feature = Feature.find_or_create_by( - :name => name, - :unit => unit, - :source => source - ) end - v.each{|value| nanoparticle.parse_ambit_value feature, value} if v.is_a? Array + feature = Feature.find_or_create_by( + :name => name, + :unit => unit, + :source => source + ) end - nanoparticle.bundles.uniq! - nanoparticle.physchem_descriptors.each{|f,v| v.uniq!} - nanoparticle.toxicities.each{|f,v| v.uniq!} - nanoparticle.save! + v.each{|value| nanoparticle.parse_ambit_value feature, value} if v.is_a? Array + end + nanoparticle.bundles.uniq! + nanoparticle.physchem_descriptors.each{|f,v| v.uniq!} + #nanoparticle.toxicities.each{|f,v| v.uniq!} + nanoparticle.toxicities.each do |f,v| + dataset.data_entries[nanoparticle.id.to_s][f.to_s] ||= [] + dataset.data_entries[nanoparticle.id.to_s][f.to_s] += v + end + nanoparticle.save end + dataset.save + datasets << dataset end + datasets.collect{|d| d.id} + end - def self.dump - #get list of bundle URIs - `wget 'https://data.enanomapper.net/bundle?media=application%2Fjson' -O bundles.json` - json = JSON.parse File.read('./bundles.json') - json["dataset"].each do |dataset| - uri = dataset["URI"] - id = uri.split("/").last - `wget --header='accept:application/json' '#{uri}' -O 'bundle#{id}'` - `wget --header='accept:application/json' '#{dataset["summary"]}' -O 'summary#{id}.json'` - `wget --header='accept:application/json' '#{dataset["compound"]}' -O 'compound#{id}.json'` - `wget --header='accept:application/json' '#{dataset["substance"]}' -O 'substance#{id}.json'` - `wget --header='accept:application/json' '#{dataset["property"]}' -O 'property#{id}.json'` - `wget --header='accept:application/json' '#{dataset["dataset"]}' -O 'dataset#{id}.json'` - `wget --header='accept:application/json' '#{dataset["matrix"]}' -O 'matrix#{id}.json'` - end + def self.dump + #get list of bundle URIs + `wget 'https://data.enanomapper.net/bundle?media=application%2Fjson' -O bundles.json` + json = JSON.parse File.read('./bundles.json') + json["dataset"].each do |dataset| + uri = dataset["URI"] + id = uri.split("/").last + `wget --header='accept:application/json' '#{uri}' -O 'bundle#{id}'` + `wget --header='accept:application/json' '#{dataset["summary"]}' -O 'summary#{id}.json'` + `wget --header='accept:application/json' '#{dataset["compound"]}' -O 'compound#{id}.json'` + `wget --header='accept:application/json' '#{dataset["substance"]}' -O 'substance#{id}.json'` + `wget --header='accept:application/json' '#{dataset["property"]}' -O 'property#{id}.json'` + `wget --header='accept:application/json' '#{dataset["dataset"]}' -O 'dataset#{id}.json'` + `wget --header='accept:application/json' '#{dataset["matrix"]}' -O 'matrix#{id}.json'` end end diff --git a/lib/lazar.rb b/lib/lazar.rb index a1ad551..8eb46e0 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -71,6 +71,7 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","Cross "opentox.rb", "feature.rb", "physchem.rb", + "substance.rb", "compound.rb", "nanoparticle.rb", "dataset.rb", diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index c58dc8c..6e9b0ea 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -6,7 +6,6 @@ module OpenTox field :core, type: String field :coating, type: Array, default: [] - field :physchem_descriptors, type: Hash, default: {} field :toxicities, type: Hash, default: {} #field :features, type: Hash, default: {} field :bundles, type: Array, default: [] diff --git a/lib/substance.rb b/lib/substance.rb index a5b9825..6768ce7 100644 --- a/lib/substance.rb +++ b/lib/substance.rb @@ -1,9 +1,8 @@ module OpenTox class Substance - include OpenTox - include Mongoid::Document - include Mongoid::Timestamps + field :physchem_descriptors, type: Hash, default: {} + field :dataset_ids, type: Array, default: [] end end diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb index 8a6836c..6f241ec 100644 --- a/test/nanoparticles.rb +++ b/test/nanoparticles.rb @@ -3,8 +3,12 @@ require_relative "setup.rb" class NanoparticleTest < MiniTest::Test def test_import - Import::Enanomapper.import + dataset_ids = Import::Enanomapper.import assert_operator Nanoparticle.count , :>, 570, "Only #{Nanoparticle.count} nanoparticles imported" + assert_operator dataset_ids.size, :>, 8, "Only #{dataset_ids.size} bundles imported" + p dataset_ids.collect{|d| Dataset.find(d).name} + assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("NanoWiki") + assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") end def test_create_model -- cgit v1.2.3