From ab7b37541b4f8a762be737009631d3eefd898b4a Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 5 May 2016 16:14:02 +0200 Subject: ambit mirror, import from mirrored json, proteomics import --- lib/compound.rb | 6 +-- lib/import.rb | 101 +++++++++++++++++++++++++++----------------------- lib/model.rb | 4 +- lib/nanoparticle.rb | 21 ++++------- lib/regression.rb | 6 +-- lib/substance.rb | 2 +- test/nanoparticles.rb | 29 +++++++++------ 7 files changed, 89 insertions(+), 80 deletions(-) diff --git a/lib/compound.rb b/lib/compound.rb index c2ce5d0..143c4f2 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -77,7 +77,7 @@ module OpenTox def physchem descriptors=PhysChem.openbabel_descriptors # TODO: speedup java descriptors - calculated_ids = physchem_descriptors.keys + calculated_ids = physchem.keys # BSON::ObjectId instances are not allowed as keys in a BSON document. new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids descs = {} @@ -90,11 +90,11 @@ module OpenTox # avoid recalculating Cdk features with multiple values descs.keys.uniq.each do |k| descs[k].send(k[0].downcase,k[1],self).each do |n,v| - physchem_descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. + physchem[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. end end save - physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} + physchem.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} end def smarts_match smarts, count=false diff --git a/lib/import.rb b/lib/import.rb index 3c1edfe..11cb367 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -5,47 +5,73 @@ module OpenTox class Enanomapper include OpenTox - def self.import + def self.mirror dir="." 
#get list of bundle URIs bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] + File.open(File.join(dir,"bundles.json"),"w+"){|f| f.puts JSON.pretty_generate(bundles)} datasets = [] bundles.each do |bundle| - uri = bundle["URI"] - dataset = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"]) nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"] - features = JSON.parse(RestClientWrapper.get(bundle["property"]+"?media=application%2Fjson"))["feature"] - nanoparticles.each do |np| - nanoparticle = Nanoparticle.find_or_create_by( - :name => np["values"]["https://data.enanomapper.net/identifier/name"], - :source => np["compound"]["URI"], - ) - dataset.substance_ids << nanoparticle.id - dataset.substance_ids.uniq! - studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study")))["study"] + nanoparticles.each do |nanoparticle| + uuid = nanoparticle["values"]["https://data.enanomapper.net/identifier/uuid"] + $logger.debug uuid + File.open(File.join(dir,"nanoparticle-#{uuid}.json"),"w+"){|f| f.puts JSON.pretty_generate(nanoparticle)} + studies = JSON.parse(RestClientWrapper.get(File.join(nanoparticle["compound"]["URI"],"study")))["study"] studies.each do |study| - study["effects"].each do |effect| - effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature - # TODO parse core/coating - # TODO parse proteomics, they come as a large textValue - $logger.debug File.join(np["compound"]["URI"],"study") - effect["conditions"].delete_if { |k, v| v.nil? } + File.open(File.join(dir,"study-#{uuid}.json"),"w+"){|f| f.puts JSON.pretty_generate(study)} + end + end + end + end + + def self.import dir="." 
+ datasets = {} + JSON.parse(File.read(File.join(dir,"bundles.json"))).each do |bundle| + datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"]) + end + Dir[File.join(dir,"study*.json")].each do |s| + study = JSON.parse(File.read(s)) + np = JSON.parse(File.read(File.join(dir,"nanoparticle-#{study['owner']['substance']['uuid']}.json"))) + nanoparticle = Nanoparticle.find_or_create_by( + :name => np["values"]["https://data.enanomapper.net/identifier/name"], + :source => np["compound"]["URI"], + ) + np["bundles"].keys.each do |bundle_uri| + datasets[bundle_uri].substance_ids << nanoparticle.id + nanoparticle["dataset_ids"] << datasets[bundle_uri].id + end + study["effects"].each do |effect| + effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature + # TODO parse core/coating + # TODO parse proteomics, they come as a large textValue + #$logger.debug File.join(np["compound"]["URI"],"study") + effect["conditions"].delete_if { |k, v| v.nil? } + # parse proteomics data + if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 + JSON.parse(effect["result"]["textValue"]).each do |identifier, value| feature = klass.find_or_create_by( - #:source => File.join(np["compound"]["URI"],"study"), - :name => "#{study["protocol"]["category"]["title"]} #{study["protocol"]["endpoint"]}", - :unit => effect["result"]["unit"], - :category => study["protocol"]["topcategory"], - :conditions => effect["conditions"] + :name => identifier, + :category => "Proteomics", ) - nanoparticle.parse_ambit_value feature, effect["result"] - dataset.feature_ids << feature.id - dataset.feature_ids.uniq! 
+ nanoparticle.parse_ambit_value feature, value end + else + feature = klass.find_or_create_by( + :name => "#{study["protocol"]["category"]["title"]} #{study["protocol"]["endpoint"]}", + :unit => effect["result"]["unit"], + :category => study["protocol"]["topcategory"], + :conditions => effect["conditions"] + ) + nanoparticle.parse_ambit_value feature, effect["result"] end end - dataset.save - datasets << dataset + nanoparticle.save + end + datasets.each do |u,d| + d.feature_ids.uniq! + d.substance_ids.uniq! + d.save end - datasets.collect{|d| d.id} end =begin @@ -64,23 +90,6 @@ module OpenTox end =end - def self.dump - #get list of bundle URIs - `wget 'https://data.enanomapper.net/bundle?media=application%2Fjson' -O bundles.json` - json = JSON.parse File.read('./bundles.json') - json["dataset"].each do |dataset| - uri = dataset["URI"] - id = uri.split("/").last - `wget --header='accept:application/json' '#{uri}' -O 'bundle#{id}'` - `wget --header='accept:application/json' '#{dataset["summary"]}' -O 'summary#{id}.json'` - `wget --header='accept:application/json' '#{dataset["compound"]}' -O 'compound#{id}.json'` - `wget --header='accept:application/json' '#{dataset["substance"]}' -O 'substance#{id}.json'` - `wget --header='accept:application/json' '#{dataset["property"]}' -O 'property#{id}.json'` - `wget --header='accept:application/json' '#{dataset["dataset"]}' -O 'dataset#{id}.json'` - `wget --header='accept:application/json' '#{dataset["matrix"]}' -O 'matrix#{id}.json'` - end - end - end end diff --git a/lib/model.rb b/lib/model.rb index 841ab20..12abc6e 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -47,9 +47,9 @@ module OpenTox end end R.assign "tox", toxicities - feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq + feature_ids = training_dataset.substances.collect{ |s| s["physchem"].keys}.flatten.uniq feature_ids.each do |feature_id| - feature_values = substances.collect{|s| 
s["physchem_descriptors"][feature_id]} + feature_values = substances.collect{|s| s["physchem"][feature_id]} R.assign "feature", feature_values begin #R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')" diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index dda4a9f..c9fbb77 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -6,6 +6,7 @@ module OpenTox field :core, type: String field :coating, type: Array, default: [] field :bundles, type: Array, default: [] + field :proteomics, type: Hash, default: {} def nanoparticle_neighbors params Dataset.find(params[:training_dataset_id]).nanoparticles.collect{|np| np["tanimoto"] = 1; np} @@ -14,21 +15,18 @@ module OpenTox def add_feature feature, value case feature.category when "P-CHEM" - physchem_descriptors[feature.id.to_s] ||= [] - physchem_descriptors[feature.id.to_s] << value - physchem_descriptors[feature.id.to_s].uniq! + physchem[feature.id.to_s] ||= [] + physchem[feature.id.to_s] << value + physchem[feature.id.to_s].uniq! + when "Proteomics" + proteomics[feature.id.to_s] ||= [] + proteomics[feature.id.to_s] << value + proteomics[feature.id.to_s].uniq! when "TOX" toxicities[feature.id.to_s] ||= [] # TODO generic way of parsing TOX values if feature.name == "7.99 Toxicity (other) ICP-AES" and feature.unit == "mL/ug(Mg)" toxicities[feature.id.to_s] << -Math.log10(value) - #if value.numeric? - #begin - #rescue - #p feature - #p value - #exit - #end else toxicities[feature.id.to_s] << value end @@ -36,7 +34,6 @@ module OpenTox else warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted." 
end - save end def parse_ambit_value feature, v @@ -79,5 +76,3 @@ module OpenTox end end - - diff --git a/lib/regression.rb b/lib/regression.rb index d2c4e91..fe45f99 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -84,7 +84,7 @@ module OpenTox activities = [] weights = [] - pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq + pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem.keys}.flatten.uniq data_frame = [] data_frame[0] = [] @@ -93,7 +93,7 @@ module OpenTox n["toxicities"][params[:prediction_feature_id].to_s].each do |act| data_frame[0][i] = act n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? - neighbor.physchem_descriptors.each do |pid,values| + neighbor.physchem.each do |pid,values| values.uniq! warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1 j = pc_ids.index(pid)+1 @@ -121,7 +121,7 @@ module OpenTox return result else query_descriptors = pc_ids.collect do |i| - compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA" + compound.physchem[i] ? 
compound.physchem[i].for_R : "NA" end remove_idx = [] query_descriptors.each_with_index do |v,i| diff --git a/lib/substance.rb b/lib/substance.rb index 82ca65d..34bc94a 100644 --- a/lib/substance.rb +++ b/lib/substance.rb @@ -1,7 +1,7 @@ module OpenTox class Substance - field :physchem_descriptors, type: Hash, default: {} + field :physchem, type: Hash, default: {} field :toxicities, type: Hash, default: {} field :dataset_ids, type: Array, default: [] end diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb index 7308a83..69cfd30 100644 --- a/test/nanoparticles.rb +++ b/test/nanoparticles.rb @@ -4,22 +4,27 @@ require_relative "setup.rb" class NanoparticleTest < MiniTest::Test def setup - `mongorestore --db=development #{File.join(File.dirname(__FILE__),"..","dump","production")}` + #`mongorestore --db=development #{File.join(File.dirname(__FILE__),"..","dump","production")}` + end + + def test_mirror + Import::Enanomapper.mirror File.join(File.dirname(__FILE__),"..","data") end def test_import - skip - dataset_ids = Import::Enanomapper.import - assert_operator Nanoparticle.count , :>, 570, "Only #{Nanoparticle.count} nanoparticles imported" - assert_operator dataset_ids.size, :>, 8, "Only #{dataset_ids.size} bundles imported" - assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("NanoWiki") - assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") - p dataset_ids.collect{|d| {d => Dataset.find(d).name}} - dataset_ids.collect do |d| - d = Dataset.find(d) + Import::Enanomapper.import File.join(File.dirname(__FILE__),"..","data") +# skip +# dataset_ids = Import::Enanomapper.import +# assert_operator Nanoparticle.count , :>, 570, "Only #{Nanoparticle.count} nanoparticles imported" +# assert_operator dataset_ids.size, :>, 8, "Only #{dataset_ids.size} bundles imported" +# assert dataset_ids.collect{|d| Dataset.find(d).name}.include? 
("NanoWiki") +# assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") +# p dataset_ids.collect{|d| {d => Dataset.find(d).name}} +# dataset_ids.collect do |d| +# d = Dataset.find(d) #p d.name #puts d.to_csv - end +# end end def test_summaries @@ -35,7 +40,7 @@ class NanoparticleTest < MiniTest::Test toxcounts[t] ||= 0 toxcounts[t] += 1#v.uniq.size end - np.physchem_descriptors.each do |t,v| + np.physchem.each do |t,v| pccounts[t] ||= 0 pccounts[t] += 1#v.uniq.size end -- cgit v1.2.3