summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- lib/compound.rb 6
-rw-r--r-- lib/import.rb 101
-rw-r--r-- lib/model.rb 4
-rw-r--r-- lib/nanoparticle.rb 21
-rw-r--r-- lib/regression.rb 6
-rw-r--r-- lib/substance.rb 2
-rw-r--r-- test/nanoparticles.rb 29
7 files changed, 89 insertions, 80 deletions
diff --git a/lib/compound.rb b/lib/compound.rb
index c2ce5d0..143c4f2 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -77,7 +77,7 @@ module OpenTox
def physchem descriptors=PhysChem.openbabel_descriptors
# TODO: speedup java descriptors
- calculated_ids = physchem_descriptors.keys
+ calculated_ids = physchem.keys
# BSON::ObjectId instances are not allowed as keys in a BSON document.
new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids
descs = {}
@@ -90,11 +90,11 @@ module OpenTox
# avoid recalculating Cdk features with multiple values
descs.keys.uniq.each do |k|
descs[k].send(k[0].downcase,k[1],self).each do |n,v|
- physchem_descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document.
+ physchem[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document.
end
end
save
- physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id}
+ physchem.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id}
end
def smarts_match smarts, count=false
diff --git a/lib/import.rb b/lib/import.rb
index 3c1edfe..11cb367 100644
--- a/lib/import.rb
+++ b/lib/import.rb
@@ -5,47 +5,73 @@ module OpenTox
class Enanomapper
include OpenTox
- def self.import
+ def self.mirror dir="."
#get list of bundle URIs
bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
+ File.open(File.join(dir,"bundles.json"),"w+"){|f| f.puts JSON.pretty_generate(bundles)}
datasets = []
bundles.each do |bundle|
- uri = bundle["URI"]
- dataset = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"])
nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"]
- features = JSON.parse(RestClientWrapper.get(bundle["property"]+"?media=application%2Fjson"))["feature"]
- nanoparticles.each do |np|
- nanoparticle = Nanoparticle.find_or_create_by(
- :name => np["values"]["https://data.enanomapper.net/identifier/name"],
- :source => np["compound"]["URI"],
- )
- dataset.substance_ids << nanoparticle.id
- dataset.substance_ids.uniq!
- studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study")))["study"]
+ nanoparticles.each do |nanoparticle|
+ uuid = nanoparticle["values"]["https://data.enanomapper.net/identifier/uuid"]
+ $logger.debug uuid
+ File.open(File.join(dir,"nanoparticle-#{uuid}.json"),"w+"){|f| f.puts JSON.pretty_generate(nanoparticle)}
+ studies = JSON.parse(RestClientWrapper.get(File.join(nanoparticle["compound"]["URI"],"study")))["study"]
studies.each do |study|
- study["effects"].each do |effect|
- effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature
- # TODO parse core/coating
- # TODO parse proteomics, they come as a large textValue
- $logger.debug File.join(np["compound"]["URI"],"study")
- effect["conditions"].delete_if { |k, v| v.nil? }
+ File.open(File.join(dir,"study-#{study["uuid"]}.json"),"w+"){|f| f.puts JSON.pretty_generate(study)} # one file per study: naming by the nanoparticle uuid made every study of a particle overwrite the previous one
+ end
+ end
+ end
+ end
+
+ def self.import dir="."
+ datasets = {}
+ JSON.parse(File.read(File.join(dir,"bundles.json"))).each do |bundle|
+ datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"])
+ end
+ Dir[File.join(dir,"study*.json")].each do |s|
+ study = JSON.parse(File.read(s))
+ np = JSON.parse(File.read(File.join(dir,"nanoparticle-#{study['owner']['substance']['uuid']}.json")))
+ nanoparticle = Nanoparticle.find_or_create_by(
+ :name => np["values"]["https://data.enanomapper.net/identifier/name"],
+ :source => np["compound"]["URI"],
+ )
+ np["bundles"].keys.each do |bundle_uri|
+ datasets[bundle_uri].substance_ids << nanoparticle.id
+ nanoparticle["dataset_ids"] |= [datasets[bundle_uri].id] # set union: the same nanoparticle is revisited for each of its study files (and on re-import), so << would pile up duplicate ids
+ end
+ study["effects"].each do |effect|
+ effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature
+ # TODO parse core/coating
+ # TODO parse proteomics, they come as a large textValue
+ #$logger.debug File.join(np["compound"]["URI"],"study")
+ effect["conditions"].delete_if { |k, v| v.nil? }
+ # parse proteomics data
+ if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50
+ JSON.parse(effect["result"]["textValue"]).each do |identifier, value|
feature = klass.find_or_create_by(
- #:source => File.join(np["compound"]["URI"],"study"),
- :name => "#{study["protocol"]["category"]["title"]} #{study["protocol"]["endpoint"]}",
- :unit => effect["result"]["unit"],
- :category => study["protocol"]["topcategory"],
- :conditions => effect["conditions"]
+ :name => identifier,
+ :category => "Proteomics",
)
- nanoparticle.parse_ambit_value feature, effect["result"]
- dataset.feature_ids << feature.id
- dataset.feature_ids.uniq!
+ nanoparticle.parse_ambit_value feature, value
end
+ else
+ feature = klass.find_or_create_by(
+ :name => "#{study["protocol"]["category"]["title"]} #{study["protocol"]["endpoint"]}",
+ :unit => effect["result"]["unit"],
+ :category => study["protocol"]["topcategory"],
+ :conditions => effect["conditions"]
+ )
+ nanoparticle.parse_ambit_value feature, effect["result"]
end
end
- dataset.save
- datasets << dataset
+ nanoparticle.save
+ end
+ datasets.each do |u,d|
+ d.feature_ids.uniq!
+ d.substance_ids.uniq!
+ d.save
end
- datasets.collect{|d| d.id}
end
=begin
@@ -64,23 +90,6 @@ module OpenTox
end
=end
- def self.dump
- #get list of bundle URIs
- `wget 'https://data.enanomapper.net/bundle?media=application%2Fjson' -O bundles.json`
- json = JSON.parse File.read('./bundles.json')
- json["dataset"].each do |dataset|
- uri = dataset["URI"]
- id = uri.split("/").last
- `wget --header='accept:application/json' '#{uri}' -O 'bundle#{id}'`
- `wget --header='accept:application/json' '#{dataset["summary"]}' -O 'summary#{id}.json'`
- `wget --header='accept:application/json' '#{dataset["compound"]}' -O 'compound#{id}.json'`
- `wget --header='accept:application/json' '#{dataset["substance"]}' -O 'substance#{id}.json'`
- `wget --header='accept:application/json' '#{dataset["property"]}' -O 'property#{id}.json'`
- `wget --header='accept:application/json' '#{dataset["dataset"]}' -O 'dataset#{id}.json'`
- `wget --header='accept:application/json' '#{dataset["matrix"]}' -O 'matrix#{id}.json'`
- end
- end
-
end
end
diff --git a/lib/model.rb b/lib/model.rb
index 841ab20..12abc6e 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -47,9 +47,9 @@ module OpenTox
end
end
R.assign "tox", toxicities
- feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq
+ feature_ids = training_dataset.substances.collect{ |s| s["physchem"].keys}.flatten.uniq
feature_ids.each do |feature_id|
- feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id]}
+ feature_values = substances.collect{|s| s["physchem"][feature_id]}
R.assign "feature", feature_values
begin
#R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')"
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
index dda4a9f..c9fbb77 100644
--- a/lib/nanoparticle.rb
+++ b/lib/nanoparticle.rb
@@ -6,6 +6,7 @@ module OpenTox
field :core, type: String
field :coating, type: Array, default: []
field :bundles, type: Array, default: []
+ field :proteomics, type: Hash, default: {}
def nanoparticle_neighbors params
Dataset.find(params[:training_dataset_id]).nanoparticles.collect{|np| np["tanimoto"] = 1; np}
@@ -14,21 +15,18 @@ module OpenTox
def add_feature feature, value
case feature.category
when "P-CHEM"
- physchem_descriptors[feature.id.to_s] ||= []
- physchem_descriptors[feature.id.to_s] << value
- physchem_descriptors[feature.id.to_s].uniq!
+ physchem[feature.id.to_s] ||= []
+ physchem[feature.id.to_s] << value
+ physchem[feature.id.to_s].uniq!
+ when "Proteomics"
+ proteomics[feature.id.to_s] ||= []
+ proteomics[feature.id.to_s] << value
+ proteomics[feature.id.to_s].uniq!
when "TOX"
toxicities[feature.id.to_s] ||= []
# TODO generic way of parsing TOX values
if feature.name == "7.99 Toxicity (other) ICP-AES" and feature.unit == "mL/ug(Mg)"
toxicities[feature.id.to_s] << -Math.log10(value)
- #if value.numeric?
- #begin
- #rescue
- #p feature
- #p value
- #exit
- #end
else
toxicities[feature.id.to_s] << value
end
@@ -36,7 +34,6 @@ module OpenTox
else
warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted."
end
- save
end
def parse_ambit_value feature, v
@@ -79,5 +76,3 @@ module OpenTox
end
end
-
-
diff --git a/lib/regression.rb b/lib/regression.rb
index d2c4e91..fe45f99 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -84,7 +84,7 @@ module OpenTox
activities = []
weights = []
- pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq
+ pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem.keys}.flatten.uniq
data_frame = []
data_frame[0] = []
@@ -93,7 +93,7 @@ module OpenTox
n["toxicities"][params[:prediction_feature_id].to_s].each do |act|
data_frame[0][i] = act
n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ?
- neighbor.physchem_descriptors.each do |pid,values|
+ neighbor.physchem.each do |pid,values|
values.uniq!
warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1
j = pc_ids.index(pid)+1
@@ -121,7 +121,7 @@ module OpenTox
return result
else
query_descriptors = pc_ids.collect do |i|
- compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA"
+ compound.physchem[i] ? compound.physchem[i].for_R : "NA" # both reads use the renamed field; physchem_descriptors no longer exists on Substance
end
remove_idx = []
query_descriptors.each_with_index do |v,i|
diff --git a/lib/substance.rb b/lib/substance.rb
index 82ca65d..34bc94a 100644
--- a/lib/substance.rb
+++ b/lib/substance.rb
@@ -1,7 +1,7 @@
module OpenTox
class Substance
- field :physchem_descriptors, type: Hash, default: {}
+ field :physchem, type: Hash, default: {}
field :toxicities, type: Hash, default: {}
field :dataset_ids, type: Array, default: []
end
diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb
index 7308a83..69cfd30 100644
--- a/test/nanoparticles.rb
+++ b/test/nanoparticles.rb
@@ -4,22 +4,27 @@ require_relative "setup.rb"
class NanoparticleTest < MiniTest::Test
def setup
- `mongorestore --db=development #{File.join(File.dirname(__FILE__),"..","dump","production")}`
+ #`mongorestore --db=development #{File.join(File.dirname(__FILE__),"..","dump","production")}`
+ end
+
+ def test_mirror
+ Import::Enanomapper.mirror File.join(File.dirname(__FILE__),"..","data")
end
def test_import
- skip
- dataset_ids = Import::Enanomapper.import
- assert_operator Nanoparticle.count , :>, 570, "Only #{Nanoparticle.count} nanoparticles imported"
- assert_operator dataset_ids.size, :>, 8, "Only #{dataset_ids.size} bundles imported"
- assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("NanoWiki")
- assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
- p dataset_ids.collect{|d| {d => Dataset.find(d).name}}
- dataset_ids.collect do |d|
- d = Dataset.find(d)
+ Import::Enanomapper.import File.join(File.dirname(__FILE__),"..","data")
+# skip
+# dataset_ids = Import::Enanomapper.import
+# assert_operator Nanoparticle.count , :>, 570, "Only #{Nanoparticle.count} nanoparticles imported"
+# assert_operator dataset_ids.size, :>, 8, "Only #{dataset_ids.size} bundles imported"
+# assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("NanoWiki")
+# assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
+# p dataset_ids.collect{|d| {d => Dataset.find(d).name}}
+# dataset_ids.collect do |d|
+# d = Dataset.find(d)
#p d.name
#puts d.to_csv
- end
+# end
end
def test_summaries
@@ -35,7 +40,7 @@ class NanoparticleTest < MiniTest::Test
toxcounts[t] ||= 0
toxcounts[t] += 1#v.uniq.size
end
- np.physchem_descriptors.each do |t,v|
+ np.physchem.each do |t,v|
pccounts[t] ||= 0
pccounts[t] += 1#v.uniq.size
end