From 611bac891177f8d9185d45486dd574b6ef4d1912 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 9 May 2016 15:11:46 +0200 Subject: nanoparticle models fixed --- data/enm-dump.rb | 17 --------- data/enm-import.rb | 47 ----------------------- lib/dataset.rb | 8 ++-- lib/import.rb | 6 +-- lib/model.rb | 1 + lib/nanoparticle.rb | 37 ++++++++++-------- lib/regression.rb | 2 +- scripts/import-enm.rb | 7 ---- scripts/mirror-enm2test.rb | 4 ++ test/nanoparticles.rb | 95 +++++++++++++++++++--------------------------- 10 files changed, 72 insertions(+), 152 deletions(-) delete mode 100644 data/enm-dump.rb delete mode 100644 data/enm-import.rb delete mode 100755 scripts/import-enm.rb create mode 100755 scripts/mirror-enm2test.rb diff --git a/data/enm-dump.rb b/data/enm-dump.rb deleted file mode 100644 index 88667fc..0000000 --- a/data/enm-dump.rb +++ /dev/null @@ -1,17 +0,0 @@ -require 'json' - -#get list of bundle URIs -`wget 'https://data.enanomapper.net/bundle?media=application%2Fjson' -O bundles.json` -json = JSON.parse File.read('./bundles.json') -json["dataset"].each do |dataset| - uri = dataset["URI"] - id = uri.split("/").last - #`wget --header='accept:application/json' '#{uri}' -O 'bundle#{id}'` - `wget --header='accept:application/ld+json' '#{uri}/substance' -O 'study#{id}.json'` - #`wget --header='accept:application/json' '#{dataset["summary"]}' -O 'summary#{id}.json'` - #`wget --header='accept:application/json' '#{dataset["compound"]}' -O 'compound#{id}.json'` - #`wget --header='accept:application/json' '#{dataset["substance"]}' -O 'substance#{id}.json'` - #`wget --header='accept:application/json' '#{dataset["property"]}' -O 'property#{id}.json'` - #`wget --header='accept:application/json' '#{dataset["dataset"]}' -O 'dataset#{id}.json'` - #`wget --header='accept:application/json' '#{dataset["matrix"]}' -O 'matrix#{id}.json'` -end diff --git a/data/enm-import.rb b/data/enm-import.rb deleted file mode 100644 index 37bc22b..0000000 --- a/data/enm-import.rb +++ /dev/null @@ -1,47 +0,0 @@ -require_relative '../lib/lazar.rb' -include OpenTox -$mongo.database.drop -$gridfs = $mongo.database.fs - -#get list of bundle URIs -bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] -bundles.each do |bundle| - uri = bundle["URI"] - nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"] - features = JSON.parse(RestClientWrapper.get(bundle["property"]+"?media=application%2Fjson"))["feature"] - nanoparticles.each do |np| - nanoparticle = Nanoparticle.find_or_create_by( - :name => np["values"]["https://data.enanomapper.net/identifier/name"], - :source => np["compound"]["URI"], - ) - nanoparticle.bundles << uri - nanoparticle.bundles.uniq! - np["composition"].each do |comp| - case comp["relation"] - when "HAS_CORE" - nanoparticle.core = comp["component"]["compound"]["URI"] - when "HAS_COATING" - nanoparticle.coating << comp["component"]["compound"]["URI"] - end - end if np["composition"] - np["values"].each do |u,v| - if u.match(/property/) - name, unit, source = nil - features.each do |uri,feat| - if u.match(/#{uri}/) - name = feat["title"] - unit = feat["units"] - source = uri - end - end - feature = Feature.find_or_create_by( - :name => name, - :unit => unit, - :source => source - ) - end - v.each{|value| nanoparticle.parse_ambit_value feature, value} if v.is_a? Array - end - nanoparticle.save! - end -end diff --git a/lib/dataset.rb b/lib/dataset.rb index 86800c6..9738c1f 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -62,12 +62,12 @@ module OpenTox training_cids = training_idxs.collect{|i| substance_ids[i]} chunk = [training_cids,test_cids].collect do |cids| dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id ) - dataset.compounds.each do |compound| - compound.dataset_ids << dataset.id - compound.toxicities.each do |feature_id,data| + dataset.substances.each do |substance| + substance.dataset_ids << dataset.id + substance.toxicities.each do |feature_id,data| data[dataset.id.to_s] = data[self.id.to_s] # copy data entries end - compound.save + substance.save end dataset end diff --git a/lib/import.rb b/lib/import.rb index 11cb367..dfe5e2d 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -40,10 +40,10 @@ module OpenTox datasets[bundle_uri].substance_ids << nanoparticle.id nanoparticle["dataset_ids"] << datasets[bundle_uri].id end + bundle = datasets[np["bundles"].keys.first].id if np["bundles"].size == 1 study["effects"].each do |effect| effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature # TODO parse core/coating - # TODO parse proteomics, they come as a large textValue #$logger.debug File.join(np["compound"]["URI"],"study") effect["conditions"].delete_if { |k, v| v.nil? } # parse proteomics data @@ -53,7 +53,7 @@ module OpenTox :name => identifier, :category => "Proteomics", ) - nanoparticle.parse_ambit_value feature, value + nanoparticle.parse_ambit_value feature, value, bundle end else feature = klass.find_or_create_by( @@ -62,7 +62,7 @@ module OpenTox :category => study["protocol"]["topcategory"], :conditions => effect["conditions"] ) - nanoparticle.parse_ambit_value feature, effect["result"] + nanoparticle.parse_ambit_value feature, effect["result"], bundle end end nanoparticle.save diff --git a/lib/model.rb b/lib/model.rb index 5b094fb..070248a 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -76,6 +76,7 @@ module OpenTox prediction = {} if neighbors.collect{|n| n["_id"]}.include? compound.id + me = neighbors.select{|n| n["_id"] == compound.id}.first database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s][training_dataset_id.to_s].uniq prediction[:database_activities] = database_activities prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 9bf419d..b79981d 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -9,10 +9,14 @@ module OpenTox field :proteomics, type: Hash, default: {} def nanoparticle_neighbors params - Dataset.find(params[:training_dataset_id]).nanoparticles.collect{|np| np["tanimoto"] = 1; np} + dataset = Dataset.find(params[:training_dataset_id]) + Dataset.find(params[:training_dataset_id]).nanoparticles.collect do |np| + np["tanimoto"] = 1 + np unless np.toxicities.empty? + end.compact end - def add_feature feature, value + def add_feature feature, value, dataset_id case feature.category when "P-CHEM" physchem_descriptors[feature.id.to_s] ||= [] @@ -23,51 +27,52 @@ module OpenTox proteomics[feature.id.to_s] << value proteomics[feature.id.to_s].uniq! when "TOX" - toxicities[feature.id.to_s] ||= [] + toxicities[feature.id.to_s] ||= {} + toxicities[feature.id.to_s][dataset_id.to_s] ||= [] # TODO generic way of parsing TOX values if feature.name == "7.99 Toxicity (other) ICP-AES" and feature.unit == "mL/ug(Mg)" - toxicities[feature.id.to_s] << -Math.log10(value) + toxicities[feature.id.to_s][dataset_id.to_s] << -Math.log10(value) else - toxicities[feature.id.to_s] << value + toxicities[feature.id.to_s][dataset_id.to_s] << value end - toxicities[feature.id.to_s].uniq! + toxicities[feature.id.to_s][dataset_id.to_s].uniq! else warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted." end end - def parse_ambit_value feature, v + def parse_ambit_value feature, v, dataset_id v.delete "unit" # TODO: ppm instead of weights if v.keys == ["textValue"] - add_feature feature, v["textValue"] + add_feature feature, v["textValue"], dataset_id elsif v.keys == ["loValue"] - add_feature feature, v["loValue"] + add_feature feature, v["loValue"], dataset_id elsif v.keys.size == 2 and v["errorValue"] - add_feature feature, v["loValue"] + add_feature feature, v["loValue"], dataset_id warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." elsif v.keys.size == 2 and v["loQualifier"] == "mean" - add_feature feature, v["loValue"] + add_feature feature, v["loValue"], dataset_id warn "'#{feature.name}' is a mean value. Original data is not available." elsif v.keys.size == 2 and v["loQualifier"] #== ">=" warn "Only min value available for '#{feature.name}', entry ignored" elsif v.keys.size == 2 and v["upQualifier"] #== ">=" warn "Only max value available for '#{feature.name}', entry ignored" elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil? - add_feature feature, v["loValue"] + add_feature feature, v["loValue"], dataset_id warn "loQualifier and upQualifier are empty." elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"] == "" and v["upQualifier"] == "" - add_feature feature, v["loValue"] + add_feature feature, v["loValue"], dataset_id warn "loQualifier and upQualifier are empty." elsif v.keys.size == 4 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil? - add_feature feature, v["loValue"] + add_feature feature, v["loValue"], dataset_id warn "loQualifier and upQualifier are empty." elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] and v["loValue"] and v["upValue"] - add_feature feature, [v["loValue"],v["upValue"]].mean + add_feature feature, [v["loValue"],v["upValue"]].mean, dataset_id warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available." elsif v.size == 4 and v["loQualifier"] == "mean" and v["errorValue"] warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." - add_feature feature, v["loValue"] + add_feature feature, v["loValue"], dataset_id elsif v == {} # do nothing else warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'." diff --git a/lib/regression.rb b/lib/regression.rb index b8a7e5f..691f903 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -10,7 +10,7 @@ module OpenTox neighbors.each do |row| sim = row["tanimoto"] sim ||= 1 # TODO: sim f nanoparticles - if row["toxicities"][params[:prediction_feature_id].to_s] + if row["toxicities"][params[:prediction_feature_id].to_s] and row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s] row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| weighted_sum += sim*act sim_sum += sim diff --git a/scripts/import-enm.rb b/scripts/import-enm.rb deleted file mode 100755 index 4fb414b..0000000 --- a/scripts/import-enm.rb +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env ruby -require_relative '../lib/lazar' -include OpenTox -$mongo.database.drop -$gridfs = $mongo.database.fs # recreate GridFS indexes -Import::Enanomapper.import -`mongodump -h 127.0.0.1 -d production` diff --git a/scripts/mirror-enm2test.rb b/scripts/mirror-enm2test.rb new file mode 100755 index 0000000..f6638bc --- /dev/null +++ b/scripts/mirror-enm2test.rb @@ -0,0 +1,4 @@ +#!/usr/bin/env ruby +require_relative '../lib/lazar' +include OpenTox +Import::Enanomapper.mirror File.join(File.dirname(__FILE__),"..","test","data","enm") diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb index 6d91103..2082ec4 100644 --- a/test/nanoparticles.rb +++ b/test/nanoparticles.rb @@ -4,66 +4,12 @@ require_relative "setup.rb" class NanoparticleTest < MiniTest::Test def setup + Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm") #`mongorestore --db=development #{File.join(File.dirname(__FILE__),"..","dump","production")}` end - def test_mirror - Import::Enanomapper.mirror File.join(File.dirname(__FILE__),"..","data") - end - - def test_import - Import::Enanomapper.import File.join(File.dirname(__FILE__),"..","data") -# skip -# dataset_ids = Import::Enanomapper.import -# assert_operator Nanoparticle.count , :>, 570, "Only #{Nanoparticle.count} nanoparticles imported" -# assert_operator dataset_ids.size, :>, 8, "Only #{dataset_ids.size} bundles imported" -# assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("NanoWiki") -# assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") -# p dataset_ids.collect{|d| {d => Dataset.find(d).name}} -# dataset_ids.collect do |d| -# d = Dataset.find(d) - #p d.name - #puts d.to_csv -# end - end - - def test_summaries - skip - features = Feature.all.to_a - #p features.collect do |f| - #f if f.category == "TOX" - #end.to_a.flatten.size - toxcounts = {} - pccounts = {} - Nanoparticle.all.each do |np| - np.toxicities.each do |t,v| - toxcounts[t] ||= 0 - toxcounts[t] += 1#v.uniq.size - end - np.physchem_descriptors.each do |t,v| - pccounts[t] ||= 0 - pccounts[t] += 1#v.uniq.size - end - end - #puts counts.keys.collect{|i| Feature.find(i)}.to_yaml - #pccounts.each{|e,n| p Feature.find(e),n if n > 100} - #p toxcounts.collect{|e,n| Feature.find(e).name if n > 1}.uniq - toxcounts.each{|e,n| p Feature.find(e),n if n > 100} - end - - - def test_import_ld - skip - dataset_ids = Import::Enanomapper.import_ld - end - - def test_export - Dataset.all.each do |d| - puts d.to_csv - end - end - def test_create_model_with_feature_selection + skip training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") feature = Feature.find_or_create_by(name: "7.99 Toxicity (other) ICP-AES", category: "TOX", unit: "mL/ug(Mg)") model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors"}) @@ -80,7 +26,6 @@ class NanoparticleTest < MiniTest::Test feature = Feature.find_or_create_by(name: "7.99 Toxicity (other) ICP-AES", category: "TOX", unit: "mL/ug(Mg)") model = Model::LazarRegression.create(feature, training_dataset, {:prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors"}) nanoparticle = training_dataset.nanoparticles[-34] - #p nanoparticle.neighbors prediction = model.predict nanoparticle p prediction #p prediction @@ -97,4 +42,40 @@ class NanoparticleTest < MiniTest::Test p cv end + def test_export + skip + Dataset.all.each do |d| + puts d.to_csv + end + end + + def test_summaries + skip + features = Feature.all.to_a + #p features.collect do |f| + #f if f.category == "TOX" + #end.to_a.flatten.size + toxcounts = {} + pccounts = {} + Nanoparticle.all.each do |np| + np.toxicities.each do |t,v| + toxcounts[t] ||= 0 + toxcounts[t] += 1#v.uniq.size + end + np.physchem_descriptors.each do |t,v| + pccounts[t] ||= 0 + pccounts[t] += 1#v.uniq.size + end + end + #puts counts.keys.collect{|i| Feature.find(i)}.to_yaml + #pccounts.each{|e,n| p Feature.find(e),n if n > 100} + #p toxcounts.collect{|e,n| Feature.find(e).name if n > 1}.uniq + toxcounts.each{|e,n| p Feature.find(e),n if n > 100} + end + + + def test_import_ld + skip + dataset_ids = Import::Enanomapper.import_ld + end end -- cgit v1.2.3