From b6116bc4705066da30668ff3370f3b1c307e44e7 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 11 Nov 2016 13:07:53 +0100 Subject: enm import fixed --- lazar.gemspec | 1 - lib/import.rb | 194 ++++++++++++++------------------- lib/model.rb | 21 +--- test/descriptor.rb | 1 - test/model-nanoparticle.rb | 1 - test/nanomaterial-prediction-models.rb | 1 - test/setup.rb | 4 + test/validation-nanoparticle.rb | 43 ++++---- test/validation-regression.rb | 1 - 9 files changed, 106 insertions(+), 161 deletions(-) diff --git a/lazar.gemspec b/lazar.gemspec index a805edb..dfdaac8 100644 --- a/lazar.gemspec +++ b/lazar.gemspec @@ -24,5 +24,4 @@ Gem::Specification.new do |s| s.add_runtime_dependency 'rserve-client', '~> 0.3' s.add_runtime_dependency 'mongoid', '~> 5.0' s.add_runtime_dependency 'openbabel', '~> 2.3', '>= 2.3.2.2' - end diff --git a/lib/import.rb b/lib/import.rb index 8f640b1..aa2ee75 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -5,129 +5,95 @@ module OpenTox class Enanomapper include OpenTox - def self.mirror dir=nil - # clean download dir - dir ||= File.join(File.dirname(__FILE__),"..","data","enm") - FileUtils.rm_rf dir - FileUtils.mkdir_p dir - - #get list of bundle URIs + # time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%) + def self.import dir="." + datasets = {} bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] - File.open(File.join(dir,"bundles.json"),"w+"){|f| f.puts JSON.pretty_generate(bundles)} - # bundles - # id/summary - # id/compound - # id/substance - # id/property - bundles.each do |bundle| + datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"]) $logger.debug bundle["title"] nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"] - $logger.debug nanoparticles.size - nanoparticles.each do |nanoparticle| - uuid = nanoparticle["values"]["https://data.enanomapper.net/identifier/uuid"] - $logger.debug uuid - File.open(File.join(dir,"nanoparticle-#{uuid}.json"),"w+"){|f| f.puts JSON.pretty_generate(nanoparticle)} - studies = JSON.parse(RestClientWrapper.get(File.join(nanoparticle["compound"]["URI"],"study")))["study"] - $logger.debug uuid if studies.size < 1 - studies.each do |study| - File.open(File.join(dir,"study-#{study["uuid"]}.json"),"w+"){|f| f.puts JSON.pretty_generate(study)} - end - end - end - end - - def self.import dir="." - start_time = Time.now - t1 = 0 - t2 = 0 - datasets = {} - JSON.parse(File.read(File.join(dir,"bundles.json"))).each do |bundle| - if bundle["id"] == 3 - datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"]) - end - end - # TODO this is only for protein corona - Dir[File.join(dir,"study-F*.json")].each do |s| - t = Time.now - study = JSON.parse(File.read(s)) - np = JSON.parse(File.read(File.join(dir,"nanoparticle-#{study['owner']['substance']['uuid']}.json"))) - core_id = nil - coating_ids = [] - np["composition"].each do |c| - uri = c["component"]["compound"]["URI"] - uri = CGI.escape File.join(uri,"&media=application/json") - data = JSON.parse(RestClientWrapper.get "https://data.enanomapper.net/query/compound/url/all?media=application/json&search=#{uri}") - smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"] - names = [] - names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] - names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"] - if smiles - compound = Compound.find_or_create_by(:smiles => smiles) - compound.names = names.compact - else - compound = Compound.find_or_create_by(:names => names) - end - compound.save - if c["relation"] == "HAS_CORE" - core_id = compound.id.to_s - elsif c["relation"] == "HAS_COATING" - coating_ids << compound.id.to_s + nanoparticles.each_with_index do |np,n| + core_id = nil + coating_ids = [] + np["composition"].each do |c| + uri = c["component"]["compound"]["URI"] + uri = CGI.escape File.join(uri,"&media=application/json") + data = JSON.parse(RestClientWrapper.get "https://data.enanomapper.net/query/compound/url/all?media=application/json&search=#{uri}") + smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"] + names = [] + names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] + names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"] + if smiles + compound = Compound.find_or_create_by(:smiles => smiles) + compound.name = names.first + compound.names = names.compact + else + compound = Compound.find_or_create_by(:name => names.first,:names => names) + end + compound.save + if c["relation"] == "HAS_CORE" + core_id = compound.id.to_s + elsif c["relation"] == "HAS_COATING" + coating_ids << compound.id.to_s + end + end if np["composition"] + nanoparticle = Nanoparticle.find_or_create_by( + :name => np["values"]["https://data.enanomapper.net/identifier/name"], + :source => np["compound"]["URI"], + :core_id => core_id, + :coating_ids => coating_ids + ) + np["bundles"].keys.each do |bundle_uri| + nanoparticle.dataset_ids << datasets[bundle_uri].id end - end if np["composition"] - nanoparticle = Nanoparticle.find_or_create_by( - :name => np["values"]["https://data.enanomapper.net/identifier/name"], - :source => np["compound"]["URI"], - :core_id => core_id, - :coating_ids => coating_ids - ) - np["bundles"].keys.each do |bundle_uri| - nanoparticle.dataset_ids << datasets[bundle_uri].id - end - dataset = datasets[np["bundles"].keys.first] - proteomics_features = {} - category = study["protocol"]["topcategory"] - source = study["protocol"]["category"]["term"] - - study["effects"].each do |effect| - - effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature - effect["conditions"].delete_if { |k, v| v.nil? } - - if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data - - JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step - proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true) - nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset - end - else - name = effect["endpoint"] - unit = effect["result"]["unit"] - warnings = [] - case name - when "Log2 transformed" # use a sensible name - name = "log2(Net cell association)" - warnings = ["Original name was 'Log2 transformed'"] - unit = "log2(mL/ug(Mg))" - when "Total protein (BCA assay)" - category = "P-CHEM" - warnings = ["Category changed from TOX to P-CHEM"] + studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study")))["study"] + studies.each do |study| + dataset = datasets[np["bundles"].keys.first] + proteomics_features = {} + category = study["protocol"]["topcategory"] + source = study["protocol"]["category"]["term"] + study["effects"].each do |effect| + + effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature + effect["conditions"].delete_if { |k, v| v.nil? } + + if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data + + JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step + proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true) + nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset + end + else + name = effect["endpoint"] + unit = effect["result"]["unit"] + warnings = [] + case name + when "Log2 transformed" # use a sensible name + name = "log2(Net cell association)" + warnings = ["Original name was 'Log2 transformed'"] + unit = "log2(mL/ug(Mg))" + when "Total protein (BCA assay)" + category = "P-CHEM" + warnings = ["Category changed from TOX to P-CHEM"] + end + feature = klass.find_or_create_by( + :name => name, + :unit => unit, + :category => category, + :conditions => effect["conditions"], + :source => study["protocol"]["category"]["term"], + :measured => true, + :warnings => warnings + ) + nanoparticle.parse_ambit_value feature, effect["result"], dataset + end end - feature = klass.find_or_create_by( - :name => name, - :unit => unit, - :category => category, - :conditions => effect["conditions"], - :source => study["protocol"]["category"]["term"], - :measured => true, - :warnings => warnings - ) - nanoparticle.parse_ambit_value feature, effect["result"], dataset end + nanoparticle.save + print "#{n}, " end - p nanoparticle - nanoparticle.save end datasets.each { |u,d| d.save } end diff --git a/lib/model.rb b/lib/model.rb index 809dc48..9be0fa0 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -152,10 +152,7 @@ module OpenTox categories.each do |category| Feature.where(category:category).each{|f| feature_ids << f.id.to_s} end - #p feature_ids - #properties = Nanoparticle.all.collect { |s| p s.name; p s.id; p s.properties } properties = model.substances.collect { |s| s.properties } - #p properties property_ids = properties.collect{|p| p.keys}.flatten.uniq model.descriptor_ids = feature_ids & property_ids model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} @@ -223,10 +220,10 @@ module OpenTox prediction[:measurements] << dependent_variables[i] prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance." else - next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core if fingerprints? neighbor_descriptors = fingerprints[i] else + next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core # necessary for nanoparticle properties predictions neighbor_descriptors = scaled_variables.collect{|v| v[i]} end sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights] @@ -344,7 +341,6 @@ module OpenTox field :unit, type: String field :model_id, type: BSON::ObjectId field :repeated_crossvalidation_id, type: BSON::ObjectId - #field :leave_one_out_validation_id, type: BSON::ObjectId def predict object model.predict object @@ -370,10 +366,6 @@ module OpenTox repeated_crossvalidation.crossvalidations end - def leave_one_out_validation - Validation::LeaveOneOut.find leave_one_out_validation_id - end - def regression? model.is_a? LazarRegression end @@ -390,7 +382,6 @@ module OpenTox model = Lazar.create training_dataset: training_dataset prediction_model[:model_id] = model.id prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id - #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id prediction_model.save prediction_model end @@ -406,12 +397,7 @@ module OpenTox unless training_dataset # try to import from json dump Import::Enanomapper.import training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - unless training_dataset - Import::Enanomapper.mirror - Import::Enanomapper.import - training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset - end + bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset end prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first @@ -424,8 +410,7 @@ module OpenTox model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms) prediction_model[:model_id] = model.id repeated_cv = Validation::RepeatedCrossValidation.create model - prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id - #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id + prediction_model[:repeated_crossvalidation_id] = repeated_cv.id prediction_model.save prediction_model end diff --git a/test/descriptor.rb b/test/descriptor.rb index 6eb4316..563cdce 100644 --- a/test/descriptor.rb +++ b/test/descriptor.rb @@ -6,7 +6,6 @@ class DescriptorTest < MiniTest::Test # check available descriptors assert_equal 15,PhysChem.openbabel_descriptors.size,"incorrect number of Openbabel descriptors" assert_equal 45,PhysChem.joelib_descriptors.size,"incorrect number of Joelib descriptors" - p PhysChem.cdk_descriptors assert_equal 286,PhysChem.cdk_descriptors.size,"incorrect number of Cdk descriptors" assert_equal 346,PhysChem.descriptors.size,"incorrect number of physchem descriptors" end diff --git a/test/model-nanoparticle.rb b/test/model-nanoparticle.rb index c5f3223..7fb944e 100644 --- a/test/model-nanoparticle.rb +++ b/test/model-nanoparticle.rb @@ -108,7 +108,6 @@ class NanoparticleModelTest < MiniTest::Test }, } model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms - p model refute_empty model.dependent_variables refute_empty model.descriptor_ids refute_empty model.independent_variables diff --git a/test/nanomaterial-prediction-models.rb b/test/nanomaterial-prediction-models.rb index b0c05f3..f90a822 100644 --- a/test/nanomaterial-prediction-models.rb +++ b/test/nanomaterial-prediction-models.rb @@ -13,7 +13,6 @@ class NanomaterialPredictionModelTest < MiniTest::Test def test_default_nanomaterial_prediction_model prediction_model = Model::NanoPrediction.create - p prediction_model [:endpoint,:species,:source].each do |p| refute_empty prediction_model[p] end diff --git a/test/setup.rb b/test/setup.rb index 6c97282..63b59fb 100644 --- a/test/setup.rb +++ b/test/setup.rb @@ -5,5 +5,9 @@ require_relative '../lib/lazar.rb' include OpenTox TEST_DIR ||= File.expand_path(File.dirname(__FILE__)) DATA_DIR ||= File.join(TEST_DIR,"data") +training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first +unless training_dataset + Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm") +end #$mongo.database.drop #$gridfs = $mongo.database.fs diff --git a/test/validation-nanoparticle.rb b/test/validation-nanoparticle.rb index 5ed70f2..9351e1b 100644 --- a/test/validation-nanoparticle.rb +++ b/test/validation-nanoparticle.rb @@ -5,74 +5,72 @@ class NanoparticleValidationTest < MiniTest::Test def setup @training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - unless @training_dataset - Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm") - @training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - end @prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first end def test_validate_default_nanoparticle_model model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature cv = CrossValidation.create model - p cv - p cv.rmse - p cv.r_squared #File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot} refute_nil cv.r_squared refute_nil cv.rmse end - def test_validate_pls_nanoparticle_model + def test_validate_pls_pchem_model algorithms = { :descriptors => { :method => "properties", :categories => ["P-CHEM"] }, :prediction => {:method => 'Algorithm::Caret.pls' }, + :feature_selection => { + :method => "Algorithm::FeatureSelection.correlation_filter", + }, } model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method] cv = CrossValidation.create model - p cv.rmse - p cv.r_squared refute_nil cv.r_squared refute_nil cv.rmse end - def test_validate_proteomics_pls_nanoparticle_model +=begin + def test_validate_proteomics_pls_pchem_model algorithms = { :descriptors => { :method => "properties", :categories => ["Proteomics"] }, :prediction => {:method => 'Algorithm::Caret.pls' }, + :feature_selection => { + :method => "Algorithm::FeatureSelection.correlation_filter", + }, } model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method] cv = CrossValidation.create model - p cv.rmse - p cv.r_squared refute_nil cv.r_squared refute_nil cv.rmse end +=end - def test_validate_all_default_nanoparticle_model + def test_validate_proteomics_pchem_default_model algorithms = { :descriptors => { :method => "properties", :categories => ["Proteomics","P-CHEM"] }, + :feature_selection => { + :method => "Algorithm::FeatureSelection.correlation_filter", + }, } model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms cv = CrossValidation.create model - p cv.rmse - p cv.r_squared refute_nil cv.r_squared refute_nil cv.rmse end - def test_nanoparticle_fingerprint_model + def test_nanoparticle_fingerprint_model_without_feature_selection algorithms = { :descriptors => { :method => "fingerprint", @@ -86,13 +84,11 @@ class NanoparticleValidationTest < MiniTest::Test } model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms cv = CrossValidation.create model - p cv.rmse - p cv.r_squared refute_nil cv.r_squared refute_nil cv.rmse end - def test_nanoparticle_fingerprint_weighted_average_model + def test_nanoparticle_fingerprint_weighted_average_model_without_feature_selection algorithms = { :descriptors => { :method => "fingerprint", @@ -107,8 +103,6 @@ class NanoparticleValidationTest < MiniTest::Test } model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms cv = CrossValidation.create model - p cv.rmse - p cv.r_squared refute_nil cv.r_squared refute_nil cv.rmse end @@ -123,11 +117,12 @@ class NanoparticleValidationTest < MiniTest::Test :method => "Algorithm::Similarity.tanimoto", :min => 0.1 }, + :feature_selection => { + :method => "Algorithm::FeatureSelection.correlation_filter", + }, } model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms cv = CrossValidation.create model - p cv.rmse - p cv.r_squared refute_nil cv.r_squared refute_nil cv.rmse end diff --git a/test/validation-regression.rb b/test/validation-regression.rb index a0895f9..7630521 100644 --- a/test/validation-regression.rb +++ b/test/validation-regression.rb @@ -86,7 +86,6 @@ class ValidationRegressionTest < MiniTest::Test #assert cv.r_squared > 0.34, "R^2 (#{cv.r_squared}) should be larger than 0.034" #assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split" end - p repeated_cv File.open("tmp.png","w+"){|f| f.puts repeated_cv.correlation_plot} end -- cgit v1.2.3