summary refs log tree commit diff
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2016-11-11 13:07:53 +0100
committer Christoph Helma <helma@in-silico.ch> 2016-11-11 13:07:53 +0100
commit b6116bc4705066da30668ff3370f3b1c307e44e7 (patch)
tree 387c3f43cd9cc4d8fecbaeaf80773a83e52c93ff
parent 9a06f2ff5ae6bdbe7dc90555599e186f1585e0d2 (diff)
enm import fixed
-rw-r--r-- lazar.gemspec 1
-rw-r--r-- lib/import.rb 194
-rw-r--r-- lib/model.rb 21
-rw-r--r-- test/descriptor.rb 1
-rw-r--r-- test/model-nanoparticle.rb 1
-rw-r--r-- test/nanomaterial-prediction-models.rb 1
-rw-r--r-- test/setup.rb 4
-rw-r--r-- test/validation-nanoparticle.rb 43
-rw-r--r-- test/validation-regression.rb 1
9 files changed, 106 insertions, 161 deletions
diff --git a/lazar.gemspec b/lazar.gemspec
index a805edb..dfdaac8 100644
--- a/lazar.gemspec
+++ b/lazar.gemspec
@@ -24,5 +24,4 @@ Gem::Specification.new do |s|
s.add_runtime_dependency 'rserve-client', '~> 0.3'
s.add_runtime_dependency 'mongoid', '~> 5.0'
s.add_runtime_dependency 'openbabel', '~> 2.3', '>= 2.3.2.2'
-
end
diff --git a/lib/import.rb b/lib/import.rb
index 8f640b1..aa2ee75 100644
--- a/lib/import.rb
+++ b/lib/import.rb
@@ -5,129 +5,95 @@ module OpenTox
class Enanomapper
include OpenTox
- def self.mirror dir=nil
- # clean download dir
- dir ||= File.join(File.dirname(__FILE__),"..","data","enm")
- FileUtils.rm_rf dir
- FileUtils.mkdir_p dir
-
- #get list of bundle URIs
+ # time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%)
+ def self.import dir="."
+ datasets = {}
bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
- File.open(File.join(dir,"bundles.json"),"w+"){|f| f.puts JSON.pretty_generate(bundles)}
- # bundles
- # id/summary
- # id/compound
- # id/substance
- # id/property
-
bundles.each do |bundle|
+ datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"])
$logger.debug bundle["title"]
nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"]
- $logger.debug nanoparticles.size
- nanoparticles.each do |nanoparticle|
- uuid = nanoparticle["values"]["https://data.enanomapper.net/identifier/uuid"]
- $logger.debug uuid
- File.open(File.join(dir,"nanoparticle-#{uuid}.json"),"w+"){|f| f.puts JSON.pretty_generate(nanoparticle)}
- studies = JSON.parse(RestClientWrapper.get(File.join(nanoparticle["compound"]["URI"],"study")))["study"]
- $logger.debug uuid if studies.size < 1
- studies.each do |study|
- File.open(File.join(dir,"study-#{study["uuid"]}.json"),"w+"){|f| f.puts JSON.pretty_generate(study)}
- end
- end
- end
- end
-
- def self.import dir="."
- start_time = Time.now
- t1 = 0
- t2 = 0
- datasets = {}
- JSON.parse(File.read(File.join(dir,"bundles.json"))).each do |bundle|
- if bundle["id"] == 3
- datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"])
- end
- end
- # TODO this is only for protein corona
- Dir[File.join(dir,"study-F*.json")].each do |s|
- t = Time.now
- study = JSON.parse(File.read(s))
- np = JSON.parse(File.read(File.join(dir,"nanoparticle-#{study['owner']['substance']['uuid']}.json")))
- core_id = nil
- coating_ids = []
- np["composition"].each do |c|
- uri = c["component"]["compound"]["URI"]
- uri = CGI.escape File.join(uri,"&media=application/json")
- data = JSON.parse(RestClientWrapper.get "https://data.enanomapper.net/query/compound/url/all?media=application/json&search=#{uri}")
- smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"]
- names = []
- names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"]
- names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"]
- if smiles
- compound = Compound.find_or_create_by(:smiles => smiles)
- compound.names = names.compact
- else
- compound = Compound.find_or_create_by(:names => names)
- end
- compound.save
- if c["relation"] == "HAS_CORE"
- core_id = compound.id.to_s
- elsif c["relation"] == "HAS_COATING"
- coating_ids << compound.id.to_s
+ nanoparticles.each_with_index do |np,n|
+ core_id = nil
+ coating_ids = []
+ np["composition"].each do |c|
+ uri = c["component"]["compound"]["URI"]
+ uri = CGI.escape File.join(uri,"&media=application/json")
+ data = JSON.parse(RestClientWrapper.get "https://data.enanomapper.net/query/compound/url/all?media=application/json&search=#{uri}")
+ smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"]
+ names = []
+ names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"]
+ names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"]
+ if smiles
+ compound = Compound.find_or_create_by(:smiles => smiles)
+ compound.name = names.first
+ compound.names = names.compact
+ else
+ compound = Compound.find_or_create_by(:name => names.first,:names => names)
+ end
+ compound.save
+ if c["relation"] == "HAS_CORE"
+ core_id = compound.id.to_s
+ elsif c["relation"] == "HAS_COATING"
+ coating_ids << compound.id.to_s
+ end
+ end if np["composition"]
+ nanoparticle = Nanoparticle.find_or_create_by(
+ :name => np["values"]["https://data.enanomapper.net/identifier/name"],
+ :source => np["compound"]["URI"],
+ :core_id => core_id,
+ :coating_ids => coating_ids
+ )
+ np["bundles"].keys.each do |bundle_uri|
+ nanoparticle.dataset_ids << datasets[bundle_uri].id
end
- end if np["composition"]
- nanoparticle = Nanoparticle.find_or_create_by(
- :name => np["values"]["https://data.enanomapper.net/identifier/name"],
- :source => np["compound"]["URI"],
- :core_id => core_id,
- :coating_ids => coating_ids
- )
- np["bundles"].keys.each do |bundle_uri|
- nanoparticle.dataset_ids << datasets[bundle_uri].id
- end
- dataset = datasets[np["bundles"].keys.first]
- proteomics_features = {}
- category = study["protocol"]["topcategory"]
- source = study["protocol"]["category"]["term"]
-
- study["effects"].each do |effect|
-
- effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature
- effect["conditions"].delete_if { |k, v| v.nil? }
-
- if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data
-
- JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step
- proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true)
- nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset
- end
- else
- name = effect["endpoint"]
- unit = effect["result"]["unit"]
- warnings = []
- case name
- when "Log2 transformed" # use a sensible name
- name = "log2(Net cell association)"
- warnings = ["Original name was 'Log2 transformed'"]
- unit = "log2(mL/ug(Mg))"
- when "Total protein (BCA assay)"
- category = "P-CHEM"
- warnings = ["Category changed from TOX to P-CHEM"]
+ studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study")))["study"]
+ studies.each do |study|
+ dataset = datasets[np["bundles"].keys.first]
+ proteomics_features = {}
+ category = study["protocol"]["topcategory"]
+ source = study["protocol"]["category"]["term"]
+ study["effects"].each do |effect|
+
+ effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature
+ effect["conditions"].delete_if { |k, v| v.nil? }
+
+ if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data
+
+ JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step
+ proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true)
+ nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset
+ end
+ else
+ name = effect["endpoint"]
+ unit = effect["result"]["unit"]
+ warnings = []
+ case name
+ when "Log2 transformed" # use a sensible name
+ name = "log2(Net cell association)"
+ warnings = ["Original name was 'Log2 transformed'"]
+ unit = "log2(mL/ug(Mg))"
+ when "Total protein (BCA assay)"
+ category = "P-CHEM"
+ warnings = ["Category changed from TOX to P-CHEM"]
+ end
+ feature = klass.find_or_create_by(
+ :name => name,
+ :unit => unit,
+ :category => category,
+ :conditions => effect["conditions"],
+ :source => study["protocol"]["category"]["term"],
+ :measured => true,
+ :warnings => warnings
+ )
+ nanoparticle.parse_ambit_value feature, effect["result"], dataset
+ end
end
- feature = klass.find_or_create_by(
- :name => name,
- :unit => unit,
- :category => category,
- :conditions => effect["conditions"],
- :source => study["protocol"]["category"]["term"],
- :measured => true,
- :warnings => warnings
- )
- nanoparticle.parse_ambit_value feature, effect["result"], dataset
end
+ nanoparticle.save
+ print "#{n}, "
end
- p nanoparticle
- nanoparticle.save
end
datasets.each { |u,d| d.save }
end
diff --git a/lib/model.rb b/lib/model.rb
index 809dc48..9be0fa0 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -152,10 +152,7 @@ module OpenTox
categories.each do |category|
Feature.where(category:category).each{|f| feature_ids << f.id.to_s}
end
- #p feature_ids
- #properties = Nanoparticle.all.collect { |s| p s.name; p s.id; p s.properties }
properties = model.substances.collect { |s| s.properties }
- #p properties
property_ids = properties.collect{|p| p.keys}.flatten.uniq
model.descriptor_ids = feature_ids & property_ids
model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
@@ -223,10 +220,10 @@ module OpenTox
prediction[:measurements] << dependent_variables[i]
prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
else
- next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core
if fingerprints?
neighbor_descriptors = fingerprints[i]
else
+ next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core # necessary for nanoparticle properties predictions
neighbor_descriptors = scaled_variables.collect{|v| v[i]}
end
sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights]
@@ -344,7 +341,6 @@ module OpenTox
field :unit, type: String
field :model_id, type: BSON::ObjectId
field :repeated_crossvalidation_id, type: BSON::ObjectId
- #field :leave_one_out_validation_id, type: BSON::ObjectId
def predict object
model.predict object
@@ -370,10 +366,6 @@ module OpenTox
repeated_crossvalidation.crossvalidations
end
- def leave_one_out_validation
- Validation::LeaveOneOut.find leave_one_out_validation_id
- end
-
def regression?
model.is_a? LazarRegression
end
@@ -390,7 +382,6 @@ module OpenTox
model = Lazar.create training_dataset: training_dataset
prediction_model[:model_id] = model.id
prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id
- #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id
prediction_model.save
prediction_model
end
@@ -406,12 +397,7 @@ module OpenTox
unless training_dataset # try to import from json dump
Import::Enanomapper.import
training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
- unless training_dataset
- Import::Enanomapper.mirror
- Import::Enanomapper.import
- training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
- bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset
- end
+ bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset
end
prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first
@@ -424,8 +410,7 @@ module OpenTox
model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms)
prediction_model[:model_id] = model.id
repeated_cv = Validation::RepeatedCrossValidation.create model
- prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id
- #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id
+ prediction_model[:repeated_crossvalidation_id] = repeated_cv.id
prediction_model.save
prediction_model
end
diff --git a/test/descriptor.rb b/test/descriptor.rb
index 6eb4316..563cdce 100644
--- a/test/descriptor.rb
+++ b/test/descriptor.rb
@@ -6,7 +6,6 @@ class DescriptorTest < MiniTest::Test
# check available descriptors
assert_equal 15,PhysChem.openbabel_descriptors.size,"incorrect number of Openbabel descriptors"
assert_equal 45,PhysChem.joelib_descriptors.size,"incorrect number of Joelib descriptors"
- p PhysChem.cdk_descriptors
assert_equal 286,PhysChem.cdk_descriptors.size,"incorrect number of Cdk descriptors"
assert_equal 346,PhysChem.descriptors.size,"incorrect number of physchem descriptors"
end
diff --git a/test/model-nanoparticle.rb b/test/model-nanoparticle.rb
index c5f3223..7fb944e 100644
--- a/test/model-nanoparticle.rb
+++ b/test/model-nanoparticle.rb
@@ -108,7 +108,6 @@ class NanoparticleModelTest < MiniTest::Test
},
}
model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature, algorithms: algorithms
- p model
refute_empty model.dependent_variables
refute_empty model.descriptor_ids
refute_empty model.independent_variables
diff --git a/test/nanomaterial-prediction-models.rb b/test/nanomaterial-prediction-models.rb
index b0c05f3..f90a822 100644
--- a/test/nanomaterial-prediction-models.rb
+++ b/test/nanomaterial-prediction-models.rb
@@ -13,7 +13,6 @@ class NanomaterialPredictionModelTest < MiniTest::Test
def test_default_nanomaterial_prediction_model
prediction_model = Model::NanoPrediction.create
- p prediction_model
[:endpoint,:species,:source].each do |p|
refute_empty prediction_model[p]
end
diff --git a/test/setup.rb b/test/setup.rb
index 6c97282..63b59fb 100644
--- a/test/setup.rb
+++ b/test/setup.rb
@@ -5,5 +5,9 @@ require_relative '../lib/lazar.rb'
include OpenTox
TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
DATA_DIR ||= File.join(TEST_DIR,"data")
+training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
+unless training_dataset
+ Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm")
+end
#$mongo.database.drop
#$gridfs = $mongo.database.fs
diff --git a/test/validation-nanoparticle.rb b/test/validation-nanoparticle.rb
index 5ed70f2..9351e1b 100644
--- a/test/validation-nanoparticle.rb
+++ b/test/validation-nanoparticle.rb
@@ -5,74 +5,72 @@ class NanoparticleValidationTest < MiniTest::Test
def setup
@training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
- unless @training_dataset
- Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm")
- @training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
- end
@prediction_feature = @training_dataset.features.select{|f| f["name"] == 'log2(Net cell association)'}.first
end
def test_validate_default_nanoparticle_model
model = Model::Lazar.create training_dataset: @training_dataset, prediction_feature: @prediction_feature
cv = CrossValidation.create model
- p cv
- p cv.rmse
- p cv.r_squared
#File.open("tmp.pdf","w+"){|f| f.puts cv.correlation_plot}
refute_nil cv.r_squared
refute_nil cv.rmse
end
- def test_validate_pls_nanoparticle_model
+ def test_validate_pls_pchem_model
algorithms = {
:descriptors => {
:method => "properties",
:categories => ["P-CHEM"]
},
:prediction => {:method => 'Algorithm::Caret.pls' },
+ :feature_selection => {
+ :method => "Algorithm::FeatureSelection.correlation_filter",
+ },
}
model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms
assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method]
cv = CrossValidation.create model
- p cv.rmse
- p cv.r_squared
refute_nil cv.r_squared
refute_nil cv.rmse
end
- def test_validate_proteomics_pls_nanoparticle_model
+=begin
+ def test_validate_proteomics_pls_pchem_model
algorithms = {
:descriptors => {
:method => "properties",
:categories => ["Proteomics"]
},
:prediction => {:method => 'Algorithm::Caret.pls' },
+ :feature_selection => {
+ :method => "Algorithm::FeatureSelection.correlation_filter",
+ },
}
model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms
assert_equal "Algorithm::Caret.pls", model.algorithms[:prediction][:method]
cv = CrossValidation.create model
- p cv.rmse
- p cv.r_squared
refute_nil cv.r_squared
refute_nil cv.rmse
end
+=end
- def test_validate_all_default_nanoparticle_model
+ def test_validate_proteomics_pchem_default_model
algorithms = {
:descriptors => {
:method => "properties",
:categories => ["Proteomics","P-CHEM"]
},
+ :feature_selection => {
+ :method => "Algorithm::FeatureSelection.correlation_filter",
+ },
}
model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms
cv = CrossValidation.create model
- p cv.rmse
- p cv.r_squared
refute_nil cv.r_squared
refute_nil cv.rmse
end
- def test_nanoparticle_fingerprint_model
+ def test_nanoparticle_fingerprint_model_without_feature_selection
algorithms = {
:descriptors => {
:method => "fingerprint",
@@ -86,13 +84,11 @@ class NanoparticleValidationTest < MiniTest::Test
}
model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms
cv = CrossValidation.create model
- p cv.rmse
- p cv.r_squared
refute_nil cv.r_squared
refute_nil cv.rmse
end
- def test_nanoparticle_fingerprint_weighted_average_model
+ def test_nanoparticle_fingerprint_weighted_average_model_without_feature_selection
algorithms = {
:descriptors => {
:method => "fingerprint",
@@ -107,8 +103,6 @@ class NanoparticleValidationTest < MiniTest::Test
}
model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms
cv = CrossValidation.create model
- p cv.rmse
- p cv.r_squared
refute_nil cv.r_squared
refute_nil cv.rmse
end
@@ -123,11 +117,12 @@ class NanoparticleValidationTest < MiniTest::Test
:method => "Algorithm::Similarity.tanimoto",
:min => 0.1
},
+ :feature_selection => {
+ :method => "Algorithm::FeatureSelection.correlation_filter",
+ },
}
model = Model::Lazar.create prediction_feature: @prediction_feature, training_dataset: @training_dataset, algorithms: algorithms
cv = CrossValidation.create model
- p cv.rmse
- p cv.r_squared
refute_nil cv.r_squared
refute_nil cv.rmse
end
diff --git a/test/validation-regression.rb b/test/validation-regression.rb
index a0895f9..7630521 100644
--- a/test/validation-regression.rb
+++ b/test/validation-regression.rb
@@ -86,7 +86,6 @@ class ValidationRegressionTest < MiniTest::Test
#assert cv.r_squared > 0.34, "R^2 (#{cv.r_squared}) should be larger than 0.034"
#assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
end
- p repeated_cv
File.open("tmp.png","w+"){|f| f.puts repeated_cv.correlation_plot}
end