diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/caret.rb | 2 | ||||
-rw-r--r-- | lib/compound.rb | 3 | ||||
-rw-r--r-- | lib/crossvalidation.rb | 1 | ||||
-rw-r--r-- | lib/import.rb | 165 | ||||
-rw-r--r-- | lib/model.rb | 94 | ||||
-rw-r--r-- | lib/nanoparticle.rb | 46 | ||||
-rw-r--r-- | lib/similarity.rb | 4 |
7 files changed, 152 insertions, 163 deletions
diff --git a/lib/caret.rb b/lib/caret.rb index 18bfc41..7e4f771 100644 --- a/lib/caret.rb +++ b/lib/caret.rb @@ -12,7 +12,7 @@ module OpenTox independent_variables.delete_at i query_variables.delete_at i end - if independent_variables.flatten.uniq == ["NA"] + if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == [] prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." elsif diff --git a/lib/compound.rb b/lib/compound.rb index a399169..8a1143b 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -136,9 +136,6 @@ module OpenTox # @param inchi [String] smiles InChI string # @return [OpenTox::Compound] Compound def self.from_inchi inchi - # Temporary workaround for OpenBabels Inchi bug - # http://sourceforge.net/p/openbabel/bugs/957/ - # bug has not been fixed in latest git/development version #smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip smiles = obconversion(inchi,"inchi","can") if smiles.empty? diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index be680ae..5a05955 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -6,6 +6,7 @@ module OpenTox field :folds, type: Integer, default: 10 def self.create model, n=10 + $logger.debug model.algorithms klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification klass = RegressionCrossValidation if model.is_a? Model::LazarRegression bad_request_error "Unknown model class #{model.class}." unless klass diff --git a/lib/import.rb b/lib/import.rb index 8e57401..aa2ee75 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -5,106 +5,95 @@ module OpenTox class Enanomapper include OpenTox - def self.mirror dir="." - #get list of bundle URIs + # time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%) + def self.import dir="." + datasets = {} bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] - File.open(File.join(dir,"bundles.json"),"w+"){|f| f.puts JSON.pretty_generate(bundles)} bundles.each do |bundle| + datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"]) $logger.debug bundle["title"] nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"] - $logger.debug nanoparticles.size - nanoparticles.each do |nanoparticle| - uuid = nanoparticle["values"]["https://data.enanomapper.net/identifier/uuid"] - $logger.debug uuid - File.open(File.join(dir,"nanoparticle-#{uuid}.json"),"w+"){|f| f.puts JSON.pretty_generate(nanoparticle)} - studies = JSON.parse(RestClientWrapper.get(File.join(nanoparticle["compound"]["URI"],"study")))["study"] - $logger.debug uuid if studies.size < 1 - studies.each do |study| - File.open(File.join(dir,"study-#{study["uuid"]}.json"),"w+"){|f| f.puts JSON.pretty_generate(study)} - end - end - end - end - - def self.import dir="." - start_time = Time.now - t1 = 0 - t2 = 0 - datasets = {} - JSON.parse(File.read(File.join(dir,"bundles.json"))).each do |bundle| - datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"]) - end - Dir[File.join(dir,"study*.json")].each do |s| - t = Time.now - study = JSON.parse(File.read(s)) - np = JSON.parse(File.read(File.join(dir,"nanoparticle-#{study['owner']['substance']['uuid']}.json"))) - core = {} - coating = [] - np["composition"].each do |c| - if c["relation"] == "HAS_CORE" - core = { - :uri => c["component"]["compound"]["URI"], - :name => c["component"]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] - } - elsif c["relation"] == "HAS_COATING" - coating << { - :uri => c["component"]["compound"]["URI"], - :name => c["component"]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] - } + nanoparticles.each_with_index do |np,n| + core_id = nil + coating_ids = [] + np["composition"].each do |c| + uri = c["component"]["compound"]["URI"] + uri = CGI.escape File.join(uri,"&media=application/json") + data = JSON.parse(RestClientWrapper.get "https://data.enanomapper.net/query/compound/url/all?media=application/json&search=#{uri}") + smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"] + names = [] + names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] + names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"] + if smiles + compound = Compound.find_or_create_by(:smiles => smiles) + compound.name = names.first + compound.names = names.compact + else + compound = Compound.find_or_create_by(:name => names.first,:names => names) + end + compound.save + if c["relation"] == "HAS_CORE" + core_id = compound.id.to_s + elsif c["relation"] == "HAS_COATING" + coating_ids << compound.id.to_s + end + end if np["composition"] + nanoparticle = Nanoparticle.find_or_create_by( + :name => np["values"]["https://data.enanomapper.net/identifier/name"], + :source => np["compound"]["URI"], + :core_id => core_id, + :coating_ids => coating_ids + ) + np["bundles"].keys.each do |bundle_uri| + nanoparticle.dataset_ids << datasets[bundle_uri].id end - end if np["composition"] - nanoparticle = Nanoparticle.find_or_create_by( - :name => np["values"]["https://data.enanomapper.net/identifier/name"], - :source => np["compound"]["URI"], - :core => core, - :coating => coating - ) - np["bundles"].keys.each do |bundle_uri| - nanoparticle.dataset_ids << datasets[bundle_uri].id - end - - dataset = datasets[np["bundles"].keys.first] - proteomics_features = {} - category = study["protocol"]["topcategory"] - source = study["protocol"]["category"]["term"] - study["effects"].each do |effect| + studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study")))["study"] + studies.each do |study| + dataset = datasets[np["bundles"].keys.first] + proteomics_features = {} + category = study["protocol"]["topcategory"] + source = study["protocol"]["category"]["term"] + study["effects"].each do |effect| - effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature - effect["conditions"].delete_if { |k, v| v.nil? } + effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature + effect["conditions"].delete_if { |k, v| v.nil? } - if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data + if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data - JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step - proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true) - nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset - end - else - name = effect["endpoint"] - unit = effect["result"]["unit"] - warnings = [] - case name - when "Log2 transformed" # use a sensible name - name = "log2(Net cell association)" - warnings = ["Original name was 'Log2 transformed'"] - unit = "log2(mL/ug(Mg))" - when "Total protein (BCA assay)" - category = "P-CHEM" - warnings = ["Category changed from TOX to P-CHEM"] + JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step + proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true) + nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset + end + else + name = effect["endpoint"] + unit = effect["result"]["unit"] + warnings = [] + case name + when "Log2 transformed" # use a sensible name + name = "log2(Net cell association)" + warnings = ["Original name was 'Log2 transformed'"] + unit = "log2(mL/ug(Mg))" + when "Total protein (BCA assay)" + category = "P-CHEM" + warnings = ["Category changed from TOX to P-CHEM"] + end + feature = klass.find_or_create_by( + :name => name, + :unit => unit, + :category => category, + :conditions => effect["conditions"], + :source => study["protocol"]["category"]["term"], + :measured => true, + :warnings => warnings + ) + nanoparticle.parse_ambit_value feature, effect["result"], dataset + end end - feature = klass.find_or_create_by( - :name => name, - :unit => unit, - :category => category, - :conditions => effect["conditions"], - :source => study["protocol"]["category"]["term"], - :measured => true, - :warnings => warnings - ) - nanoparticle.parse_ambit_value feature, effect["result"], dataset end + nanoparticle.save + print "#{n}, " end - nanoparticle.save end datasets.each { |u,d| d.save } end diff --git a/lib/model.rb b/lib/model.rb index adcbcc6..e8b30ca 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -81,7 +81,6 @@ module OpenTox :method => "properties", :categories => ["P-CHEM"], }, - #:descriptors => ["P-CHEM","Proteomics"], :similarity => { :method => "Algorithm::Similarity.weighted_cosine", :min => 0.5 @@ -103,11 +102,12 @@ module OpenTox parameters.each do |p,v| model.algorithms[type] ||= {} model.algorithms[type][p] = v + model.algorithms[:descriptors].delete :categories if type == :descriptors and p == :type end else model.algorithms[type] = parameters end - end + end if algorithms # parse dependent_variables from training dataset training_dataset.substances.each do |substance| @@ -140,10 +140,11 @@ module OpenTox model.algorithms[:descriptors].delete(:features) model.algorithms[:descriptors].delete(:type) model.substances.each_with_index do |s,i| - s.calculate_properties(features).each_with_index do |v,j| + props = s.calculate_properties(features) + props.each_with_index do |v,j| model.independent_variables[j] ||= [] model.independent_variables[j][i] = v - end + end if props and !props.empty? end # parse independent_variables when "properties" @@ -152,7 +153,7 @@ module OpenTox categories.each do |category| Feature.where(category:category).each{|f| feature_ids << f.id.to_s} end - properties = model.substances.collect { |s| s.properties } + properties = model.substances.collect { |s| s.properties } property_ids = properties.collect{|p| p.keys}.flatten.uniq model.descriptor_ids = feature_ids & property_ids model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} @@ -220,10 +221,10 @@ module OpenTox prediction[:measurements] << dependent_variables[i] prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance." else - next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core if fingerprints? neighbor_descriptors = fingerprints[i] else + next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core # necessary for nanoparticle properties predictions neighbor_descriptors = scaled_variables.collect{|v| v[i]} end sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights] @@ -246,6 +247,7 @@ module OpenTox elsif neighbor_similarities.size == 1 prediction.merge!({:value => dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]}) else + query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint" # call prediction algorithm result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors prediction.merge! result @@ -328,7 +330,7 @@ module OpenTox class LazarRegression < Lazar end - class Prediction + class Validation include OpenTox include Mongoid::Document @@ -340,7 +342,6 @@ module OpenTox field :unit, type: String field :model_id, type: BSON::ObjectId field :repeated_crossvalidation_id, type: BSON::ObjectId - field :leave_one_out_validation_id, type: BSON::ObjectId def predict object model.predict object @@ -354,6 +355,10 @@ module OpenTox Lazar.find model_id end + def algorithms + model.algorithms + end + def prediction_feature model.prediction_feature end @@ -366,10 +371,6 @@ module OpenTox repeated_crossvalidation.crossvalidations end - def leave_one_out_validation - Validation::LeaveOneOut.find leave_one_out_validation_id - end - def regression? model.is_a? LazarRegression end @@ -381,63 +382,38 @@ module OpenTox def self.from_csv_file file metadata_file = file.sub(/csv$/,"json") bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file - prediction_model = self.new JSON.parse(File.read(metadata_file)) + model_validation = self.new JSON.parse(File.read(metadata_file)) training_dataset = Dataset.from_csv_file file model = Lazar.create training_dataset: training_dataset - prediction_model[:model_id] = model.id - prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id - #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id - prediction_model.save - prediction_model + model_validation[:model_id] = model.id + model_validation[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id + model_validation.save + model_validation end - end - - class NanoPrediction < Prediction - - def self.from_json_dump dir, category - Import::Enanomapper.import dir - training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - unless training_dataset - Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm") + def self.from_enanomapper training_dataset: nil, prediction_feature:nil, algorithms: nil + + # find/import training_dataset + training_dataset ||= Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + unless training_dataset # try to import from json dump + Import::Enanomapper.import training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset end - prediction_model = self.new( - :endpoint => "log2(Net cell association)", - :source => "https://data.enanomapper.net/", - :species => "A549 human lung epithelial carcinoma cells", - :unit => "log2(ug/Mg)" - ) - prediction_feature = Feature.where(name: "log2(Net cell association)", category: "TOX").first - model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset) - prediction_model[:model_id] = model.id - repeated_cv = Validation::RepeatedCrossValidation.create model - prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id - #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id - prediction_model.save - prediction_model - end + prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first - def self.create dir: dir, algorithms: algorithms - training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - unless training_dataset - Import::Enanomapper.import dir - training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - end - prediction_model = self.new( - :endpoint => "log2(Net cell association)", - :source => "https://data.enanomapper.net/", + model_validation = self.new( + :endpoint => prediction_feature.name, + :source => prediction_feature.source, :species => "A549 human lung epithelial carcinoma cells", - :unit => "log2(ug/Mg)" + :unit => prediction_feature.unit ) - prediction_feature = Feature.where(name: "log2(Net cell association)", category: "TOX").first - model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms) - prediction_model[:model_id] = model.id + model = LazarRegression.create prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms + model_validation[:model_id] = model.id repeated_cv = Validation::RepeatedCrossValidation.create model - prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id - #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id - prediction_model.save - prediction_model + model_validation[:repeated_crossvalidation_id] = repeated_cv.id + model_validation.save + model_validation end end diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 23e155c..02d9a89 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -3,8 +3,30 @@ module OpenTox class Nanoparticle < Substance include OpenTox - field :core, type: Hash, default: {} - field :coating, type: Array, default: [] + field :core_id, type: String, default: nil + field :coating_ids, type: Array, default: [] + + def core + Compound.find core_id + end + + def coating + coating_ids.collect{|i| Compound.find i } + end + + def fingerprint type=DEFAULT_FINGERPRINT + core_fp = core.fingerprint type + coating_fp = coating.collect{|c| c.fingerprint type}.flatten.uniq.compact + (core_fp.empty? or coating_fp.empty?) ? [] : (core_fp+coating_fp).uniq.compact + end + + def calculate_properties descriptors=PhysChem::OPENBABEL + if core.smiles and !coating.collect{|c| c.smiles}.compact.empty? + core_prop = core.calculate_properties descriptors + coating_prop = coating.collect{|c| c.calculate_properties descriptors if c.smiles} + descriptors.collect_with_index{|d,i| [core_prop[i],coating_prop.collect{|c| c[i] if c}]} + end + end def add_feature feature, value, dataset unless feature.name == "ATOMIC COMPOSITION" or feature.name == "FUNCTIONAL GROUP" # redundand @@ -37,28 +59,28 @@ module OpenTox add_feature feature, v["loValue"], dataset elsif v.keys.size == 2 and v["errorValue"] add_feature feature, v["loValue"], dataset - warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." + #warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." elsif v.keys.size == 2 and v["loQualifier"] == "mean" add_feature feature, v["loValue"], dataset - warn "'#{feature.name}' is a mean value. Original data is not available." + #warn "'#{feature.name}' is a mean value. Original data is not available." elsif v.keys.size == 2 and v["loQualifier"] #== ">=" - warn "Only min value available for '#{feature.name}', entry ignored" + #warn "Only min value available for '#{feature.name}', entry ignored" elsif v.keys.size == 2 and v["upQualifier"] #== ">=" - warn "Only max value available for '#{feature.name}', entry ignored" + #warn "Only max value available for '#{feature.name}', entry ignored" elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil? add_feature feature, v["loValue"], dataset - warn "loQualifier and upQualifier are empty." + #warn "loQualifier and upQualifier are empty." elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"] == "" and v["upQualifier"] == "" add_feature feature, v["loValue"], dataset - warn "loQualifier and upQualifier are empty." + #warn "loQualifier and upQualifier are empty." elsif v.keys.size == 4 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil? add_feature feature, v["loValue"], dataset - warn "loQualifier and upQualifier are empty." + #warn "loQualifier and upQualifier are empty." elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] and v["loValue"] and v["upValue"] - add_feature feature, [v["loValue"],v["upValue"]].mean, dataset - warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available." + #add_feature feature, [v["loValue"],v["upValue"]].mean, dataset + #warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available." elsif v.size == 4 and v["loQualifier"] == "mean" and v["errorValue"] - warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." + #warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." add_feature feature, v["loValue"], dataset elsif v == {} # do nothing else diff --git a/lib/similarity.rb b/lib/similarity.rb index 772e812..0901936 100644 --- a/lib/similarity.rb +++ b/lib/similarity.rb @@ -19,6 +19,10 @@ module OpenTox ( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f end + #def self.weighted_tanimoto fingerprints + #( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f + #end + def self.euclid scaled_properties sq = scaled_properties[0].zip(scaled_properties[1]).map{|a,b| (a - b) ** 2} Math.sqrt(sq.inject(0) {|s,c| s + c}) |