From 063acd4dc63e9287287cc1ff78fff2064ff74e4f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 7 Apr 2016 17:39:14 +0200 Subject: initial ambit import --- lib/dataset.rb | 1 - lib/feature.rb | 1 + lib/lazar.rb | 3 ++- lib/nanoparticle.rb | 17 +++++++++++++++++ lib/opentox.rb | 1 + 5 files changed, 21 insertions(+), 2 deletions(-) create mode 100644 lib/nanoparticle.rb (limited to 'lib') diff --git a/lib/dataset.rb b/lib/dataset.rb index 5d8aeaf..2e48626 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -9,7 +9,6 @@ module OpenTox field :feature_ids, type: Array, default: [] field :compound_ids, type: Array, default: [] field :data_entries, type: Array, default: [] - field :source, type: String # Readers diff --git a/lib/feature.rb b/lib/feature.rb index b58946b..f13a3fb 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -6,6 +6,7 @@ module OpenTox field :numeric, type: Boolean field :measured, type: Boolean field :calculated, type: Boolean + field :unit, type: String end # Feature for categorical variables diff --git a/lib/lazar.rb b/lib/lazar.rb index a28ba3a..39dd8fa 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -61,7 +61,7 @@ suppressPackageStartupMessages({ " # OpenTox classes and includes -CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules +CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment","Nanoparticle"]# Algorithm and Models are modules [ # be aware of the require sequence as it affects class/method overwrites "overwrite.rb", @@ -71,6 +71,7 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveO "feature.rb", "physchem.rb", "compound.rb", + "nanoparticle.rb", "dataset.rb", "algorithm.rb", "model.rb", diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb new file mode 100644 index 0000000..3783ece --- /dev/null +++ b/lib/nanoparticle.rb @@ -0,0 +1,17 @@ +module OpenTox + + class Nanoparticle + include OpenTox + + field :particle_id, type: String + field :core, type: String + field :coatings, type: Array + + #field :physchem_descriptors, type: Hash, default: {} + #field :toxicities, type: Hash, default: {} + field :features, type: Hash, default: {} + + end +end + + diff --git a/lib/opentox.rb b/lib/opentox.rb index 186c87a..cc18cc6 100644 --- a/lib/opentox.rb +++ b/lib/opentox.rb @@ -13,6 +13,7 @@ module OpenTox include Mongoid::Timestamps store_in collection: klass.downcase.pluralize field :name, type: String + field :source, type: String field :warnings, type: Array, default: [] end OpenTox.const_set klass,c -- cgit v1.2.3 From f3780d7507092b643216054fa3ca1e6146281e43 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 8 Apr 2016 13:04:56 +0200 Subject: enm import test --- lib/compound.rb | 1 + lib/lazar.rb | 1 + lib/nanoparticle.rb | 45 ++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 42 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index 2a79fd6..a7518ed 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -21,6 +21,7 @@ module OpenTox field :default_fingerprint_size, type: Integer field :physchem_descriptors, type: Hash, default: {} field :dataset_ids, type: Array, default: [] + # TODO separate between physchem, bio and tox field :features, type: Hash, default: {} index({smiles: 1}, {unique: true}) diff --git a/lib/lazar.rb b/lib/lazar.rb index 39dd8fa..0e2cec2 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -81,5 +81,6 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveO "crossvalidation.rb", "leave-one-out-validation.rb", "experiment.rb", + "import.rb", ].each{ |f| require_relative f } OpenTox::PhysChem.descriptors # load descriptor features diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 3783ece..cda431a 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -3,13 +3,48 @@ module OpenTox class Nanoparticle include OpenTox - field :particle_id, type: String + #field :particle_id, type: String field :core, type: String - field :coatings, type: Array + field :coating, type: Array, default: [] - #field :physchem_descriptors, type: Hash, default: {} - #field :toxicities, type: Hash, default: {} - field :features, type: Hash, default: {} + field :physchem_descriptors, type: Hash, default: {} + field :toxicities, type: Hash, default: {} + #field :features, type: Hash, default: {} + field :bundles, type: Array, default: [] + + def predict + end + + def add_feature feature, value + if feature.source.match /property\/P-CHEM/ + physchem_descriptors[feature.id.to_s] ||= [] + physchem_descriptors[feature.id.to_s] << value + elsif feature.source.match /property\/TOX/ + toxicities[feature.id.to_s] ||= [] + toxicities[feature.id.to_s] << value + else + $logger.warn "Unknown feature type '#{feature.source}'. Value '#{value}' not inserted." + warnings << "Unknown feature type '#{feature.source}'. Value '#{value}' not inserted." + end + end + + def parse_ambit_value feature, v + if v.keys == ["loValue"] + add_feature feature, v["loValue"] + elsif v.keys.size == 2 and v["loQualifier"] == "mean" + add_feature feature, {:mean => v["loValue"]} + elsif v.keys.size == 2 and v["loQualifier"] #== ">=" + add_feature feature, {:min => v["loValue"],:max => Float::INFINITY} + elsif v.keys.size == 2 and v["upQualifier"] #== ">=" + add_feature feature, {:max => v["upValue"],:min => -Float::INFINITY} + elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] + add_feature feature, {:min => v["loValue"],:max => v["upValue"]} + elsif v == {} # do nothing + else + $logger.warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'." + warnings << "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'." + end + end end end -- cgit v1.2.3 From 515e644423998a94f07be06bf6460bcf4f96f968 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 8 Apr 2016 13:05:52 +0200 Subject: enm import test --- lib/import.rb | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 lib/import.rb (limited to 'lib') diff --git a/lib/import.rb b/lib/import.rb new file mode 100644 index 0000000..86c633a --- /dev/null +++ b/lib/import.rb @@ -0,0 +1,77 @@ +module OpenTox + + module Import + + class Enanomapper + include OpenTox + + def self.import + #get list of bundle URIs + bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] + bundles.each do |bundle| + uri = bundle["URI"] + nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"] + features = JSON.parse(RestClientWrapper.get(bundle["property"]+"?media=application%2Fjson"))["feature"] + nanoparticles.each do |np| + nanoparticle = Nanoparticle.find_or_create_by( + :name => np["values"]["https://data.enanomapper.net/identifier/name"], + :source => np["compound"]["URI"], + ) + nanoparticle.bundles << uri + np["composition"].each do |comp| + case comp["relation"] + when "HAS_CORE" + nanoparticle.core = comp["component"]["compound"]["URI"] + when "HAS_COATING" + nanoparticle.coating << comp["component"]["compound"]["URI"] + end + end if np["composition"] + np["values"].each do |u,v| + if u.match(/property/) + name, unit, source = nil + features.each do |uri,feat| + if u.match(/#{uri}/) + name = feat["title"] + unit = feat["units"] + source = uri + end + end + feature = Feature.find_or_create_by( + :name => name, + :unit => unit, + :source => source + ) + end + v.each{|value| nanoparticle.parse_ambit_value feature, value} if v.is_a? Array + end + nanoparticle.bundles.uniq! + nanoparticle.physchem_descriptors.each{|f,v| v.uniq!} + nanoparticle.toxicities.each{|f,v| v.uniq!} + nanoparticle.save! + end + end + + def self.dump + #get list of bundle URIs + `wget 'https://data.enanomapper.net/bundle?media=application%2Fjson' -O bundles.json` + json = JSON.parse File.read('./bundles.json') + json["dataset"].each do |dataset| + uri = dataset["URI"] + id = uri.split("/").last + `wget --header='accept:application/json' '#{uri}' -O 'bundle#{id}'` + `wget --header='accept:application/json' '#{dataset["summary"]}' -O 'summary#{id}.json'` + `wget --header='accept:application/json' '#{dataset["compound"]}' -O 'compound#{id}.json'` + `wget --header='accept:application/json' '#{dataset["substance"]}' -O 'substance#{id}.json'` + `wget --header='accept:application/json' '#{dataset["property"]}' -O 'property#{id}.json'` + `wget --header='accept:application/json' '#{dataset["dataset"]}' -O 'dataset#{id}.json'` + `wget --header='accept:application/json' '#{dataset["matrix"]}' -O 'matrix#{id}.json'` + end + end + end + + end + + end + +end + -- cgit v1.2.3 From 84222bae2bbb9fb3e0ce3e65de1be8e7f94d2147 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 12 Apr 2016 12:37:37 +0200 Subject: new dataset structure --- lib/compound.rb | 10 ++- lib/crossvalidation.rb | 1 - lib/dataset.rb | 173 ++++++++++++++++--------------------------------- lib/lazar.rb | 3 +- lib/model.rb | 39 +++++++++++ lib/nanoparticle.rb | 3 +- 6 files changed, 103 insertions(+), 126 deletions(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index a7518ed..84d8891 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -2,10 +2,8 @@ CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/" module OpenTox - class Compound + class Compound < Substance require_relative "unique_descriptors.rb" - include OpenTox - DEFAULT_FINGERPRINT = "MP2D" field :inchi, type: String @@ -347,14 +345,14 @@ module OpenTox end - # Convert mg to mmol + # Convert mmol to mg # @return [Float] value in mg def mmol_to_mg mmol mmol.to_f*molecular_weight end - # Convert mmol to mg - # @return [Float] value in mg + # Convert mg to mmol + # @return [Float] value in mmol def mg_to_mmol mg mg.to_f/molecular_weight end diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 15dfb21..b7cd7bf 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -297,5 +297,4 @@ module OpenTox end end - end diff --git a/lib/dataset.rb b/lib/dataset.rb index 2e48626..5c04382 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -6,21 +6,25 @@ module OpenTox class Dataset # associations like has_many, belongs_to deteriorate performance - field :feature_ids, type: Array, default: [] - field :compound_ids, type: Array, default: [] - field :data_entries, type: Array, default: [] + #field :feature_ids, type: Array, default: [] + #field :substance_ids, type: Array, default: [] + field :data_entries, type: Hash, default: {} # Readers - # Get all compounds def compounds - @compounds ||= self.compound_ids.collect{|id| OpenTox::Compound.find id} - @compounds + substances.select{|s| s.is_a? Compound} + end + + # Get all substances + def substances + @substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id} + @substances end # Get all features def features - @features ||= self.feature_ids.collect{|id| OpenTox::Feature.find(id)} + @features ||= data_entries.collect{|cid,f| f.keys}.flatten.uniq.collect{|id| OpenTox::Feature.find(id)} @features end @@ -29,22 +33,20 @@ module OpenTox # @param feature [OpenTox::Feature] OpenTox Feature object # @return [Array] Data entry values def values(compound, feature) - rows = compound_ids.each_index.select{|r| compound_ids[r] == compound.id } - col = feature_ids.index feature.id - rows.collect{|row| data_entries[row][col]} + data_entries[compound.id,feature.id] end # Writers # Set compounds def compounds=(compounds) - self.compound_ids = compounds.collect{|c| c.id} + self.substance_ids = compounds.collect{|c| c.id} end # Set features - def features=(features) - self.feature_ids = features.collect{|f| f.id} - end + #def features=(features) + #self.feature_ids = features.collect{|f| f.id} + #end # Dataset operations @@ -52,13 +54,8 @@ module OpenTox # @param [Integer] number of folds # @return [Array] Array with folds [training_dataset,test_dataset] def folds n - unique_compound_data = {} - compound_ids.each_with_index do |cid,i| - unique_compound_data[cid] ||= [] - unique_compound_data[cid] << data_entries[i] - end - unique_compound_ids = unique_compound_data.keys - len = unique_compound_ids.size + substance_ids = data_entries.keys + len = substance_ids.size indices = (0..len-1).to_a.shuffle mid = (len/n) chunks = [] @@ -67,19 +64,19 @@ module OpenTox last = start+mid last = last-1 unless len%n >= i test_idxs = indices[start..last] || [] - test_cids = test_idxs.collect{|i| unique_compound_ids[i]} + test_cids = test_idxs.collect{|i| substance_ids[i]} training_idxs = indices-test_idxs - training_cids = training_idxs.collect{|i| unique_compound_ids[i]} - chunk = [training_cids,test_cids].collect do |unique_cids| - cids = [] - data_entries = [] - unique_cids.each do |cid| - unique_compound_data[cid].each do |de| - cids << cid - data_entries << de + training_cids = training_idxs.collect{|i| substance_ids[i]} + chunk = [training_cids,test_cids].collect do |cids| + new_cids = [] + new_data_entries = [] + cids.each do |cid| + data_entries[cid].each do |de| + new_cids << cid + new_data_entries << de end end - dataset = self.class.new(:compound_ids => cids, :feature_ids => self.feature_ids, :data_entries => data_entries, :source => self.id ) + dataset = self.class.new(:data_entries => data_entries, :source => self.id ) dataset.compounds.each do |compound| compound.dataset_ids << dataset.id compound.save @@ -96,27 +93,7 @@ module OpenTox # Diagnostics def duplicates feature=self.features.first - col = feature_ids.index feature.id - dups = {} - compound_ids.each_with_index do |cid,i| - rows = compound_ids.each_index.select{|r| compound_ids[r] == cid } - values = rows.collect{|row| data_entries[row][col]} - dups[cid] = values if values.size > 1 - end - dups - end - - def correlation_plot training_dataset - # TODO: create/store svg - R.assign "features", data_entries - R.assign "activities", training_dataset.data_entries.collect{|de| de.first} - R.eval "featurePlot(features,activities)" - end - - def density_plot - # TODO: create/store svg - R.assign "acts", data_entries.collect{|r| r.first }#.compact - R.eval "plot(density(-log(acts),na.rm= TRUE), main='-log(#{features.first.name})')" + data_entries.select{|sid,f| f[feature.id].size > 1} end # Serialisation @@ -124,10 +101,15 @@ module OpenTox # converts dataset to csv format including compound smiles as first column, other column headers are feature names # @return [String] def to_csv(inchi=false) - CSV.generate() do |csv| #{:force_quotes=>true} + CSV.generate() do |csv| csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} - compounds.each_with_index do |c,i| - csv << [inchi ? c.inchi : c.smiles] + data_entries[i] + data_entries.each do |sid,f| + substance = Substance.find cid + features.each do |feature| + f[feature.id].each do |v| + csv << [inchi ? substance.inchi : substance.smiles , v] + end + end end end end @@ -143,7 +125,7 @@ module OpenTox # Create a dataset from CSV file # TODO: document structure - def self.from_csv_file file, source=nil, bioassay=true#, layout={} + def self.from_csv_file file, source=nil source ||= file name = File.basename(file,".*") dataset = self.find_by(:source => source, :name => name) @@ -153,21 +135,22 @@ module OpenTox $logger.debug "Parsing #{file}." table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8' dataset = self.new(:source => source, :name => name) - dataset.parse_table table, bioassay#, layout + dataset.parse_table table end dataset end # parse data in tabular format (e.g. from csv) # does a lot of guesswork in order to determine feature types - def parse_table table, bioassay=true + def parse_table table time = Time.now # features feature_names = table.shift.collect{|f| f.strip} - warnings << "Duplicate features in table header." unless feature_names.size == feature_names.uniq.size + warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size compound_format = feature_names.shift.strip + # TODO nanoparticles bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i numeric = [] @@ -176,30 +159,20 @@ module OpenTox metadata = {:name => f} values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact types = values.collect{|v| v.numeric? ? true : false}.uniq + feature = nil if values.size == 0 # empty feature elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes metadata["numeric"] = true numeric[i] = true + feature = NumericFeature.find_or_create_by(metadata) else metadata["nominal"] = true metadata["accept_values"] = values numeric[i] = false + feature = NominalFeature.find_or_create_by(metadata) end - if bioassay - if metadata["numeric"] - feature = NumericBioAssay.find_or_create_by(metadata) - elsif metadata["nominal"] - feature = NominalBioAssay.find_or_create_by(metadata) - end - else - metadata.merge({:measured => false, :calculated => true}) - if metadata["numeric"] - feature = NumericFeature.find_or_create_by(metadata) - elsif metadata["nominal"] - feature = NominalFeature.find_or_create_by(metadata) - end - end - feature_ids << feature.id if feature + @features ||= [] + @features << feature if feature end $logger.debug "Feature values: #{Time.now-time}" @@ -210,7 +183,6 @@ module OpenTox value_time = 0 # compounds and values - self.data_entries = [] table.each_with_index do |vals,i| ct = Time.now @@ -222,6 +194,7 @@ module OpenTox compound = OpenTox::Compound.from_smiles(identifier) when /InChI/i compound = OpenTox::Compound.from_inchi(identifier) + # TODO nanoparticle end rescue compound = nil @@ -235,13 +208,13 @@ module OpenTox compound_time += Time.now-ct r += 1 - unless vals.size == feature_ids.size # way cheaper than accessing features + unless vals.size == @features.size warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored." next end - compound_ids << compound.id - table.first.size == 0 ? self.data_entries << Array.new(0) : self.data_entries << Array.new(table.first.size-1) + #substance_ids << compound.id + #table.first.size == 0 ? self.data_entries[compound.id] = Array.new(0) : self.data_entries[compound.id] = Array.new(table.first.size-1) vals.each_with_index do |v,j| if v.blank? @@ -252,10 +225,13 @@ module OpenTox else v = v.strip end - self.data_entries.last[j] = v + self.data_entries[compound.id.to_s] ||= {} + self.data_entries[compound.id.to_s][@features[j].id.to_s] ||= [] + self.data_entries[compound.id.to_s][@features[j].id.to_s] << v #i = compound.feature_ids.index feature_ids[j] - compound.features[feature_ids[j].to_s] ||= [] - compound.features[feature_ids[j].to_s] << v + #TODO + #compound.features[feature_ids[j].to_s] ||= [] + #compound.features[feature_ids[j].to_s] << v compound.save end end @@ -272,17 +248,6 @@ module OpenTox end - # Fill unset data entries - # @param any value - def fill_nil_with n - (0 .. compound_ids.size-1).each do |i| - data_entries[i] ||= [] - (0 .. feature_ids.size-1).each do |j| - data_entries[i][j] ||= n - end - end - end - end # Dataset for lazar predictions @@ -296,28 +261,4 @@ module OpenTox end - # Dataset for descriptors (physchem) - class DescriptorDataset < Dataset - field :feature_calculation_algorithm, type: String - - end - - class ScaledDataset < DescriptorDataset - - field :centers, type: Array, default: [] - field :scales, type: Array, default: [] - - def original_value value, i - value * scales[i] + centers[i] - end - end - - # Dataset for fminer descriptors - class FminerDataset < DescriptorDataset - field :training_algorithm, type: String - field :training_dataset_id, type: BSON::ObjectId - field :training_feature_id, type: BSON::ObjectId - field :training_parameters, type: Hash - end - end diff --git a/lib/lazar.rb b/lib/lazar.rb index 0e2cec2..2bcecc5 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -61,7 +61,8 @@ suppressPackageStartupMessages({ " # OpenTox classes and includes -CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment","Nanoparticle"]# Algorithm and Models are modules +#CLASSES = ["Feature","Substance::Compound","Substance::Nanoparticle","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules +CLASSES = ["Feature","Substance","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules [ # be aware of the require sequence as it affects class/method overwrites "overwrite.rb", diff --git a/lib/model.rb b/lib/model.rb index 8e657b8..1f9942b 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -227,6 +227,45 @@ module OpenTox end end + class NanoLazar + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + store_in collection: "models" + + field :name, type: String + field :creator, type: String, default: __FILE__ + # datasets + field :training_dataset_id, type: BSON::ObjectId + # algorithms + field :prediction_algorithm, type: String + # prediction feature + field :prediction_feature_id, type: BSON::ObjectId + field :training_particle_ids, type: Array + + def self.create_all + nanoparticles = Nanoparticle.all + toxfeatures = Nanoparticle.all.collect{|np| np.toxicities.keys}.flatten.uniq.collect{|id| Feature.find id} + tox = {} + toxfeatures.each do |t| + tox[t] = nanoparticles.select{|np| np.toxicities.keys.include? t.id.to_s} + end + tox.select!{|t,nps| nps.size > 50} + tox.collect do |t,nps| + find_or_create_by(:prediction_feature_id => t.id, :training_particle_ids => nps.collect{|np| np.id}) + end + end + + def predict nanoparticle + training = training_particle_ids.collect{|id| Nanoparticle.find id} + training_features = training.collect{|t| t.physchem_descriptors.keys}.flatten.uniq + query_features = nanoparticle.physchem_descriptors.keys + common_features = (training_features & query_features) + p common_features + end + + end + end end diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index cda431a..c58dc8c 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -1,9 +1,8 @@ module OpenTox - class Nanoparticle + class Nanoparticle < Substance include OpenTox - #field :particle_id, type: String field :core, type: String field :coating, type: Array, default: [] -- cgit v1.2.3 From a8368dda776c05331474adf7eaf9a6e413a3b1eb Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 13 Apr 2016 15:15:51 +0200 Subject: validation tests pass --- lib/compound.rb | 2 +- lib/crossvalidation.rb | 109 +++------------------------------------- lib/dataset.rb | 40 +++++++-------- lib/lazar.rb | 3 +- lib/leave-one-out-validation.rb | 108 +++++++-------------------------------- lib/model.rb | 23 +++++---- lib/validation.rb | 62 ++++------------------- 7 files changed, 71 insertions(+), 276 deletions(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index 84d8891..757ba1a 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -341,7 +341,7 @@ module OpenTox {'$sort' => {'tanimoto' => -1}} ] - $mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]} + $mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]} end diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index b7cd7bf..f93a04c 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -6,7 +6,7 @@ module OpenTox field :folds, type: Integer field :nr_instances, type: Integer field :nr_unpredicted, type: Integer - field :predictions, type: Array, default: [] + field :predictions, type: Hash, default: {} field :finished_at, type: Time def time @@ -32,7 +32,7 @@ module OpenTox cv.save # set created_at nr_instances = 0 nr_unpredicted = 0 - predictions = [] + predictions = {} training_dataset = Dataset.find model.training_dataset_id training_dataset.folds(n).each_with_index do |fold,fold_nr| #fork do # parallel execution of validations @@ -42,12 +42,12 @@ module OpenTox $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" #end end - #Process.waitall + Process.waitall cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id) cv.validations.each do |validation| nr_instances += validation.nr_instances nr_unpredicted += validation.nr_unpredicted - predictions += validation.predictions + predictions.merge! validation.predictions end cv.update_attributes( nr_instances: nr_instances, @@ -73,61 +73,8 @@ module OpenTox # TODO auc, f-measure (usability??) def statistics - accept_values = Feature.find(model.prediction_feature_id).accept_values - confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} - weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} - true_rate = {} - predictivity = {} - predictions.each do |pred| - compound_id,activities,prediction,confidence = pred - if activities and prediction #and confidence.numeric? - if activities.uniq.size == 1 - activity = activities.uniq.first - if prediction == activity - if prediction == accept_values[0] - confusion_matrix[0][0] += 1 - #weighted_confusion_matrix[0][0] += confidence - elsif prediction == accept_values[1] - confusion_matrix[1][1] += 1 - #weighted_confusion_matrix[1][1] += confidence - end - elsif prediction != activity - if prediction == accept_values[0] - confusion_matrix[0][1] += 1 - #weighted_confusion_matrix[0][1] += confidence - elsif prediction == accept_values[1] - confusion_matrix[1][0] += 1 - #weighted_confusion_matrix[1][0] += confidence - end - end - end - else - nr_unpredicted += 1 if prediction.nil? - end - end - true_rate = {} - predictivity = {} - accept_values.each_with_index do |v,i| - true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f - predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f - end - confidence_sum = 0 - #weighted_confusion_matrix.each do |r| - #r.each do |c| - #confidence_sum += c - #end - #end - update_attributes( - accept_values: accept_values, - confusion_matrix: confusion_matrix, - #weighted_confusion_matrix: weighted_confusion_matrix, - accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f, - #weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f, - true_rate: true_rate, - predictivity: predictivity, - finished_at: Time.now - ) - $logger.debug "Accuracy #{accuracy}" + stat = ValidationStatistics.classification(predictions, Feature.find(model.prediction_feature_id).accept_values) + update_attributes(stat) end def confidence_plot @@ -169,48 +116,8 @@ module OpenTox field :correlation_plot_id, type: BSON::ObjectId def statistics - rmse = 0 - mae = 0 - x = [] - y = [] - predictions.each do |pred| - compound_id,activity,prediction,confidence = pred - if activity and prediction - unless activity == [nil] - x << -Math.log10(activity.median) - y << -Math.log10(prediction) - error = Math.log10(prediction)-Math.log10(activity.median) - rmse += error**2 - #weighted_rmse += confidence*error**2 - mae += error.abs - #weighted_mae += confidence*error.abs - #confidence_sum += confidence - end - else - warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." - $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." - end - end - R.assign "measurement", x - R.assign "prediction", y - R.eval "r <- cor(measurement,prediction,use='complete')" - r = R.eval("r").to_ruby - - mae = mae/predictions.size - #weighted_mae = weighted_mae/confidence_sum - rmse = Math.sqrt(rmse/predictions.size) - #weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) - update_attributes( - mae: mae, - rmse: rmse, - #weighted_mae: weighted_mae, - #weighted_rmse: weighted_rmse, - r_squared: r**2, - finished_at: Time.now - ) - $logger.debug "R^2 #{r**2}" - $logger.debug "RMSE #{rmse}" - $logger.debug "MAE #{mae}" + stat = ValidationStatistics.regression predictions + update_attributes(stat) end def misclassifications n=nil diff --git a/lib/dataset.rb b/lib/dataset.rb index 5c04382..25307c9 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -5,9 +5,6 @@ module OpenTox class Dataset - # associations like has_many, belongs_to deteriorate performance - #field :feature_ids, type: Array, default: [] - #field :substance_ids, type: Array, default: [] field :data_entries, type: Hash, default: {} # Readers @@ -24,7 +21,7 @@ module OpenTox # Get all features def features - @features ||= data_entries.collect{|cid,f| f.keys}.flatten.uniq.collect{|id| OpenTox::Feature.find(id)} + @features ||= data_entries.collect{|cid,f| f.first}.flatten.uniq.collect{|id| OpenTox::Feature.find(id)} @features end @@ -33,7 +30,7 @@ module OpenTox # @param feature [OpenTox::Feature] OpenTox Feature object # @return [Array] Data entry values def values(compound, feature) - data_entries[compound.id,feature.id] + data_entries[compound.id.to_s][feature.id.to_s] end # Writers @@ -68,15 +65,14 @@ module OpenTox training_idxs = indices-test_idxs training_cids = training_idxs.collect{|i| substance_ids[i]} chunk = [training_cids,test_cids].collect do |cids| - new_cids = [] - new_data_entries = [] + new_data_entries = {} cids.each do |cid| - data_entries[cid].each do |de| - new_cids << cid - new_data_entries << de + data_entries[cid].each do |f,v| + new_data_entries[cid] ||= {} + new_data_entries[cid][f] = v end end - dataset = self.class.new(:data_entries => data_entries, :source => self.id ) + dataset = self.class.new(:data_entries => new_data_entries, :source => self.id ) dataset.compounds.each do |compound| compound.dataset_ids << dataset.id compound.save @@ -213,9 +209,6 @@ module OpenTox next end - #substance_ids << compound.id - #table.first.size == 0 ? self.data_entries[compound.id] = Array.new(0) : self.data_entries[compound.id] = Array.new(table.first.size-1) - vals.each_with_index do |v,j| if v.blank? warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})." @@ -228,10 +221,8 @@ module OpenTox self.data_entries[compound.id.to_s] ||= {} self.data_entries[compound.id.to_s][@features[j].id.to_s] ||= [] self.data_entries[compound.id.to_s][@features[j].id.to_s] << v - #i = compound.feature_ids.index feature_ids[j] - #TODO - #compound.features[feature_ids[j].to_s] ||= [] - #compound.features[feature_ids[j].to_s] << v + compound.features[@features[j].id.to_s] ||= [] + compound.features[@features[j].id.to_s] << v compound.save end end @@ -251,14 +242,23 @@ module OpenTox end # Dataset for lazar predictions - class LazarPrediction < Dataset + class LazarPrediction #< Dataset field :creator, type: String - field :prediction_feature_id, type: String + field :prediction_feature_id, type: BSON::ObjectId + field :predictions, type: Hash, default: {} def prediction_feature Feature.find prediction_feature_id end + def compounds + substances.select{|s| s.is_a? Compound} + end + + def substances + predictions.keys.collect{|id| Substance.find id} + end + end end diff --git a/lib/lazar.rb b/lib/lazar.rb index 2bcecc5..a1ad551 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -62,7 +62,7 @@ suppressPackageStartupMessages({ # OpenTox classes and includes #CLASSES = ["Feature","Substance::Compound","Substance::Nanoparticle","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules -CLASSES = ["Feature","Substance","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules +CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules [ # be aware of the require sequence as it affects class/method overwrites "overwrite.rb", @@ -81,6 +81,7 @@ CLASSES = ["Feature","Substance","Dataset","Validation","CrossValidation","Leave "validation.rb", "crossvalidation.rb", "leave-one-out-validation.rb", + "validation-statistics.rb", "experiment.rb", "import.rb", ].each{ |f| require_relative f } diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index 2cd13db..10fbe85 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -6,20 +6,26 @@ module OpenTox field :dataset_id, type: BSON::ObjectId field :nr_instances, type: Integer field :nr_unpredicted, type: Integer - field :predictions, type: Array + field :predictions, type: Hash field :finished_at, type: Time def self.create model model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOutValidation : klass = RegressionLeaveOneOutValidation loo = klass.new :model_id => model.id, :dataset_id => model.training_dataset_id - compound_ids = model.training_dataset.compound_ids predictions = model.predict model.training_dataset.compounds - predictions = predictions.each_with_index {|p,i| p[:compound_id] = compound_ids[i]} - predictions.select!{|p| p[:database_activities] and !p[:database_activities].empty?} + predictions.each{|cid,p| p.delete(:neighbors)} + nr_unpredicted = 0 + predictions.each do |cid,prediction| + if prediction[:value] + prediction[:measured] = model.training_dataset.data_entries[cid][prediction[:prediction_feature_id].to_s] + else + nr_unpredicted += 1 + end + predictions.delete(cid) unless prediction[:value] and prediction[:measured] + end loo.nr_instances = predictions.size - predictions.select!{|p| p[:value]} # remove unpredicted - loo.predictions = predictions#.sort{|a,b| b[:confidence] <=> a[:confidence]} - loo.nr_unpredicted = loo.nr_instances - loo.predictions.size + loo.nr_unpredicted = nr_unpredicted + loo.predictions = predictions loo.statistics loo.save loo @@ -42,53 +48,8 @@ module OpenTox field :confidence_plot_id, type: BSON::ObjectId def statistics - accept_values = Feature.find(model.prediction_feature_id).accept_values - confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} - weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} - predictions.each do |pred| - pred[:database_activities].each do |db_act| - if pred[:value] - if pred[:value] == db_act - if pred[:value] == accept_values[0] - confusion_matrix[0][0] += 1 - weighted_confusion_matrix[0][0] += pred[:confidence] - elsif pred[:value] == accept_values[1] - confusion_matrix[1][1] += 1 - weighted_confusion_matrix[1][1] += pred[:confidence] - end - else - if pred[:value] == accept_values[0] - confusion_matrix[0][1] += 1 - weighted_confusion_matrix[0][1] += pred[:confidence] - elsif pred[:value] == accept_values[1] - confusion_matrix[1][0] += 1 - weighted_confusion_matrix[1][0] += pred[:confidence] - end - end - end - end - end - accept_values.each_with_index do |v,i| - true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f - predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f - end - confidence_sum = 0 - weighted_confusion_matrix.each do |r| - r.each do |c| - confidence_sum += c - end - end - update_attributes( - accept_values: accept_values, - confusion_matrix: confusion_matrix, - weighted_confusion_matrix: weighted_confusion_matrix, - accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f, - weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f, - true_rate: true_rate, - predictivity: predictivity, - finished_at: Time.now - ) - $logger.debug "Accuracy #{accuracy}" + stat = ValidationStatistics.classification(predictions, Feature.find(model.prediction_feature_id).accept_values) + update_attributes(stat) end def confidence_plot @@ -132,43 +93,10 @@ module OpenTox field :correlation_plot_id, type: BSON::ObjectId field :confidence_plot_id, type: BSON::ObjectId + def statistics - confidence_sum = 0 - predicted_values = [] - measured_values = [] - predictions.each do |pred| - pred[:database_activities].each do |activity| - if pred[:value] - predicted_values << pred[:value] - measured_values << activity - error = Math.log10(pred[:value])-Math.log10(activity) - self.rmse += error**2 - #self.weighted_rmse += pred[:confidence]*error**2 - self.mae += error.abs - #self.weighted_mae += pred[:confidence]*error.abs - #confidence_sum += pred[:confidence] - end - end - if pred[:database_activities].empty? - warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." - $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." - end - end - R.assign "measurement", measured_values - R.assign "prediction", predicted_values - R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')" - r = R.eval("r").to_ruby - - self.mae = self.mae/predictions.size - #self.weighted_mae = self.weighted_mae/confidence_sum - self.rmse = Math.sqrt(self.rmse/predictions.size) - #self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum) - self.r_squared = r**2 - self.finished_at = Time.now - save - $logger.debug "R^2 #{r**2}" - $logger.debug "RMSE #{rmse}" - $logger.debug "MAE #{mae}" + stat = ValidationStatistics.regression predictions + update_attributes(stat) end def correlation_plot diff --git a/lib/model.rb b/lib/model.rb index 1f9942b..5140d5a 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -90,33 +90,36 @@ module OpenTox end # make predictions - predictions = [] - predictions = compounds.collect{|c| predict_compound c} + predictions = {} + compounds.each do |c| + predictions[c.id.to_s] = predict_compound c + predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id + end # serialize result case object.class.to_s when "OpenTox::Compound" - prediction = predictions.first + prediction = predictions[compounds.first.id.to_s] prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity - return prediction + return predictions when "Array" return predictions when "OpenTox::Dataset" + predictions.each{|cid,p| p.delete(:neighbors)} # prepare prediction dataset measurement_feature = Feature.find prediction_feature_id - prediction_feature = OpenTox::NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" ) + prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" ) prediction_dataset = LazarPrediction.new( :name => "Lazar prediction for #{prediction_feature.name}", :creator => __FILE__, :prediction_feature_id => prediction_feature.id ) - confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Model RMSE" ) - warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings") - prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ] - prediction_dataset.compounds = compounds - prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:rmse] , p[:dataset_activities].to_s, p[:warning]]} + + compounds.each_with_index do |c,i| + prediction_dataset.predictions[c.id.to_s] = predictions[i] + end prediction_dataset.save return prediction_dataset end diff --git a/lib/validation.rb b/lib/validation.rb index b72d273..484e22e 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -8,7 +8,7 @@ module OpenTox field :test_dataset_id, type: BSON::ObjectId field :nr_instances, type: Integer field :nr_unpredicted, type: Integer - field :predictions, type: Array + field :predictions, type: Hash def prediction_dataset Dataset.find prediction_dataset_id @@ -29,30 +29,22 @@ module OpenTox atts[:training_dataset_id] = training_set.id validation_model = model.class.create training_set, atts validation_model.save - cids = test_set.compound_ids - - test_set_without_activities = Dataset.new(:compound_ids => cids.uniq) # remove duplicates and make sure that activities cannot be used - prediction_dataset = validation_model.predict test_set_without_activities - predictions = [] + predictions = validation_model.predict test_set.compounds + predictions.each{|cid,p| p.delete(:neighbors)} nr_unpredicted = 0 - activities = test_set.data_entries.collect{|de| de.first} - prediction_dataset.data_entries.each_with_index do |de,i| - if de[0] #and de[1] - cid = prediction_dataset.compound_ids[i] - rows = cids.each_index.select{|r| cids[r] == cid } - activities = rows.collect{|r| test_set.data_entries[r][0]} - prediction = de.first - confidence = de[1] - predictions << [prediction_dataset.compound_ids[i], activities, prediction, de[1]] + predictions.each do |cid,prediction| + if prediction[:value] + prediction[:measured] = test_set.data_entries[cid][prediction[:prediction_feature_id].to_s] else nr_unpredicted += 1 end + predictions.delete(cid) unless prediction[:value] and prediction[:measured] end validation = self.new( :model_id => validation_model.id, - :prediction_dataset_id => prediction_dataset.id, + #:prediction_dataset_id => prediction_dataset.id, :test_dataset_id => test_set.id, - :nr_instances => test_set.compound_ids.size, + :nr_instances => test_set.compounds.size, :nr_unpredicted => nr_unpredicted, :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence ) @@ -67,42 +59,6 @@ module OpenTox end class RegressionValidation < Validation - - def statistics - rmse = 0 - weighted_rmse = 0 - rse = 0 - weighted_rse = 0 - mae = 0 - weighted_mae = 0 - confidence_sum = 0 - predictions.each do |pred| - compound_id,activity,prediction,confidence = pred - if activity and prediction - error = Math.log10(prediction)-Math.log10(activity.median) - rmse += error**2 - weighted_rmse += confidence*error**2 - mae += error.abs - weighted_mae += confidence*error.abs - confidence_sum += confidence - else - warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." - $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." - end - end - x = predictions.collect{|p| p[1].median} - y = predictions.collect{|p| p[2]} - R.assign "measurement", x - R.assign "prediction", y - R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')" - r = R.eval("r").to_ruby - - mae = mae/predictions.size - weighted_mae = weighted_mae/confidence_sum - rmse = Math.sqrt(rmse/predictions.size) - weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) - { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae } - end end end -- cgit v1.2.3 From 815cf6ba1543fc323eb7cbd1202fadbf03bcfbca Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 13 Apr 2016 15:35:01 +0200 Subject: new files added --- lib/substance.rb | 10 +++++ lib/validation-statistics.rb | 100 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100644 lib/substance.rb create mode 100644 lib/validation-statistics.rb (limited to 'lib') diff --git a/lib/substance.rb b/lib/substance.rb new file mode 100644 index 0000000..a5b9825 --- /dev/null +++ b/lib/substance.rb @@ -0,0 +1,10 @@ +module OpenTox + + class Substance + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + end + +end + diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb new file mode 100644 index 0000000..570b2d4 --- /dev/null +++ b/lib/validation-statistics.rb @@ -0,0 +1,100 @@ +module OpenTox + class ValidationStatistics + include OpenTox + def self.classification predictions, accept_values + confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} + weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} + true_rate = {} + predictivity = {} + nr_instances = 0 + predictions.each do |cid,pred| + # TODO use measured majority class + if pred[:measured].uniq.size == 1 + m = pred[:measured].first + #pred[:measured].each do |m| + if pred[:value] == m + if pred[:value] == accept_values[0] + confusion_matrix[0][0] += 1 + weighted_confusion_matrix[0][0] += pred[:confidence] + nr_instances += 1 + elsif pred[:value] == accept_values[1] + confusion_matrix[1][1] += 1 + weighted_confusion_matrix[1][1] += pred[:confidence] + nr_instances += 1 + end + elsif pred[:value] != m + if pred[:value] == accept_values[0] + confusion_matrix[0][1] += 1 + weighted_confusion_matrix[0][1] += pred[:confidence] + nr_instances += 1 + elsif pred[:value] == accept_values[1] + confusion_matrix[1][0] += 1 + weighted_confusion_matrix[1][0] += pred[:confidence] + nr_instances += 1 + end + end + end + end + true_rate = {} + predictivity = {} + accept_values.each_with_index do |v,i| + true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f + predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f + end + confidence_sum = 0 + weighted_confusion_matrix.each do |r| + r.each do |c| + confidence_sum += c + end + end + accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f + $logger.debug "Accuracy #{accuracy}" + { + :accept_values => accept_values, + :confusion_matrix => confusion_matrix, + :weighted_confusion_matrix => weighted_confusion_matrix, + :accuracy => accuracy, + :weighted_accuracy => (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f, + :true_rate => true_rate, + :predictivity => predictivity, + :finished_at => Time.now + } + end + + def self.regression predictions + # TODO: prediction intervals + rmse = 0 + mae = 0 + x = [] + y = [] + predictions.each do |cid,pred| + if pred[:value] and pred[:measured] #and pred[:measured] != [nil] + x << -Math.log10(pred[:measured].median) + y << -Math.log10(pred[:value]) + error = Math.log10(pred[:value])-Math.log10(pred[:measured].median) + rmse += error**2 + mae += error.abs + else + warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + end + end + R.assign "measurement", x + R.assign "prediction", y + R.eval "r <- cor(measurement,prediction,use='complete')" + r = R.eval("r").to_ruby + + mae = mae/predictions.size + rmse = Math.sqrt(rmse/predictions.size) + $logger.debug "R^2 #{r**2}" + $logger.debug "RMSE #{rmse}" + $logger.debug "MAE #{mae}" + { + :mae => mae, + :rmse => rmse, + :r_squared => r**2, + :finished_at => Time.now + } + end + end +end -- cgit v1.2.3 From 64f1f32ced77afb278bdb7c27397c5299a73675c Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 13 Apr 2016 18:18:36 +0200 Subject: improved enm import --- lib/compound.rb | 2 - lib/import.rb | 105 +++++++++++++++++++++++++++++----------------------- lib/lazar.rb | 1 + lib/nanoparticle.rb | 1 - lib/substance.rb | 5 +-- 5 files changed, 61 insertions(+), 53 deletions(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index 757ba1a..7895619 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -17,8 +17,6 @@ module OpenTox field :sdf_id, type: BSON::ObjectId field :fingerprints, type: Hash, default: {} field :default_fingerprint_size, type: Integer - field :physchem_descriptors, type: Hash, default: {} - field :dataset_ids, type: Array, default: [] # TODO separate between physchem, bio and tox field :features, type: Hash, default: {} diff --git a/lib/import.rb b/lib/import.rb index 86c633a..cf0855e 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -8,64 +8,75 @@ module OpenTox def self.import #get list of bundle URIs bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] + datasets = [] bundles.each do |bundle| uri = bundle["URI"] + dataset = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"]) nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"] features = JSON.parse(RestClientWrapper.get(bundle["property"]+"?media=application%2Fjson"))["feature"] nanoparticles.each do |np| - nanoparticle = Nanoparticle.find_or_create_by( - :name => np["values"]["https://data.enanomapper.net/identifier/name"], - :source => np["compound"]["URI"], - ) - nanoparticle.bundles << uri - np["composition"].each do |comp| - case comp["relation"] - when "HAS_CORE" - nanoparticle.core = comp["component"]["compound"]["URI"] - when "HAS_COATING" - nanoparticle.coating << comp["component"]["compound"]["URI"] - end - end if np["composition"] - np["values"].each do |u,v| - if u.match(/property/) - name, unit, source = nil - features.each do |uri,feat| - if u.match(/#{uri}/) - name = feat["title"] - unit = feat["units"] - source = uri - end + nanoparticle = Nanoparticle.find_or_create_by( + :name => np["values"]["https://data.enanomapper.net/identifier/name"], + :source => np["compound"]["URI"], + ) + dataset.data_entries[nanoparticle.id.to_s] ||= {} + nanoparticle.bundles << uri + nanoparticle.dataset_ids << dataset.id + np["composition"].each do |comp| + case comp["relation"] + when "HAS_CORE" + nanoparticle.core = comp["component"]["compound"]["URI"] + when "HAS_COATING" + nanoparticle.coating << comp["component"]["compound"]["URI"] + end + end if np["composition"] + np["values"].each do |u,v| + if u.match(/property/) + name, unit, source = nil + features.each do |uri,feat| + if u.match(/#{uri}/) + name = feat["title"] + unit = feat["units"] + source = uri end - feature = Feature.find_or_create_by( - :name => name, - :unit => unit, - :source => source - ) end - v.each{|value| nanoparticle.parse_ambit_value feature, value} if v.is_a? Array + feature = Feature.find_or_create_by( + :name => name, + :unit => unit, + :source => source + ) end - nanoparticle.bundles.uniq! - nanoparticle.physchem_descriptors.each{|f,v| v.uniq!} - nanoparticle.toxicities.each{|f,v| v.uniq!} - nanoparticle.save! + v.each{|value| nanoparticle.parse_ambit_value feature, value} if v.is_a? Array + end + nanoparticle.bundles.uniq! + nanoparticle.physchem_descriptors.each{|f,v| v.uniq!} + #nanoparticle.toxicities.each{|f,v| v.uniq!} + nanoparticle.toxicities.each do |f,v| + dataset.data_entries[nanoparticle.id.to_s][f.to_s] ||= [] + dataset.data_entries[nanoparticle.id.to_s][f.to_s] += v + end + nanoparticle.save end + dataset.save + datasets << dataset end + datasets.collect{|d| d.id} + end - def self.dump - #get list of bundle URIs - `wget 'https://data.enanomapper.net/bundle?media=application%2Fjson' -O bundles.json` - json = JSON.parse File.read('./bundles.json') - json["dataset"].each do |dataset| - uri = dataset["URI"] - id = uri.split("/").last - `wget --header='accept:application/json' '#{uri}' -O 'bundle#{id}'` - `wget --header='accept:application/json' '#{dataset["summary"]}' -O 'summary#{id}.json'` - `wget --header='accept:application/json' '#{dataset["compound"]}' -O 'compound#{id}.json'` - `wget --header='accept:application/json' '#{dataset["substance"]}' -O 'substance#{id}.json'` - `wget --header='accept:application/json' '#{dataset["property"]}' -O 'property#{id}.json'` - `wget --header='accept:application/json' '#{dataset["dataset"]}' -O 'dataset#{id}.json'` - `wget --header='accept:application/json' '#{dataset["matrix"]}' -O 'matrix#{id}.json'` - end + def self.dump + #get list of bundle URIs + `wget 'https://data.enanomapper.net/bundle?media=application%2Fjson' -O bundles.json` + json = JSON.parse File.read('./bundles.json') + json["dataset"].each do |dataset| + uri = dataset["URI"] + id = uri.split("/").last + `wget --header='accept:application/json' '#{uri}' -O 'bundle#{id}'` + `wget --header='accept:application/json' '#{dataset["summary"]}' -O 'summary#{id}.json'` + `wget --header='accept:application/json' '#{dataset["compound"]}' -O 'compound#{id}.json'` + `wget --header='accept:application/json' '#{dataset["substance"]}' -O 'substance#{id}.json'` + `wget --header='accept:application/json' '#{dataset["property"]}' -O 'property#{id}.json'` + `wget --header='accept:application/json' '#{dataset["dataset"]}' -O 'dataset#{id}.json'` + `wget --header='accept:application/json' '#{dataset["matrix"]}' -O 'matrix#{id}.json'` end end diff --git a/lib/lazar.rb b/lib/lazar.rb index a1ad551..8eb46e0 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -71,6 +71,7 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","Cross "opentox.rb", "feature.rb", "physchem.rb", + "substance.rb", "compound.rb", "nanoparticle.rb", "dataset.rb", diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index c58dc8c..6e9b0ea 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -6,7 +6,6 @@ module OpenTox field :core, type: String field :coating, type: Array, default: [] - field :physchem_descriptors, type: Hash, default: {} field :toxicities, type: Hash, default: {} #field :features, type: Hash, default: {} field :bundles, type: Array, default: [] diff --git a/lib/substance.rb b/lib/substance.rb index a5b9825..6768ce7 100644 --- a/lib/substance.rb +++ b/lib/substance.rb @@ -1,9 +1,8 @@ module OpenTox class Substance - include OpenTox - include Mongoid::Document - include Mongoid::Timestamps + field :physchem_descriptors, type: Hash, default: {} + field :dataset_ids, type: Array, default: [] end end -- cgit v1.2.3 From 753fcc204d93d86c76860bee6e2f7d0468c3c940 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 14 Apr 2016 19:43:24 +0200 Subject: features/toxicities fixed --- lib/classification.rb | 2 +- lib/compound.rb | 6 ++---- lib/dataset.rb | 29 +++++++++++++++++++++-------- lib/model.rb | 35 ++++++++++++++++------------------- lib/nanoparticle.rb | 30 +++++++++++++++++++----------- lib/opentox.rb | 5 +++++ lib/regression.rb | 35 ++++++++++++++++++++--------------- lib/substance.rb | 1 + 8 files changed, 85 insertions(+), 58 deletions(-) (limited to 'lib') diff --git a/lib/classification.rb b/lib/classification.rb index 0202940..4a17546 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -10,7 +10,7 @@ module OpenTox confidence = 0.0 neighbors.each do |row| sim = row["tanimoto"] - row["features"][params[:prediction_feature_id].to_s].each do |act| + row["toxicities"][params[:prediction_feature_id].to_s].each do |act| weighted_sum[act] ||= 0 weighted_sum[act] += sim end diff --git a/lib/compound.rb b/lib/compound.rb index 7895619..55cd482 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -17,8 +17,6 @@ module OpenTox field :sdf_id, type: BSON::ObjectId field :fingerprints, type: Hash, default: {} field :default_fingerprint_size, type: Integer - # TODO separate between physchem, bio and tox - field :features, type: Hash, default: {} index({smiles: 1}, {unique: true}) @@ -291,7 +289,7 @@ module OpenTox candidate_fingerprint = compound.fingerprint params[:type] sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f feature_values = training_dataset.values(compound,prediction_feature) - neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim] + neighbors << {"_id" => compound.id, "toxicities" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim] end neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]} end @@ -332,7 +330,7 @@ module OpenTox 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]} }}, '_id' => 1, - 'features' => 1, + 'toxicities' => 1, 'dataset_ids' => 1 }}, {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}}, diff --git a/lib/dataset.rb b/lib/dataset.rb index 25307c9..274c475 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -13,6 +13,10 @@ module OpenTox substances.select{|s| s.is_a? Compound} end + def nanoparticles + substances.select{|s| s.is_a? Nanoparticle} + end + # Get all substances def substances @substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id} @@ -21,7 +25,7 @@ module OpenTox # Get all features def features - @features ||= data_entries.collect{|cid,f| f.first}.flatten.uniq.collect{|id| OpenTox::Feature.find(id)} + @features ||= data_entries.collect{|cid,f| f.first}.flatten.uniq.compact.collect{|id| OpenTox::Feature.find(id)}.compact @features end @@ -98,13 +102,22 @@ module OpenTox # @return [String] def to_csv(inchi=false) CSV.generate() do |csv| - csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} + compound = Substance.find(data_entries.first.first).is_a? Compound + if compound + csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} + else + csv << ["Name"] + features.collect{|f| f.name} + end data_entries.each do |sid,f| - substance = Substance.find cid + substance = Substance.find sid features.each do |feature| - f[feature.id].each do |v| - csv << [inchi ? substance.inchi : substance.smiles , v] - end + f[feature.id.to_s].each do |v| + if compound + csv << [inchi ? substance.inchi : substance.smiles , v] + else + csv << [substance.name , v] + end + end if f[feature.id.to_s] end end end @@ -221,8 +234,8 @@ module OpenTox self.data_entries[compound.id.to_s] ||= {} self.data_entries[compound.id.to_s][@features[j].id.to_s] ||= [] self.data_entries[compound.id.to_s][@features[j].id.to_s] << v - compound.features[@features[j].id.to_s] ||= [] - compound.features[@features[j].id.to_s] << v + compound.toxicities[@features[j].id.to_s] ||= [] + compound.toxicities[@features[j].id.to_s] << v compound.save end end diff --git a/lib/model.rb b/lib/model.rb index 5140d5a..1960c10 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -36,6 +36,7 @@ module OpenTox super params # TODO document convention + #p training_dataset.features prediction_feature = training_dataset.features.first # set defaults for empty parameters self.prediction_feature_id ||= prediction_feature.id @@ -56,12 +57,13 @@ module OpenTox prediction = {} if neighbors.collect{|n| n["_id"]}.include? compound.id - database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq + #TODO restrict to dataset features + database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s].uniq prediction[:database_activities] = database_activities prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." neighbors.delete_if{|n| n["_id"] == compound.id} end - neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] } + neighbors.delete_if{|n| n['toxicities'].empty? or n['toxicities'][prediction_feature.id.to_s] == [nil] } if neighbors.empty? prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []}) else @@ -78,12 +80,11 @@ module OpenTox # parse data compounds = [] - case object.class.to_s - when "OpenTox::Compound" + if object.is_a? Substance compounds = [object] - when "Array" + elsif object.is_a? Array compounds = object - when "OpenTox::Dataset" + elsif object.is_a? Dataset compounds = object.compounds else bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter." @@ -97,30 +98,26 @@ module OpenTox end # serialize result - case object.class.to_s - when "OpenTox::Compound" + if object.is_a? Substance prediction = predictions[compounds.first.id.to_s] prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity + return prediction + elsif object.is_a? Array return predictions - when "Array" - return predictions - when "OpenTox::Dataset" + elsif object.is_a? Dataset predictions.each{|cid,p| p.delete(:neighbors)} # prepare prediction dataset measurement_feature = Feature.find prediction_feature_id prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" ) - prediction_dataset = LazarPrediction.new( + prediction_dataset = LazarPrediction.create( :name => "Lazar prediction for #{prediction_feature.name}", :creator => __FILE__, - :prediction_feature_id => prediction_feature.id - + :prediction_feature_id => prediction_feature.id, + :predictions => predictions ) - compounds.each_with_index do |c,i| - prediction_dataset.predictions[c.id.to_s] = predictions[i] - end - prediction_dataset.save + #prediction_dataset.save return prediction_dataset end @@ -264,7 +261,7 @@ module OpenTox training_features = training.collect{|t| t.physchem_descriptors.keys}.flatten.uniq query_features = nanoparticle.physchem_descriptors.keys common_features = (training_features & query_features) - p common_features + #p common_features end end diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 6e9b0ea..0350363 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -5,12 +5,10 @@ module OpenTox field :core, type: String field :coating, type: Array, default: [] - - field :toxicities, type: Hash, default: {} - #field :features, type: Hash, default: {} field :bundles, type: Array, default: [] - def predict + def nanoparticle_neighbors params + Dataset.find(params[:training_dataset_id]).nanoparticles end def add_feature feature, value @@ -21,22 +19,32 @@ module OpenTox toxicities[feature.id.to_s] ||= [] toxicities[feature.id.to_s] << value else - $logger.warn "Unknown feature type '#{feature.source}'. Value '#{value}' not inserted." - warnings << "Unknown feature type '#{feature.source}'. Value '#{value}' not inserted." + warn "Unknown feature type '#{feature.source}'. Value '#{value}' not inserted." end end def parse_ambit_value feature, v + # TODO: units, mmol/log10 conversion if v.keys == ["loValue"] - add_feature feature, v["loValue"] + #if v["loValue"].numeric? + add_feature feature, v["loValue"] + #else + #warn "'#{v["loValue"]}' is not a numeric value, entry ignored." + #end elsif v.keys.size == 2 and v["loQualifier"] == "mean" - add_feature feature, {:mean => v["loValue"]} + #add_feature feature, {:mean => v["loValue"]} + add_feature feature, v["loValue"] + warn "'#{feature.name}' is a mean value. Original data is not available." elsif v.keys.size == 2 and v["loQualifier"] #== ">=" - add_feature feature, {:min => v["loValue"],:max => Float::INFINITY} + #add_feature feature, {:min => v["loValue"],:max => Float::INFINITY} + warn "Only min value available for '#{feature.name}', entry ignored" elsif v.keys.size == 2 and v["upQualifier"] #== ">=" - add_feature feature, {:max => v["upValue"],:min => -Float::INFINITY} + #add_feature feature, {:max => v["upValue"],:min => -Float::INFINITY} + warn "Only max value available for '#{feature.name}', entry ignored" elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] - add_feature feature, {:min => v["loValue"],:max => v["upValue"]} + #add_feature feature, {:min => v["loValue"],:max => v["upValue"]} + add_feature feature, [v["loValue"],v["upValue"]].mean + warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available." elsif v == {} # do nothing else $logger.warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'." diff --git a/lib/opentox.rb b/lib/opentox.rb index cc18cc6..7d8a8a2 100644 --- a/lib/opentox.rb +++ b/lib/opentox.rb @@ -15,6 +15,11 @@ module OpenTox field :name, type: String field :source, type: String field :warnings, type: Array, default: [] + + def warn warning + $logger.warn warning + warnings << warning + end end OpenTox.const_set klass,c end diff --git a/lib/regression.rb b/lib/regression.rb index 5021fb3..cb17f25 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -9,8 +9,8 @@ module OpenTox neighbors = params[:neighbors] neighbors.each do |row| sim = row["tanimoto"] - if row["features"][params[:prediction_feature_id].to_s] - row["features"][params[:prediction_feature_id].to_s].each do |act| + if row["toxicities"][params[:prediction_feature_id].to_s] + row["toxicities"][params[:prediction_feature_id].to_s].each do |act| weighted_sum += sim*Math.log10(act) sim_sum += sim end @@ -32,8 +32,8 @@ module OpenTox neighbors.each_with_index do |row,i| neighbor = Compound.find row["_id"] fingerprint = neighbor.fingerprint - if row["features"][params[:prediction_feature_id].to_s] - row["features"][params[:prediction_feature_id].to_s].each do |act| + if row["toxicities"][params[:prediction_feature_id].to_s] + row["toxicities"][params[:prediction_feature_id].to_s].each do |act| activities << Math.log10(act) weights << row["tanimoto"] fingerprint_ids.each_with_index do |id,j| @@ -79,21 +79,24 @@ module OpenTox neighbors = params[:neighbors] return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 - return {:value => neighbors.first["features"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 + return {:value => neighbors.first["toxicities"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 activities = [] weights = [] physchem = {} - neighbors.each_with_index do |row,i| - neighbor = Compound.find row["_id"] - if row["features"][params[:prediction_feature_id].to_s] - row["features"][params[:prediction_feature_id].to_s].each do |act| - activities << Math.log10(act) - weights << row["tanimoto"] # TODO cosine ? - neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity + neighbors.each_with_index do |n,i| + if n["toxicities"][params[:prediction_feature_id].to_s] + n["toxicities"][params[:prediction_feature_id].to_s].each do |act| + # TODO fix!!!! + activities << -Math.log10(act) + #if act.numeric? + #activities << act + n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? + neighbor = Substance.find(n["_id"]) + neighbor.physchem_descriptors.each do |pid,v| # insert physchem only if there is an activity physchem[pid] ||= [] - physchem[pid] << v + physchem[pid] += v end end end @@ -110,8 +113,8 @@ module OpenTox return result else - data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] } - prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]} + data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid].collect{|v| "\"#{v.sub('[','').sub(']','')}\"" if v.is_a? String }} + prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem_descriptors[pid]} if prediction.nil? prediction = local_weighted_average(compound, params) prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." @@ -127,6 +130,8 @@ module OpenTox def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values R.assign "weights", training_weights r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" + #p r_data_frame + File.open("tmp.R","w+"){|f| f.puts "data <- #{r_data_frame}\n"} R.eval "data <- #{r_data_frame}" R.assign "features", training_features R.eval "names(data) <- append(c('activities'),features)" # diff --git a/lib/substance.rb b/lib/substance.rb index 6768ce7..82ca65d 100644 --- a/lib/substance.rb +++ b/lib/substance.rb @@ -2,6 +2,7 @@ module OpenTox class Substance field :physchem_descriptors, type: Hash, default: {} + field :toxicities, type: Hash, default: {} field :dataset_ids, type: Array, default: [] end -- cgit v1.2.3 From 8aab046eb1ad39aaf10c5a8596102c35c7b2ee0b Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 15 Apr 2016 11:01:16 +0200 Subject: data_entries removed from datasets. datasets are now just containers for compounds and features, feature values have to be retrieved from substances. --- lib/compound.rb | 3 +- lib/crossvalidation.rb | 12 ++++---- lib/dataset.rb | 65 +++++++++++++++-------------------------- lib/leave-one-out-validation.rb | 11 ++++--- lib/model.rb | 44 +++++++++++++++------------- lib/validation.rb | 5 ++-- 6 files changed, 60 insertions(+), 80 deletions(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index 55cd482..049d77b 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -288,8 +288,7 @@ module OpenTox training_dataset.compounds.each do |compound| candidate_fingerprint = compound.fingerprint params[:type] sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f - feature_values = training_dataset.values(compound,prediction_feature) - neighbors << {"_id" => compound.id, "toxicities" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim] + neighbors << {"_id" => compound.id, "toxicities" => {prediction_feature.id.to_s => compound.toxicities[prediction_feature.id.to_s]}, "tanimoto" => sim} if sim >= params[:min_sim] end neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]} end diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index f93a04c..752d393 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -22,8 +22,10 @@ module OpenTox end def self.create model, n=10 - model.training_dataset.features.first.nominal? ? klass = ClassificationCrossValidation : klass = RegressionCrossValidation - bad_request_error "#{dataset.features.first} is neither nominal nor numeric." unless klass + klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification + klass = RegressionCrossValidation if model.is_a? Model::LazarRegression + bad_request_error "Unknown model class #{model.class}." unless klass + cv = klass.new( name: model.name, model_id: model.id, @@ -35,7 +37,7 @@ module OpenTox predictions = {} training_dataset = Dataset.find model.training_dataset_id training_dataset.folds(n).each_with_index do |fold,fold_nr| - #fork do # parallel execution of validations + #fork do # parallel execution of validations can lead to Rserve and memory problems $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" t = Time.now validation = Validation.create(model, fold[0], fold[1],cv) @@ -121,7 +123,6 @@ module OpenTox end def misclassifications n=nil - #n = predictions.size unless n n ||= 10 model = Model::Lazar.find(self.model_id) training_dataset = Dataset.find(model.training_dataset_id) @@ -132,8 +133,7 @@ module OpenTox neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters) neighbors.collect! do |n| neighbor = Compound.find(n[0]) - values = training_dataset.values(neighbor,prediction_feature) - { :smiles => neighbor.smiles, :similarity => n[1], :measurements => values} + { :smiles => neighbor.smiles, :similarity => n[1], :measurements => neighbor.toxicities[prediction_feature.id.to_s]} end { :smiles => compound.smiles, diff --git a/lib/dataset.rb b/lib/dataset.rb index 274c475..fdf1bfc 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -5,7 +5,8 @@ module OpenTox class Dataset - field :data_entries, type: Hash, default: {} + field :substance_ids, type: Array, default: [] + field :feature_ids, type: Array, default: [] # Readers @@ -19,13 +20,13 @@ module OpenTox # Get all substances def substances - @substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id} + @substances ||= substance_ids.collect{|id| OpenTox::Substance.find id} @substances end # Get all features def features - @features ||= data_entries.collect{|cid,f| f.first}.flatten.uniq.compact.collect{|id| OpenTox::Feature.find(id)}.compact + @features ||= feature_ids.collect{|id| OpenTox::Feature.find(id)} @features end @@ -33,9 +34,9 @@ module OpenTox # @param compound [OpenTox::Compound] OpenTox Compound object # @param feature [OpenTox::Feature] OpenTox Feature object # @return [Array] Data entry values - def values(compound, feature) - data_entries[compound.id.to_s][feature.id.to_s] - end + #def values(compound, feature) + #data_entries[compound.id.to_s][feature.id.to_s] + #end # Writers @@ -45,9 +46,9 @@ module OpenTox end # Set features - #def features=(features) - #self.feature_ids = features.collect{|f| f.id} - #end + def features=(features) + self.feature_ids = features.collect{|f| f.id} + end # Dataset operations @@ -55,8 +56,7 @@ module OpenTox # @param [Integer] number of folds # @return [Array] Array with folds [training_dataset,test_dataset] def folds n - substance_ids = data_entries.keys - len = substance_ids.size + len = self.substance_ids.size indices = (0..len-1).to_a.shuffle mid = (len/n) chunks = [] @@ -69,19 +69,11 @@ module OpenTox training_idxs = indices-test_idxs training_cids = training_idxs.collect{|i| substance_ids[i]} chunk = [training_cids,test_cids].collect do |cids| - new_data_entries = {} - cids.each do |cid| - data_entries[cid].each do |f,v| - new_data_entries[cid] ||= {} - new_data_entries[cid][f] = v - end - end - dataset = self.class.new(:data_entries => new_data_entries, :source => self.id ) + dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id ) dataset.compounds.each do |compound| compound.dataset_ids << dataset.id compound.save end - dataset.save dataset end start = last+1 @@ -90,12 +82,6 @@ module OpenTox chunks end - # Diagnostics - - def duplicates feature=self.features.first - data_entries.select{|sid,f| f[feature.id].size > 1} - end - # Serialisation # converts dataset to csv format including compound smiles as first column, other column headers are feature names @@ -161,7 +147,6 @@ module OpenTox compound_format = feature_names.shift.strip # TODO nanoparticles bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i - numeric = [] # guess feature types feature_names.each_with_index do |f,i| @@ -180,8 +165,7 @@ module OpenTox numeric[i] = false feature = NominalFeature.find_or_create_by(metadata) end - @features ||= [] - @features << feature if feature + feature_ids << feature.id if feature end $logger.debug "Feature values: #{Time.now-time}" @@ -196,7 +180,7 @@ module OpenTox table.each_with_index do |vals,i| ct = Time.now identifier = vals.shift.strip - warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty? + warn "No feature values for compound at position #{i+2}." if vals.compact.empty? begin case compound_format when /SMILES/i @@ -208,41 +192,38 @@ module OpenTox rescue compound = nil end - if compound.nil? - # compound parsers may return nil - warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored." + if compound.nil? # compound parsers may return nil + warn "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored." next end + substance_ids << compound.id compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id compound_time += Time.now-ct r += 1 - unless vals.size == @features.size - warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored." + unless vals.size == feature_ids.size + warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored." next end vals.each_with_index do |v,j| if v.blank? - warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})." + warn "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})." next elsif numeric[j] v = v.to_f else v = v.strip end - self.data_entries[compound.id.to_s] ||= {} - self.data_entries[compound.id.to_s][@features[j].id.to_s] ||= [] - self.data_entries[compound.id.to_s][@features[j].id.to_s] << v - compound.toxicities[@features[j].id.to_s] ||= [] - compound.toxicities[@features[j].id.to_s] << v + compound.toxicities[feature_ids[j].to_s] ||= [] + compound.toxicities[feature_ids[j].to_s] << v compound.save end end compounds.duplicates.each do |compound| positions = [] compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi} - warnings << "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." + warn "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." end $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})" diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index 10fbe85..ed917eb 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -10,6 +10,8 @@ module OpenTox field :finished_at, type: Time def self.create model + $logger.debug "#{model.name}: LOO validation started" + t = Time.now model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOutValidation : klass = RegressionLeaveOneOutValidation loo = klass.new :model_id => model.id, :dataset_id => model.training_dataset_id predictions = model.predict model.training_dataset.compounds @@ -17,7 +19,7 @@ module OpenTox nr_unpredicted = 0 predictions.each do |cid,prediction| if prediction[:value] - prediction[:measured] = model.training_dataset.data_entries[cid][prediction[:prediction_feature_id].to_s] + prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s] else nr_unpredicted += 1 end @@ -28,6 +30,7 @@ module OpenTox loo.predictions = predictions loo.statistics loo.save + $logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds" loo end @@ -84,16 +87,12 @@ module OpenTox class RegressionLeaveOneOutValidation < LeaveOneOutValidation - - field :rmse, type: Float, default: 0.0 + field :rmse, type: Float, default: 0 field :mae, type: Float, default: 0 - #field :weighted_rmse, type: Float, default: 0 - #field :weighted_mae, type: Float, default: 0 field :r_squared, type: Float field :correlation_plot_id, type: BSON::ObjectId field :confidence_plot_id, type: BSON::ObjectId - def statistics stat = ValidationStatistics.regression predictions update_attributes(stat) diff --git a/lib/model.rb b/lib/model.rb index 1960c10..b82f098 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -20,6 +20,10 @@ module OpenTox def training_dataset Dataset.find(training_dataset_id) end + + def prediction_feature + Feature.find(prediction_feature_id) + end end class Lazar < Model @@ -31,13 +35,10 @@ module OpenTox # Create a lazar model from a training_dataset and a feature_dataset # @param [OpenTox::Dataset] training_dataset # @return [OpenTox::Model::Lazar] Regression or classification model - def initialize training_dataset, params={} + def initialize prediction_feature, training_dataset, params={} super params - # TODO document convention - #p training_dataset.features - prediction_feature = training_dataset.features.first # set defaults for empty parameters self.prediction_feature_id ||= prediction_feature.id self.training_dataset_id ||= training_dataset.id @@ -49,7 +50,6 @@ module OpenTox end def predict_compound compound - prediction_feature = Feature.find prediction_feature_id neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) # remove neighbors without prediction_feature # check for database activities (neighbors may include query compound) @@ -122,18 +122,13 @@ module OpenTox end end - - def training_activities - i = training_dataset.feature_ids.index prediction_feature_id - training_dataset.data_entries.collect{|de| de[i]} - end end class LazarClassification < Lazar - def self.create training_dataset, params={} - model = self.new training_dataset, params + def self.create prediction_feature, training_dataset, params={} + model = self.new prediction_feature, training_dataset, params model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm model.neighbor_algorithm ||= "fingerprint_neighbors" model.neighbor_algorithm_parameters ||= {} @@ -151,8 +146,8 @@ module OpenTox class LazarRegression < Lazar - def self.create training_dataset, params={} - model = self.new training_dataset, params + def self.create prediction_feature, training_dataset, params={} + model = self.new prediction_feature, training_dataset, params model.neighbor_algorithm ||= "fingerprint_neighbors" model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression" model.neighbor_algorithm_parameters ||= {} @@ -173,13 +168,13 @@ module OpenTox include Mongoid::Document include Mongoid::Timestamps - # TODO field Validations field :endpoint, type: String field :species, type: String field :source, type: String field :unit, type: String field :model_id, type: BSON::ObjectId field :repeated_crossvalidation_id, type: BSON::ObjectId + field :leave_one_out_validation_id, type: BSON::ObjectId def predict object Lazar.find(model_id).predict object @@ -201,12 +196,16 @@ module OpenTox repeated_crossvalidation.crossvalidations end + def leave_one_out_validation + LeaveOneOutValidation.find leave_one_out_validation_id + end + def regression? - training_dataset.features.first.numeric? + model.is_a? LazarRegression end def classification? - training_dataset.features.first.nominal? + model.is_a? LazarClassification end def self.from_csv_file file @@ -214,14 +213,17 @@ module OpenTox bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file prediction_model = self.new JSON.parse(File.read(metadata_file)) training_dataset = Dataset.from_csv_file file + prediction_feature = training_dataset.features.first model = nil - if training_dataset.features.first.nominal? - model = LazarClassification.create training_dataset - elsif training_dataset.features.first.numeric? - model = LazarRegression.create training_dataset + if prediction_feature.nominal? + model = LazarClassification.create prediction_feature, training_dataset + elsif prediction_feature.numeric? + model = LazarRegression.create prediction_feature, training_dataset end prediction_model[:model_id] = model.id + prediction_model[:prediction_feature_id] = prediction_feature.id prediction_model[:repeated_crossvalidation_id] = RepeatedCrossValidation.create(model).id + prediction_model[:leave_one_out_validation_id] = LeaveOneOutValidation.create(model).id prediction_model.save prediction_model end diff --git a/lib/validation.rb b/lib/validation.rb index 484e22e..6b515e4 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -27,14 +27,14 @@ module OpenTox atts = model.attributes.dup # do not modify attributes from original model atts["_id"] = BSON::ObjectId.new atts[:training_dataset_id] = training_set.id - validation_model = model.class.create training_set, atts + validation_model = model.class.create model.prediction_feature, training_set, atts validation_model.save predictions = validation_model.predict test_set.compounds predictions.each{|cid,p| p.delete(:neighbors)} nr_unpredicted = 0 predictions.each do |cid,prediction| if prediction[:value] - prediction[:measured] = test_set.data_entries[cid][prediction[:prediction_feature_id].to_s] + prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s] else nr_unpredicted += 1 end @@ -42,7 +42,6 @@ module OpenTox end validation = self.new( :model_id => validation_model.id, - #:prediction_dataset_id => prediction_dataset.id, :test_dataset_id => test_set.id, :nr_instances => test_set.compounds.size, :nr_unpredicted => nr_unpredicted, -- cgit v1.2.3 From 4662e845c12e3e623ec9bec208c42cd4b1886047 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 15 Apr 2016 14:58:17 +0200 Subject: enm study import --- lib/dataset.rb | 11 +++++------ lib/feature.rb | 10 ++-------- lib/import.rb | 53 +++++++++++++++++++---------------------------------- lib/nanoparticle.rb | 42 +++++++++++++++++++++++++----------------- 4 files changed, 51 insertions(+), 65 deletions(-) (limited to 'lib') diff --git a/lib/dataset.rb b/lib/dataset.rb index fdf1bfc..b51d74b 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -88,22 +88,21 @@ module OpenTox # @return [String] def to_csv(inchi=false) CSV.generate() do |csv| - compound = Substance.find(data_entries.first.first).is_a? Compound + compound = Substance.find(substance_ids.first).is_a? Compound if compound csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} else csv << ["Name"] + features.collect{|f| f.name} end - data_entries.each do |sid,f| - substance = Substance.find sid - features.each do |feature| - f[feature.id.to_s].each do |v| + substances.each do |substance| + features.each do |f| + substance.toxicities[f.id.to_s].each do |v| if compound csv << [inchi ? substance.inchi : substance.smiles , v] else csv << [substance.name , v] end - end if f[feature.id.to_s] + end if substance.toxicities[f.id.to_s] end end end diff --git a/lib/feature.rb b/lib/feature.rb index f13a3fb..c6fb68a 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -6,7 +6,9 @@ module OpenTox field :numeric, type: Boolean field :measured, type: Boolean field :calculated, type: Boolean + field :category, type: String field :unit, type: String + field :conditions, type: Hash end # Feature for categorical variables @@ -35,12 +37,4 @@ module OpenTox end end - # Feature for categorical bioassay results - class NominalBioAssay < NominalFeature - end - - # Feature for quantitative bioassay results - class NumericBioAssay < NumericFeature - end - end diff --git a/lib/import.rb b/lib/import.rb index cf0855e..9091207 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -19,43 +19,28 @@ module OpenTox :name => np["values"]["https://data.enanomapper.net/identifier/name"], :source => np["compound"]["URI"], ) - dataset.data_entries[nanoparticle.id.to_s] ||= {} - nanoparticle.bundles << uri - nanoparticle.dataset_ids << dataset.id - np["composition"].each do |comp| - case comp["relation"] - when "HAS_CORE" - nanoparticle.core = comp["component"]["compound"]["URI"] - when "HAS_COATING" - nanoparticle.coating << comp["component"]["compound"]["URI"] - end - end if np["composition"] - np["values"].each do |u,v| - if u.match(/property/) - name, unit, source = nil - features.each do |uri,feat| - if u.match(/#{uri}/) - name = feat["title"] - unit = feat["units"] - source = uri - end - end - feature = Feature.find_or_create_by( - :name => name, - :unit => unit, - :source => source + dataset.substance_ids << nanoparticle.id + dataset.substance_ids.uniq! + studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study")))["study"] + studies.each do |study| + study["effects"].each do |effect| + effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature + # TODO parse core/coating + # TODO parse proteomics, they come as a large textValue + $logger.debug File.join(np["compound"]["URI"],"study") + effect["conditions"].delete_if { |k, v| v.nil? } + feature = klass.find_or_create_by( + :source => File.join(np["compound"]["URI"],"study"), + :name => "#{study["protocol"]["category"]["title"]} #{study["protocol"]["endpoint"]}", + :unit => effect["result"]["unit"], + :category => study["protocol"]["topcategory"], + :conditions => effect["conditions"] ) + nanoparticle.parse_ambit_value feature, effect["result"] + dataset.feature_ids << feature.id + dataset.feature_ids.uniq! end - v.each{|value| nanoparticle.parse_ambit_value feature, value} if v.is_a? Array - end - nanoparticle.bundles.uniq! - nanoparticle.physchem_descriptors.each{|f,v| v.uniq!} - #nanoparticle.toxicities.each{|f,v| v.uniq!} - nanoparticle.toxicities.each do |f,v| - dataset.data_entries[nanoparticle.id.to_s][f.to_s] ||= [] - dataset.data_entries[nanoparticle.id.to_s][f.to_s] += v end - nanoparticle.save end dataset.save datasets << dataset diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 0350363..295b6c0 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -12,43 +12,51 @@ module OpenTox end def add_feature feature, value - if feature.source.match /property\/P-CHEM/ + case feature.category + when "P-CHEM" physchem_descriptors[feature.id.to_s] ||= [] physchem_descriptors[feature.id.to_s] << value - elsif feature.source.match /property\/TOX/ + when "TOX" toxicities[feature.id.to_s] ||= [] toxicities[feature.id.to_s] << value else - warn "Unknown feature type '#{feature.source}'. Value '#{value}' not inserted." + warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted." end + save end def parse_ambit_value feature, v - # TODO: units, mmol/log10 conversion - if v.keys == ["loValue"] - #if v["loValue"].numeric? - add_feature feature, v["loValue"] - #else - #warn "'#{v["loValue"]}' is not a numeric value, entry ignored." - #end + v.delete "unit" + # TODO: mmol/log10 conversion + if v.keys == ["textValue"] + add_feature feature, v["textValue"] + elsif v.keys == ["loValue"] + add_feature feature, v["loValue"] + elsif v.keys.size == 2 and v["errorValue"] + add_feature feature, v["loValue"] + warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." elsif v.keys.size == 2 and v["loQualifier"] == "mean" - #add_feature feature, {:mean => v["loValue"]} add_feature feature, v["loValue"] warn "'#{feature.name}' is a mean value. Original data is not available." elsif v.keys.size == 2 and v["loQualifier"] #== ">=" - #add_feature feature, {:min => v["loValue"],:max => Float::INFINITY} warn "Only min value available for '#{feature.name}', entry ignored" elsif v.keys.size == 2 and v["upQualifier"] #== ">=" - #add_feature feature, {:max => v["upValue"],:min => -Float::INFINITY} warn "Only max value available for '#{feature.name}', entry ignored" - elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] - #add_feature feature, {:min => v["loValue"],:max => v["upValue"]} + elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil? + add_feature feature, v["loValue"] + warn "loQualifier and upQualifier are empty." + elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"] == "" and v["upQualifier"] == "" + add_feature feature, v["loValue"] + warn "loQualifier and upQualifier are empty." + elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] and v["loValue"] and v["upValue"] add_feature feature, [v["loValue"],v["upValue"]].mean warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available." + elsif v.size == 4 and v["loQualifier"] == "mean" and v["errorValue"] + warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." + add_feature feature, v["loValue"] elsif v == {} # do nothing else - $logger.warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'." - warnings << "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'." + warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'." end end -- cgit v1.2.3 From 75b70425ae8699464a18529eb7bf35a216c06243 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 21 Apr 2016 09:56:12 +0200 Subject: AMBIT import expanded --- lib/classification.rb | 1 + lib/nanoparticle.rb | 3 +++ 2 files changed, 4 insertions(+) (limited to 'lib') diff --git a/lib/classification.rb b/lib/classification.rb index 4a17546..0de8726 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -8,6 +8,7 @@ module OpenTox weighted_sum = {} sim_sum = 0.0 confidence = 0.0 + # see ~/src/pubchem-read-across/application.rb:353 neighbors.each do |row| sim = row["tanimoto"] row["toxicities"][params[:prediction_feature_id].to_s].each do |act| diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 295b6c0..b934bb3 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -48,6 +48,9 @@ module OpenTox elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"] == "" and v["upQualifier"] == "" add_feature feature, v["loValue"] warn "loQualifier and upQualifier are empty." + elsif v.keys.size == 4 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil? + add_feature feature, v["loValue"] + warn "loQualifier and upQualifier are empty." elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] and v["loValue"] and v["upValue"] add_feature feature, [v["loValue"],v["upValue"]].mean warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available." -- cgit v1.2.3 From 4ebd80fee52c04bd36781f846eae60019918345d Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 21 Apr 2016 14:29:23 +0200 Subject: initial classification probabilities --- lib/classification.rb | 38 +++++++++++++++++++------------------- lib/crossvalidation.rb | 2 +- lib/leave-one-out-validation.rb | 22 +++++++++++----------- 3 files changed, 31 insertions(+), 31 deletions(-) (limited to 'lib') diff --git a/lib/classification.rb b/lib/classification.rb index 0202940..b9b66f0 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -5,28 +5,28 @@ module OpenTox def self.weighted_majority_vote compound, params neighbors = params[:neighbors] - weighted_sum = {} - sim_sum = 0.0 - confidence = 0.0 - neighbors.each do |row| - sim = row["tanimoto"] - row["features"][params[:prediction_feature_id].to_s].each do |act| - weighted_sum[act] ||= 0 - weighted_sum[act] += sim + feature_id = params[:prediction_feature_id].to_s + sims = {} + neighbors.each do |n| + sim = n["tanimoto"] + n["features"][feature_id].each do |act| + sims[act] ||= [] + sims[act] << sim + #sims[act] << 0.5*sim+0.5 # scale to 1-0.5 end end - case weighted_sum.size - when 1 - return {:value => weighted_sum.keys.first, :confidence => weighted_sum.values.first/neighbors.size.abs} - when 2 - sim_sum = weighted_sum[weighted_sum.keys[0]] - sim_sum -= weighted_sum[weighted_sum.keys[1]] - sim_sum > 0 ? prediction = weighted_sum.keys[0] : prediction = weighted_sum.keys[1] - confidence = (sim_sum/neighbors.size).abs - return {:value => prediction,:confidence => confidence} - else - bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted.sum.keys}'" + sim_all = sims.collect{|a,s| s}.flatten + sim_sum = sim_all.sum + sim_max = sim_all.max + probabilities = {} + sims.each do |a,s| + probabilities[a] = s.sum/sim_sum end + probabilities = probabilities.collect{|a,p| [a,sim_max*p]}.to_h + p_max = probabilities.collect{|a,p| p}.max + prediction = probabilities.key(p_max) + {:value => prediction,:probabilities => probabilities} + end end end diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 15dfb21..6ffeb25 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -52,7 +52,7 @@ module OpenTox cv.update_attributes( nr_instances: nr_instances, nr_unpredicted: nr_unpredicted, - predictions: predictions#.sort{|a,b| b[3] <=> a[3]} # sort according to confidence + predictions: predictions ) $logger.debug "Nr unpredicted: #{nr_unpredicted}" cv.statistics diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index 2cd13db..0a131a4 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -51,18 +51,18 @@ module OpenTox if pred[:value] == db_act if pred[:value] == accept_values[0] confusion_matrix[0][0] += 1 - weighted_confusion_matrix[0][0] += pred[:confidence] + #weighted_confusion_matrix[0][0] += pred[:confidence] elsif pred[:value] == accept_values[1] confusion_matrix[1][1] += 1 - weighted_confusion_matrix[1][1] += pred[:confidence] + #weighted_confusion_matrix[1][1] += pred[:confidence] end else if pred[:value] == accept_values[0] confusion_matrix[0][1] += 1 - weighted_confusion_matrix[0][1] += pred[:confidence] + #weighted_confusion_matrix[0][1] += pred[:confidence] elsif pred[:value] == accept_values[1] confusion_matrix[1][0] += 1 - weighted_confusion_matrix[1][0] += pred[:confidence] + #weighted_confusion_matrix[1][0] += pred[:confidence] end end end @@ -73,17 +73,17 @@ module OpenTox predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f end confidence_sum = 0 - weighted_confusion_matrix.each do |r| - r.each do |c| - confidence_sum += c - end - end +# weighted_confusion_matrix.each do |r| +# r.each do |c| +# confidence_sum += c +# end +# end update_attributes( accept_values: accept_values, confusion_matrix: confusion_matrix, - weighted_confusion_matrix: weighted_confusion_matrix, +# weighted_confusion_matrix: weighted_confusion_matrix, accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f, - weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f, +# weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f, true_rate: true_rate, predictivity: predictivity, finished_at: Time.now -- cgit v1.2.3 From cfc64a2966ab38698e499f0b44f41208ee77a07f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 26 Apr 2016 17:38:15 +0200 Subject: first nanomaterial prediction --- lib/import.rb | 18 +++++++++- lib/model.rb | 1 + lib/nanoparticle.rb | 2 ++ lib/overwrite.rb | 9 +++++ lib/regression.rb | 99 +++++++++++++++++++++++++++++++++++++---------------- 5 files changed, 98 insertions(+), 31 deletions(-) (limited to 'lib') diff --git a/lib/import.rb b/lib/import.rb index 9091207..3c1edfe 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -30,7 +30,7 @@ module OpenTox $logger.debug File.join(np["compound"]["URI"],"study") effect["conditions"].delete_if { |k, v| v.nil? } feature = klass.find_or_create_by( - :source => File.join(np["compound"]["URI"],"study"), + #:source => File.join(np["compound"]["URI"],"study"), :name => "#{study["protocol"]["category"]["title"]} #{study["protocol"]["endpoint"]}", :unit => effect["result"]["unit"], :category => study["protocol"]["topcategory"], @@ -48,6 +48,22 @@ module OpenTox datasets.collect{|d| d.id} end +=begin + def self.import_ld # defunct, AMBIT JSON_LD does not have substance entries + #get list of bundle URIs + bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] + datasets = [] + bundles.each do |bundle| + uri = bundle["URI"] + study = JSON.parse(`curl -H 'Accept:application/ld+json' '#{uri}/substance'`) + study["@graph"].each do |i| + puts i.to_yaml if i.keys.include? "sio:has-value" + end + end + datasets.collect{|d| d.id} + end +=end + def self.dump #get list of bundle URIs `wget 'https://data.enanomapper.net/bundle?media=application%2Fjson' -O bundles.json` diff --git a/lib/model.rb b/lib/model.rb index b82f098..45054e2 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -50,6 +50,7 @@ module OpenTox end def predict_compound compound + #p compound neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) # remove neighbors without prediction_feature # check for database activities (neighbors may include query compound) diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index b934bb3..b5de5b9 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -16,9 +16,11 @@ module OpenTox when "P-CHEM" physchem_descriptors[feature.id.to_s] ||= [] physchem_descriptors[feature.id.to_s] << value + physchem_descriptors[feature.id.to_s].uniq! when "TOX" toxicities[feature.id.to_s] ||= [] toxicities[feature.id.to_s] << value + toxicities[feature.id.to_s].uniq! else warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted." end diff --git a/lib/overwrite.rb b/lib/overwrite.rb index cef5758..4a79051 100644 --- a/lib/overwrite.rb +++ b/lib/overwrite.rb @@ -114,6 +114,15 @@ class Array Math.sqrt(self.sample_variance) end + def for_R + if self.first.is_a?(String) + #"\"#{self.collect{|v| v.sub('[','').sub(']','')}.join(" ")}\"" # quote and remove square brackets + "NA" + else + self.median + end + end + end module URI diff --git a/lib/regression.rb b/lib/regression.rb index cb17f25..5610a77 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -75,46 +75,62 @@ module OpenTox end - def self.local_physchem_regression compound, params, method="plsr"#, method_params="ncomp = 4" + def self.local_physchem_regression compound, params, method="pls"#, method_params="ncomp = 4" + + neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s]} # use only neighbors with measured activities - neighbors = params[:neighbors] return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 return {:value => neighbors.first["toxicities"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 activities = [] weights = [] - physchem = {} + pc_ids = neighbors.collect{|n| n.physchem_descriptors.keys}.flatten.uniq + data_frame = [] + data_frame[0] = [] neighbors.each_with_index do |n,i| - if n["toxicities"][params[:prediction_feature_id].to_s] - n["toxicities"][params[:prediction_feature_id].to_s].each do |act| - # TODO fix!!!! - activities << -Math.log10(act) - #if act.numeric? - #activities << act - n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? - neighbor = Substance.find(n["_id"]) - neighbor.physchem_descriptors.each do |pid,v| # insert physchem only if there is an activity - physchem[pid] ||= [] - physchem[pid] += v - end + neighbor = Substance.find(n["_id"]) + n["toxicities"][params[:prediction_feature_id].to_s].each do |act| + data_frame[0][i] = act + n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? + neighbor.physchem_descriptors.each do |pid,values| + values.uniq! + warn "More than one value for #{Feature.find(pid).name}: #{values.join(', ')}" unless values.size == 1 + j = pc_ids.index(pid)+1 + data_frame[j] ||= [] + data_frame[j][i] = values.for_R end end + (0..pc_ids.size+1).each do |j| # for R: fill empty values with NA + data_frame[j] ||= [] + data_frame[j][i] ||= "NA" + end end - - # remove properties with a single value - physchem.each do |pid,v| - physchem.delete(pid) if v.uniq.size <= 1 + remove_idx = [] + data_frame.each_with_index do |r,i| + remove_idx << i if r.uniq.size == 1 # remove properties with a single value + end + remove_idx.reverse.each do |i| + data_frame.delete_at i + pc_ids.delete_at i end - if physchem.empty? + if pc_ids.empty? result = local_weighted_average(compound, params) result[:warning] = "No variables for regression model. Using weighted average of similar compounds." return result - else - data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid].collect{|v| "\"#{v.sub('[','').sub(']','')}\"" if v.is_a? String }} - prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem_descriptors[pid]} + query_descriptors = pc_ids.collect{|i| compound.physchem_descriptors[i].for_R} + remove_idx = [] + query_descriptors.each_with_index do |v,i| + remove_idx << i if v == "NA" + end + remove_idx.reverse.each do |i| + data_frame.delete_at i + pc_ids.delete_at i + query_descriptors.delete_at i + end + prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors if prediction.nil? prediction = local_weighted_average(compound, params) prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." @@ -130,16 +146,39 @@ module OpenTox def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values R.assign "weights", training_weights r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" - #p r_data_frame - File.open("tmp.R","w+"){|f| f.puts "data <- #{r_data_frame}\n"} +rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) + File.open("tmp.R","w+"){|f| + f.puts "suppressPackageStartupMessages({ + library(iterators,lib=\"#{rlib}\") + library(foreach,lib=\"#{rlib}\") + library(ggplot2,lib=\"#{rlib}\") + library(grid,lib=\"#{rlib}\") + library(gridExtra,lib=\"#{rlib}\") + library(pls,lib=\"#{rlib}\") + library(caret,lib=\"#{rlib}\") + library(doMC,lib=\"#{rlib}\") + registerDoMC(#{NR_CORES}) +})" + + f.puts "data <- #{r_data_frame}\n" + f.puts "weights <- c(#{training_weights.join(', ')})" + f.puts "features <- c(#{training_features.join(', ')})" + f.puts "names(data) <- append(c('activities'),features)" # + f.puts "model <- train(activities ~ ., data = data, method = '#{method}')" + f.puts "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" + f.puts "names(fingerprint) <- features" + f.puts "prediction <- predict(model,fingerprint)" + } + R.eval "data <- #{r_data_frame}" R.assign "features", training_features R.eval "names(data) <- append(c('activities'),features)" # - begin - R.eval "model <- train(activities ~ ., data = data, method = '#{method}')" - rescue - return nil - end + #begin + R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)" + #rescue + #return nil + #end + p query_feature_values R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" R.eval "names(fingerprint) <- features" R.eval "prediction <- predict(model,fingerprint)" -- cgit v1.2.3 From 32d767ee7cfcc19337892551906950621f348174 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 28 Apr 2016 08:11:12 +0200 Subject: nanoparticle crossvalidation technically working --- lib/crossvalidation.rb | 2 +- lib/regression.rb | 14 +++++++------- lib/validation.rb | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'lib') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 50afb6f..0ae36c4 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -44,7 +44,7 @@ module OpenTox $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" #end end - Process.waitall + #Process.waitall cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id) cv.validations.each do |validation| nr_instances += validation.nr_instances diff --git a/lib/regression.rb b/lib/regression.rb index 5610a77..3a59c14 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -9,6 +9,7 @@ module OpenTox neighbors = params[:neighbors] neighbors.each do |row| sim = row["tanimoto"] + sim ||= 1 # TODO: sim f nanoparticles if row["toxicities"][params[:prediction_feature_id].to_s] row["toxicities"][params[:prediction_feature_id].to_s].each do |act| weighted_sum += sim*Math.log10(act) @@ -120,7 +121,7 @@ module OpenTox result[:warning] = "No variables for regression model. Using weighted average of similar compounds." return result else - query_descriptors = pc_ids.collect{|i| compound.physchem_descriptors[i].for_R} + query_descriptors = pc_ids.collect{|i| compound.physchem_descriptors[i].for_R if compound.physchem_descriptors[i]}.compact remove_idx = [] query_descriptors.each_with_index do |v,i| remove_idx << i if v == "NA" @@ -172,13 +173,9 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) R.eval "data <- #{r_data_frame}" R.assign "features", training_features - R.eval "names(data) <- append(c('activities'),features)" # - #begin + begin + R.eval "names(data) <- append(c('activities'),features)" # R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)" - #rescue - #return nil - #end - p query_feature_values R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" R.eval "names(fingerprint) <- features" R.eval "prediction <- predict(model,fingerprint)" @@ -187,6 +184,9 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f, :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f, } + rescue + return nil + end end end diff --git a/lib/validation.rb b/lib/validation.rb index 6b515e4..68cb1a1 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -24,12 +24,12 @@ module OpenTox def self.create model, training_set, test_set, crossvalidation=nil - atts = model.attributes.dup # do not modify attributes from original model + atts = model.attributes.dup # do not modify attributes of the original model atts["_id"] = BSON::ObjectId.new atts[:training_dataset_id] = training_set.id validation_model = model.class.create model.prediction_feature, training_set, atts validation_model.save - predictions = validation_model.predict test_set.compounds + predictions = validation_model.predict test_set.substances predictions.each{|cid,p| p.delete(:neighbors)} nr_unpredicted = 0 predictions.each do |cid,prediction| @@ -43,7 +43,7 @@ module OpenTox validation = self.new( :model_id => validation_model.id, :test_dataset_id => test_set.id, - :nr_instances => test_set.compounds.size, + :nr_instances => test_set.substances.size, :nr_unpredicted => nr_unpredicted, :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence ) -- cgit v1.2.3 From acf19c81e345ceccde834653a0f0edce27827958 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 28 Apr 2016 11:05:05 +0200 Subject: compound classification fixed --- lib/compound.rb | 2 +- lib/model.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index 049d77b..c2ce5d0 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -335,7 +335,7 @@ module OpenTox {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}}, {'$sort' => {'tanimoto' => -1}} ] - + $mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]} end diff --git a/lib/model.rb b/lib/model.rb index 45054e2..80b4685 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -64,7 +64,7 @@ module OpenTox prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." neighbors.delete_if{|n| n["_id"] == compound.id} end - neighbors.delete_if{|n| n['toxicities'].empty? or n['toxicities'][prediction_feature.id.to_s] == [nil] } + #neighbors.delete_if{|n| n['toxicities'].empty? or n['toxicities'][prediction_feature.id.to_s] == [nil] } if neighbors.empty? prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []}) else -- cgit v1.2.3 From 79238bddb59607aa9f759caa9e3c8db176709703 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 28 Apr 2016 12:19:48 +0200 Subject: compound validations fixed --- lib/model.rb | 1 - lib/nanoparticle.rb | 2 +- lib/regression.rb | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/model.rb b/lib/model.rb index 80b4685..f61368e 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -50,7 +50,6 @@ module OpenTox end def predict_compound compound - #p compound neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) # remove neighbors without prediction_feature # check for database activities (neighbors may include query compound) diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index b5de5b9..83b97a9 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -8,7 +8,7 @@ module OpenTox field :bundles, type: Array, default: [] def nanoparticle_neighbors params - Dataset.find(params[:training_dataset_id]).nanoparticles + Dataset.find(params[:training_dataset_id]).nanoparticles.collect{|np| {"_id" => np.id, "tanimoto" => 1}} end def add_feature feature, value diff --git a/lib/regression.rb b/lib/regression.rb index 3a59c14..694a2dc 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -85,7 +85,7 @@ module OpenTox activities = [] weights = [] - pc_ids = neighbors.collect{|n| n.physchem_descriptors.keys}.flatten.uniq + pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq data_frame = [] data_frame[0] = [] -- cgit v1.2.3 From 05386e748270c337c66f6f379317ea4b25905236 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 4 May 2016 19:24:42 +0200 Subject: first reasonable results for nanoparticle crossvalidation --- lib/crossvalidation.rb | 4 +- lib/model.rb | 101 +++++++++++++++++++------------------------ lib/nanoparticle.rb | 18 ++++++-- lib/regression.rb | 38 ++++++++-------- lib/validation-statistics.rb | 7 ++- 5 files changed, 84 insertions(+), 84 deletions(-) (limited to 'lib') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 0ae36c4..e1f956b 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -141,7 +141,7 @@ module OpenTox :measured => p[1], :predicted => p[2], #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs, - :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs, + :error => (p[1]-p[2]).abs, :relative_error => (p[1]-p[2]).abs/p[1], :confidence => p[3], :neighbors => neighbors @@ -152,7 +152,7 @@ module OpenTox def confidence_plot tmpfile = "/tmp/#{id.to_s}_confidence.png" - sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact + sorted_predictions = predictions.collect{|p| [(p[1]-p[2]).abs,p[3]] if p[1] and p[2]}.compact R.assign "error", sorted_predictions.collect{|p| p[0]} R.assign "confidence", sorted_predictions.collect{|p| p[1]} # TODO fix axis names diff --git a/lib/model.rb b/lib/model.rb index f61368e..841ab20 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -2,7 +2,7 @@ module OpenTox module Model - class Model + class Lazar include OpenTox include Mongoid::Document include Mongoid::Timestamps @@ -10,27 +10,13 @@ module OpenTox field :name, type: String field :creator, type: String, default: __FILE__ - # datasets field :training_dataset_id, type: BSON::ObjectId - # algorithms field :prediction_algorithm, type: String - # prediction feature field :prediction_feature_id, type: BSON::ObjectId - - def training_dataset - Dataset.find(training_dataset_id) - end - - def prediction_feature - Feature.find(prediction_feature_id) - end - end - - class Lazar < Model - - # algorithms field :neighbor_algorithm, type: String field :neighbor_algorithm_parameters, type: Hash, default: {} + field :feature_selection_algorithm, type: String + field :relevant_features, type: Hash # Create a lazar model from a training_dataset and a feature_dataset # @param [OpenTox::Dataset] training_dataset @@ -45,10 +31,43 @@ module OpenTox self.name ||= "#{training_dataset.name} #{prediction_feature.name}" self.neighbor_algorithm_parameters ||= {} self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id + + Algorithm.run(feature_selection_algorithm, self) if feature_selection_algorithm save self end + def correlation_filter + toxicities = [] + substances = [] + training_dataset.substances.each do |s| + s["toxicities"][prediction_feature_id].each do |act| + toxicities << act + substances << s + end + end + R.assign "tox", toxicities + feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq + feature_ids.each do |feature_id| + feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id]} + R.assign "feature", feature_values + begin + #R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')" + R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='complete')" + pvalue = R.eval("cor$p.value").to_ruby + if pvalue <= 0.05 + r = R.eval("cor$estimate").to_ruby + relevant_features[feature] = {} + relevant_features[feature]["pvalue"] = pvalue + relevant_features[feature]["r"] = r + end + rescue + warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{toxicities}) failed." + end + end + relevant_features.sort!{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h + end + def predict_compound compound neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) # remove neighbors without prediction_feature @@ -63,7 +82,6 @@ module OpenTox prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." neighbors.delete_if{|n| n["_id"] == compound.id} end - #neighbors.delete_if{|n| n['toxicities'].empty? or n['toxicities'][prediction_feature.id.to_s] == [nil] } if neighbors.empty? prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []}) else @@ -123,6 +141,14 @@ module OpenTox end + def training_dataset + Dataset.find(training_dataset_id) + end + + def prediction_feature + Feature.find(prediction_feature_id) + end + end class LazarClassification < Lazar @@ -229,45 +255,6 @@ module OpenTox end end - class NanoLazar - include OpenTox - include Mongoid::Document - include Mongoid::Timestamps - store_in collection: "models" - - field :name, type: String - field :creator, type: String, default: __FILE__ - # datasets - field :training_dataset_id, type: BSON::ObjectId - # algorithms - field :prediction_algorithm, type: String - # prediction feature - field :prediction_feature_id, type: BSON::ObjectId - field :training_particle_ids, type: Array - - def self.create_all - nanoparticles = Nanoparticle.all - toxfeatures = Nanoparticle.all.collect{|np| np.toxicities.keys}.flatten.uniq.collect{|id| Feature.find id} - tox = {} - toxfeatures.each do |t| - tox[t] = nanoparticles.select{|np| np.toxicities.keys.include? t.id.to_s} - end - tox.select!{|t,nps| nps.size > 50} - tox.collect do |t,nps| - find_or_create_by(:prediction_feature_id => t.id, :training_particle_ids => nps.collect{|np| np.id}) - end - end - - def predict nanoparticle - training = training_particle_ids.collect{|id| Nanoparticle.find id} - training_features = training.collect{|t| t.physchem_descriptors.keys}.flatten.uniq - query_features = nanoparticle.physchem_descriptors.keys - common_features = (training_features & query_features) - #p common_features - end - - end - end end diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 83b97a9..dda4a9f 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -8,7 +8,7 @@ module OpenTox field :bundles, type: Array, default: [] def nanoparticle_neighbors params - Dataset.find(params[:training_dataset_id]).nanoparticles.collect{|np| {"_id" => np.id, "tanimoto" => 1}} + Dataset.find(params[:training_dataset_id]).nanoparticles.collect{|np| np["tanimoto"] = 1; np} end def add_feature feature, value @@ -19,7 +19,19 @@ module OpenTox physchem_descriptors[feature.id.to_s].uniq! when "TOX" toxicities[feature.id.to_s] ||= [] - toxicities[feature.id.to_s] << value + # TODO generic way of parsing TOX values + if feature.name == "7.99 Toxicity (other) ICP-AES" and feature.unit == "mL/ug(Mg)" + toxicities[feature.id.to_s] << -Math.log10(value) + #if value.numeric? + #begin + #rescue + #p feature + #p value + #exit + #end + else + toxicities[feature.id.to_s] << value + end toxicities[feature.id.to_s].uniq! else warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted." @@ -29,7 +41,7 @@ module OpenTox def parse_ambit_value feature, v v.delete "unit" - # TODO: mmol/log10 conversion + # TODO: ppm instead of weights if v.keys == ["textValue"] add_feature feature, v["textValue"] elsif v.keys == ["loValue"] diff --git a/lib/regression.rb b/lib/regression.rb index 694a2dc..d2c4e91 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -12,16 +12,15 @@ module OpenTox sim ||= 1 # TODO: sim f nanoparticles if row["toxicities"][params[:prediction_feature_id].to_s] row["toxicities"][params[:prediction_feature_id].to_s].each do |act| - weighted_sum += sim*Math.log10(act) + weighted_sum += sim*act sim_sum += sim end end end - sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) + sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum {:value => prediction} end - # TODO explicit neighbors, also for physchem def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05" neighbors = params[:neighbors] return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 @@ -35,7 +34,7 @@ module OpenTox fingerprint = neighbor.fingerprint if row["toxicities"][params[:prediction_feature_id].to_s] row["toxicities"][params[:prediction_feature_id].to_s].each do |act| - activities << Math.log10(act) + activities << act weights << row["tanimoto"] fingerprint_ids.each_with_index do |id,j| fingerprints[id] ||= [] @@ -67,9 +66,9 @@ module OpenTox prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." return prediction else - prediction[:prediction_interval] = [10**(prediction[:value]-1.96*prediction[:rmse]), 10**(prediction[:value]+1.96*prediction[:rmse])] - prediction[:value] = 10**prediction[:value] - prediction[:rmse] = 10**prediction[:rmse] + prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]] + prediction[:value] = prediction[:value] + prediction[:rmse] = prediction[:rmse] prediction end end @@ -96,7 +95,7 @@ module OpenTox n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? neighbor.physchem_descriptors.each do |pid,values| values.uniq! - warn "More than one value for #{Feature.find(pid).name}: #{values.join(', ')}" unless values.size == 1 + warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1 j = pc_ids.index(pid)+1 data_frame[j] ||= [] data_frame[j][i] = values.for_R @@ -121,7 +120,9 @@ module OpenTox result[:warning] = "No variables for regression model. Using weighted average of similar compounds." return result else - query_descriptors = pc_ids.collect{|i| compound.physchem_descriptors[i].for_R if compound.physchem_descriptors[i]}.compact + query_descriptors = pc_ids.collect do |i| + compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA" + end remove_idx = [] query_descriptors.each_with_index do |v,i| remove_idx << i if v == "NA" @@ -137,7 +138,6 @@ module OpenTox prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." return prediction else - prediction[:value] = 10**prediction[:value] prediction end end @@ -148,6 +148,7 @@ module OpenTox R.assign "weights", training_weights r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) +=begin File.open("tmp.R","w+"){|f| f.puts "suppressPackageStartupMessages({ library(iterators,lib=\"#{rlib}\") @@ -170,20 +171,21 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) f.puts "names(fingerprint) <- features" f.puts "prediction <- predict(model,fingerprint)" } +=end R.eval "data <- #{r_data_frame}" R.assign "features", training_features begin R.eval "names(data) <- append(c('activities'),features)" # R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)" - R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" - R.eval "names(fingerprint) <- features" - R.eval "prediction <- predict(model,fingerprint)" - { - :value => R.eval("prediction").to_f, - :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f, - :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f, - } + R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" + R.eval "names(fingerprint) <- features" + R.eval "prediction <- predict(model,fingerprint)" + { + :value => R.eval("prediction").to_f, + :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f, + :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f, + } rescue return nil end diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index c6b2a07..b7c95f6 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -63,16 +63,15 @@ module OpenTox end def self.regression predictions - # TODO: prediction intervals rmse = 0 mae = 0 x = [] y = [] predictions.each do |cid,pred| if pred[:value] and pred[:measured] #and pred[:measured] != [nil] - x << -Math.log10(pred[:measured].median) - y << -Math.log10(pred[:value]) - error = Math.log10(pred[:value])-Math.log10(pred[:measured].median) + x << pred[:measured].median + y << pred[:value] + error = pred[:value]-pred[:measured].median rmse += error**2 mae += error.abs else -- cgit v1.2.3 From ab7b37541b4f8a762be737009631d3eefd898b4a Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 5 May 2016 16:14:02 +0200 Subject: ambit mirror, import from mirrored json, proteomics import --- lib/compound.rb | 6 ++-- lib/import.rb | 101 ++++++++++++++++++++++++++++------------------------ lib/model.rb | 4 +-- lib/nanoparticle.rb | 21 +++++------ lib/regression.rb | 6 ++-- lib/substance.rb | 2 +- 6 files changed, 72 insertions(+), 68 deletions(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index c2ce5d0..143c4f2 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -77,7 +77,7 @@ module OpenTox def physchem descriptors=PhysChem.openbabel_descriptors # TODO: speedup java descriptors - calculated_ids = physchem_descriptors.keys + calculated_ids = physchem.keys # BSON::ObjectId instances are not allowed as keys in a BSON document. new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids descs = {} @@ -90,11 +90,11 @@ module OpenTox # avoid recalculating Cdk features with multiple values descs.keys.uniq.each do |k| descs[k].send(k[0].downcase,k[1],self).each do |n,v| - physchem_descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. + physchem[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. end end save - physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} + physchem.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} end def smarts_match smarts, count=false diff --git a/lib/import.rb b/lib/import.rb index 3c1edfe..11cb367 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -5,47 +5,73 @@ module OpenTox class Enanomapper include OpenTox - def self.import + def self.mirror dir="." #get list of bundle URIs bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] + File.open(File.join(dir,"bundles.json"),"w+"){|f| f.puts JSON.pretty_generate(bundles)} datasets = [] bundles.each do |bundle| - uri = bundle["URI"] - dataset = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"]) nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"] - features = JSON.parse(RestClientWrapper.get(bundle["property"]+"?media=application%2Fjson"))["feature"] - nanoparticles.each do |np| - nanoparticle = Nanoparticle.find_or_create_by( - :name => np["values"]["https://data.enanomapper.net/identifier/name"], - :source => np["compound"]["URI"], - ) - dataset.substance_ids << nanoparticle.id - dataset.substance_ids.uniq! - studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study")))["study"] + nanoparticles.each do |nanoparticle| + uuid = nanoparticle["values"]["https://data.enanomapper.net/identifier/uuid"] + $logger.debug uuid + File.open(File.join(dir,"nanoparticle-#{uuid}.json"),"w+"){|f| f.puts JSON.pretty_generate(nanoparticle)} + studies = JSON.parse(RestClientWrapper.get(File.join(nanoparticle["compound"]["URI"],"study")))["study"] studies.each do |study| - study["effects"].each do |effect| - effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature - # TODO parse core/coating - # TODO parse proteomics, they come as a large textValue - $logger.debug File.join(np["compound"]["URI"],"study") - effect["conditions"].delete_if { |k, v| v.nil? } + File.open(File.join(dir,"study-#{uuid}.json"),"w+"){|f| f.puts JSON.pretty_generate(study)} + end + end + end + end + + def self.import dir="." + datasets = {} + JSON.parse(File.read(File.join(dir,"bundles.json"))).each do |bundle| + datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"]) + end + Dir[File.join(dir,"study*.json")].each do |s| + study = JSON.parse(File.read(s)) + np = JSON.parse(File.read(File.join(dir,"nanoparticle-#{study['owner']['substance']['uuid']}.json"))) + nanoparticle = Nanoparticle.find_or_create_by( + :name => np["values"]["https://data.enanomapper.net/identifier/name"], + :source => np["compound"]["URI"], + ) + np["bundles"].keys.each do |bundle_uri| + datasets[bundle_uri].substance_ids << nanoparticle.id + nanoparticle["dataset_ids"] << datasets[bundle_uri].id + end + study["effects"].each do |effect| + effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature + # TODO parse core/coating + # TODO parse proteomics, they come as a large textValue + #$logger.debug File.join(np["compound"]["URI"],"study") + effect["conditions"].delete_if { |k, v| v.nil? } + # parse proteomics data + if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 + JSON.parse(effect["result"]["textValue"]).each do |identifier, value| feature = klass.find_or_create_by( - #:source => File.join(np["compound"]["URI"],"study"), - :name => "#{study["protocol"]["category"]["title"]} #{study["protocol"]["endpoint"]}", - :unit => effect["result"]["unit"], - :category => study["protocol"]["topcategory"], - :conditions => effect["conditions"] + :name => identifier, + :category => "Proteomics", ) - nanoparticle.parse_ambit_value feature, effect["result"] - dataset.feature_ids << feature.id - dataset.feature_ids.uniq! + nanoparticle.parse_ambit_value feature, value end + else + feature = klass.find_or_create_by( + :name => "#{study["protocol"]["category"]["title"]} #{study["protocol"]["endpoint"]}", + :unit => effect["result"]["unit"], + :category => study["protocol"]["topcategory"], + :conditions => effect["conditions"] + ) + nanoparticle.parse_ambit_value feature, effect["result"] end end - dataset.save - datasets << dataset + nanoparticle.save + end + datasets.each do |u,d| + d.feature_ids.uniq! + d.substance_ids.uniq! + d.save end - datasets.collect{|d| d.id} end =begin @@ -64,23 +90,6 @@ module OpenTox end =end - def self.dump - #get list of bundle URIs - `wget 'https://data.enanomapper.net/bundle?media=application%2Fjson' -O bundles.json` - json = JSON.parse File.read('./bundles.json') - json["dataset"].each do |dataset| - uri = dataset["URI"] - id = uri.split("/").last - `wget --header='accept:application/json' '#{uri}' -O 'bundle#{id}'` - `wget --header='accept:application/json' '#{dataset["summary"]}' -O 'summary#{id}.json'` - `wget --header='accept:application/json' '#{dataset["compound"]}' -O 'compound#{id}.json'` - `wget --header='accept:application/json' '#{dataset["substance"]}' -O 'substance#{id}.json'` - `wget --header='accept:application/json' '#{dataset["property"]}' -O 'property#{id}.json'` - `wget --header='accept:application/json' '#{dataset["dataset"]}' -O 'dataset#{id}.json'` - `wget --header='accept:application/json' '#{dataset["matrix"]}' -O 'matrix#{id}.json'` - end - end - end end diff --git a/lib/model.rb b/lib/model.rb index 841ab20..12abc6e 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -47,9 +47,9 @@ module OpenTox end end R.assign "tox", toxicities - feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq + feature_ids = training_dataset.substances.collect{ |s| s["physchem"].keys}.flatten.uniq feature_ids.each do |feature_id| - feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id]} + feature_values = substances.collect{|s| s["physchem"][feature_id]} R.assign "feature", feature_values begin #R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')" diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index dda4a9f..c9fbb77 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -6,6 +6,7 @@ module OpenTox field :core, type: String field :coating, type: Array, default: [] field :bundles, type: Array, default: [] + field :proteomics, type: Hash, default: {} def nanoparticle_neighbors params Dataset.find(params[:training_dataset_id]).nanoparticles.collect{|np| np["tanimoto"] = 1; np} @@ -14,21 +15,18 @@ module OpenTox def add_feature feature, value case feature.category when "P-CHEM" - physchem_descriptors[feature.id.to_s] ||= [] - physchem_descriptors[feature.id.to_s] << value - physchem_descriptors[feature.id.to_s].uniq! + physchem[feature.id.to_s] ||= [] + physchem[feature.id.to_s] << value + physchem[feature.id.to_s].uniq! + when "Proteomics" + proteomics[feature.id.to_s] ||= [] + proteomics[feature.id.to_s] << value + proteomics[feature.id.to_s].uniq! when "TOX" toxicities[feature.id.to_s] ||= [] # TODO generic way of parsing TOX values if feature.name == "7.99 Toxicity (other) ICP-AES" and feature.unit == "mL/ug(Mg)" toxicities[feature.id.to_s] << -Math.log10(value) - #if value.numeric? - #begin - #rescue - #p feature - #p value - #exit - #end else toxicities[feature.id.to_s] << value end @@ -36,7 +34,6 @@ module OpenTox else warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted." end - save end def parse_ambit_value feature, v @@ -79,5 +76,3 @@ module OpenTox end end - - diff --git a/lib/regression.rb b/lib/regression.rb index d2c4e91..fe45f99 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -84,7 +84,7 @@ module OpenTox activities = [] weights = [] - pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq + pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem.keys}.flatten.uniq data_frame = [] data_frame[0] = [] @@ -93,7 +93,7 @@ module OpenTox n["toxicities"][params[:prediction_feature_id].to_s].each do |act| data_frame[0][i] = act n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? - neighbor.physchem_descriptors.each do |pid,values| + neighbor.physchem.each do |pid,values| values.uniq! warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1 j = pc_ids.index(pid)+1 @@ -121,7 +121,7 @@ module OpenTox return result else query_descriptors = pc_ids.collect do |i| - compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA" + compound.physchem[i] ? compound.physchem_descriptors[i].for_R : "NA" end remove_idx = [] query_descriptors.each_with_index do |v,i| diff --git a/lib/substance.rb b/lib/substance.rb index 82ca65d..34bc94a 100644 --- a/lib/substance.rb +++ b/lib/substance.rb @@ -1,7 +1,7 @@ module OpenTox class Substance - field :physchem_descriptors, type: Hash, default: {} + field :physchem, type: Hash, default: {} field :toxicities, type: Hash, default: {} field :dataset_ids, type: Array, default: [] end -- cgit v1.2.3 From 51f57e2858b60bed74ebcc97189b2188c900c283 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 6 May 2016 12:49:28 +0200 Subject: dataset tests cleanup --- lib/compound.rb | 7 ++++--- lib/dataset.rb | 39 +++++++++++++++++++++++---------------- lib/lazar.rb | 1 - lib/model.rb | 4 ++-- lib/nanoparticle.rb | 6 +++--- lib/regression.rb | 6 +++--- lib/substance.rb | 2 +- 7 files changed, 36 insertions(+), 29 deletions(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index 143c4f2..6cb7f78 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -22,6 +22,7 @@ module OpenTox # Overwrites standard Mongoid method to create fingerprints before database insertion def self.find_or_create_by params + #PhysChem.descriptors # load descriptor features compound = self.find_or_initialize_by params compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT).size compound.save @@ -77,7 +78,7 @@ module OpenTox def physchem descriptors=PhysChem.openbabel_descriptors # TODO: speedup java descriptors - calculated_ids = physchem.keys + calculated_ids = physchem_descriptors.keys # BSON::ObjectId instances are not allowed as keys in a BSON document. new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids descs = {} @@ -90,11 +91,11 @@ module OpenTox # avoid recalculating Cdk features with multiple values descs.keys.uniq.each do |k| descs[k].send(k[0].downcase,k[1],self).each do |n,v| - physchem[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. + physchem_descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. end end save - physchem.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} + physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} end def smarts_match smarts, count=false diff --git a/lib/dataset.rb b/lib/dataset.rb index b51d74b..9b24440 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -30,19 +30,11 @@ module OpenTox @features end - # Find data entry values for a given compound and feature - # @param compound [OpenTox::Compound] OpenTox Compound object - # @param feature [OpenTox::Feature] OpenTox Feature object - # @return [Array] Data entry values - #def values(compound, feature) - #data_entries[compound.id.to_s][feature.id.to_s] - #end - # Writers # Set compounds def compounds=(compounds) - self.substance_ids = compounds.collect{|c| c.id} + self.substance_ids = compounds.collect{|c| c.id}.uniq end # Set features @@ -95,14 +87,27 @@ module OpenTox csv << ["Name"] + features.collect{|f| f.name} end substances.each do |substance| - features.each do |f| - substance.toxicities[f.id.to_s].each do |v| - if compound - csv << [inchi ? substance.inchi : substance.smiles , v] - else - csv << [substance.name , v] + if compound + name = (inchi ? substance.inchi : substance.smiles) + else + name = substance.name + end + nr_measurements = features.collect{|f| substance.toxicities[f.id.to_s].size if substance.toxicities[f.id.to_s]}.compact.uniq + + if nr_measurements.size > 1 + warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries." + else + (0..nr_measurements.first-1).each do |i| + row = [name] + features.each do |f| + if substance.toxicities[f.id.to_s] + row << substance.toxicities[f.id.to_s][i] + else + row << "" + end end - end if substance.toxicities[f.id.to_s] + csv << row + end end end end @@ -224,6 +229,8 @@ module OpenTox compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi} warn "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." end + substance_ids.uniq! + feature_ids.uniq! $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})" time = Time.now diff --git a/lib/lazar.rb b/lib/lazar.rb index 8eb46e0..8daaaa1 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -86,4 +86,3 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","Cross "experiment.rb", "import.rb", ].each{ |f| require_relative f } -OpenTox::PhysChem.descriptors # load descriptor features diff --git a/lib/model.rb b/lib/model.rb index 12abc6e..841ab20 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -47,9 +47,9 @@ module OpenTox end end R.assign "tox", toxicities - feature_ids = training_dataset.substances.collect{ |s| s["physchem"].keys}.flatten.uniq + feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq feature_ids.each do |feature_id| - feature_values = substances.collect{|s| s["physchem"][feature_id]} + feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id]} R.assign "feature", feature_values begin #R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')" diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index c9fbb77..9bf419d 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -15,9 +15,9 @@ module OpenTox def add_feature feature, value case feature.category when "P-CHEM" - physchem[feature.id.to_s] ||= [] - physchem[feature.id.to_s] << value - physchem[feature.id.to_s].uniq! + physchem_descriptors[feature.id.to_s] ||= [] + physchem_descriptors[feature.id.to_s] << value + physchem_descriptors[feature.id.to_s].uniq! when "Proteomics" proteomics[feature.id.to_s] ||= [] proteomics[feature.id.to_s] << value diff --git a/lib/regression.rb b/lib/regression.rb index fe45f99..d2c4e91 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -84,7 +84,7 @@ module OpenTox activities = [] weights = [] - pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem.keys}.flatten.uniq + pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq data_frame = [] data_frame[0] = [] @@ -93,7 +93,7 @@ module OpenTox n["toxicities"][params[:prediction_feature_id].to_s].each do |act| data_frame[0][i] = act n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? - neighbor.physchem.each do |pid,values| + neighbor.physchem_descriptors.each do |pid,values| values.uniq! warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1 j = pc_ids.index(pid)+1 @@ -121,7 +121,7 @@ module OpenTox return result else query_descriptors = pc_ids.collect do |i| - compound.physchem[i] ? compound.physchem_descriptors[i].for_R : "NA" + compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA" end remove_idx = [] query_descriptors.each_with_index do |v,i| diff --git a/lib/substance.rb b/lib/substance.rb index 34bc94a..82ca65d 100644 --- a/lib/substance.rb +++ b/lib/substance.rb @@ -1,7 +1,7 @@ module OpenTox class Substance - field :physchem, type: Hash, default: {} + field :physchem_descriptors, type: Hash, default: {} field :toxicities, type: Hash, default: {} field :dataset_ids, type: Array, default: [] end -- cgit v1.2.3 From 48234554ea99b972a01718ac36c4e8332dd9159b Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sat, 7 May 2016 10:34:03 +0200 Subject: -log10 for regression datasets, test cleanups --- lib/compound.rb | 1 - lib/lazar.rb | 1 - lib/physchem.rb | 1 + 3 files changed, 1 insertion(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index 6cb7f78..c2ce5d0 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -22,7 +22,6 @@ module OpenTox # Overwrites standard Mongoid method to create fingerprints before database insertion def self.find_or_create_by params - #PhysChem.descriptors # load descriptor features compound = self.find_or_initialize_by params compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT).size compound.save diff --git a/lib/lazar.rb b/lib/lazar.rb index 8daaaa1..140bca3 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -61,7 +61,6 @@ suppressPackageStartupMessages({ " # OpenTox classes and includes -#CLASSES = ["Feature","Substance::Compound","Substance::Nanoparticle","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules [ # be aware of the require sequence as it affects class/method overwrites diff --git a/lib/physchem.rb b/lib/physchem.rb index f7b880f..86300ba 100644 --- a/lib/physchem.rb +++ b/lib/physchem.rb @@ -131,3 +131,4 @@ module OpenTox end end +OpenTox::PhysChem.descriptors # load descriptor features -- cgit v1.2.3 From 06fc914653face2c58fd4e6c47161cb03e217582 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sun, 8 May 2016 12:22:58 +0200 Subject: default validations fixed --- lib/classification.rb | 5 +++-- lib/compound.rb | 2 +- lib/crossvalidation.rb | 4 +--- lib/dataset.rb | 15 +++++++++------ lib/leave-one-out-validation.rb | 2 +- lib/model.rb | 5 ++--- lib/regression.rb | 10 +++++----- lib/validation.rb | 4 +++- 8 files changed, 25 insertions(+), 22 deletions(-) (limited to 'lib') diff --git a/lib/classification.rb b/lib/classification.rb index 93b4f0f..4cc9201 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -6,13 +6,14 @@ module OpenTox def self.weighted_majority_vote compound, params neighbors = params[:neighbors] feature_id = params[:prediction_feature_id].to_s + dataset_id = params[:training_dataset_id].to_s sims = {} neighbors.each do |n| sim = n["tanimoto"] - n["toxicities"][feature_id].each do |act| + n["toxicities"][feature_id][dataset_id].each do |act| sims[act] ||= [] sims[act] << sim - end + end if n["toxicities"][feature_id][dataset_id] end sim_all = sims.collect{|a,s| s}.flatten sim_sum = sim_all.sum diff --git a/lib/compound.rb b/lib/compound.rb index c2ce5d0..3af6f6c 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -288,7 +288,7 @@ module OpenTox training_dataset.compounds.each do |compound| candidate_fingerprint = compound.fingerprint params[:type] sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f - neighbors << {"_id" => compound.id, "toxicities" => {prediction_feature.id.to_s => compound.toxicities[prediction_feature.id.to_s]}, "tanimoto" => sim} if sim >= params[:min_sim] + neighbors << {"_id" => compound.id, "toxicities" => {prediction_feature.id.to_s => {training_dataset_id.to_s => compound.toxicities[prediction_feature.id.to_s][training_dataset_id.to_s]}}, "tanimoto" => sim} if sim >= params[:min_sim] end neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]} end diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index e1f956b..8e0c5b9 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -133,14 +133,12 @@ module OpenTox neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters) neighbors.collect! do |n| neighbor = Compound.find(n[0]) - { :smiles => neighbor.smiles, :similarity => n[1], :measurements => neighbor.toxicities[prediction_feature.id.to_s]} + { :smiles => neighbor.smiles, :similarity => n[1], :measurements => neighbor.toxicities[prediction_feature.id.to_s][training_dataset.id.to_s]} end { :smiles => compound.smiles, - #:fingerprint => compound.fp4.collect{|id| Smarts.find(id).name}, :measured => p[1], :predicted => p[2], - #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs, :error => (p[1]-p[2]).abs, :relative_error => (p[1]-p[2]).abs/p[1], :confidence => p[3], diff --git a/lib/dataset.rb b/lib/dataset.rb index 9b24440..86800c6 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -64,6 +64,9 @@ module OpenTox dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id ) dataset.compounds.each do |compound| compound.dataset_ids << dataset.id + compound.toxicities.each do |feature_id,data| + data[dataset.id.to_s] = data[self.id.to_s] # copy data entries + end compound.save end dataset @@ -92,7 +95,7 @@ module OpenTox else name = substance.name end - nr_measurements = features.collect{|f| substance.toxicities[f.id.to_s].size if substance.toxicities[f.id.to_s]}.compact.uniq + nr_measurements = features.collect{|f| substance.toxicities[f.id.to_s][self.id.to_s].size if substance.toxicities[f.id.to_s]}.compact.uniq if nr_measurements.size > 1 warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries." @@ -100,8 +103,8 @@ module OpenTox (0..nr_measurements.first-1).each do |i| row = [name] features.each do |f| - if substance.toxicities[f.id.to_s] - row << substance.toxicities[f.id.to_s][i] + if substance.toxicities[f.id.to_s] and substance.toxicities[f.id.to_s][self.id.to_s] + row << substance.toxicities[f.id.to_s][self.id.to_s][i] else row << "" end @@ -149,7 +152,6 @@ module OpenTox feature_names = table.shift.collect{|f| f.strip} warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size compound_format = feature_names.shift.strip - # TODO nanoparticles bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i numeric = [] # guess feature types @@ -219,8 +221,9 @@ module OpenTox else v = v.strip end - compound.toxicities[feature_ids[j].to_s] ||= [] - compound.toxicities[feature_ids[j].to_s] << v + compound.toxicities[feature_ids[j].to_s] ||= {} + compound.toxicities[feature_ids[j].to_s][self.id.to_s] ||= [] + compound.toxicities[feature_ids[j].to_s][self.id.to_s] << v compound.save end end diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index ed917eb..2306041 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -19,7 +19,7 @@ module OpenTox nr_unpredicted = 0 predictions.each do |cid,prediction| if prediction[:value] - prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s] + prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s][dataset_id.to_s] else nr_unpredicted += 1 end diff --git a/lib/model.rb b/lib/model.rb index 841ab20..5b094fb 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -41,7 +41,7 @@ module OpenTox toxicities = [] substances = [] training_dataset.substances.each do |s| - s["toxicities"][prediction_feature_id].each do |act| + s["toxicities"][prediction_feature_id][training_dataset_id.to_s].each do |act| toxicities << act substances << s end @@ -76,8 +76,7 @@ module OpenTox prediction = {} if neighbors.collect{|n| n["_id"]}.include? compound.id - #TODO restrict to dataset features - database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s].uniq + database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s][training_dataset_id.to_s].uniq prediction[:database_activities] = database_activities prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." neighbors.delete_if{|n| n["_id"] == compound.id} diff --git a/lib/regression.rb b/lib/regression.rb index d2c4e91..13e1380 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -11,7 +11,7 @@ module OpenTox sim = row["tanimoto"] sim ||= 1 # TODO: sim f nanoparticles if row["toxicities"][params[:prediction_feature_id].to_s] - row["toxicities"][params[:prediction_feature_id].to_s].each do |act| + row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| weighted_sum += sim*act sim_sum += sim end @@ -33,7 +33,7 @@ module OpenTox neighbor = Compound.find row["_id"] fingerprint = neighbor.fingerprint if row["toxicities"][params[:prediction_feature_id].to_s] - row["toxicities"][params[:prediction_feature_id].to_s].each do |act| + row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| activities << act weights << row["tanimoto"] fingerprint_ids.each_with_index do |id,j| @@ -77,10 +77,10 @@ module OpenTox def self.local_physchem_regression compound, params, method="pls"#, method_params="ncomp = 4" - neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s]} # use only neighbors with measured activities + neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s] and n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s]} # use only neighbors with measured activities return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 - return {:value => neighbors.first["toxicities"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 + return {:value => neighbors.first["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 activities = [] weights = [] @@ -90,7 +90,7 @@ module OpenTox neighbors.each_with_index do |n,i| neighbor = Substance.find(n["_id"]) - n["toxicities"][params[:prediction_feature_id].to_s].each do |act| + n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| data_frame[0][i] = act n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? neighbor.physchem_descriptors.each do |pid,values| diff --git a/lib/validation.rb b/lib/validation.rb index 68cb1a1..334efd7 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -34,7 +34,9 @@ module OpenTox nr_unpredicted = 0 predictions.each do |cid,prediction| if prediction[:value] - prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s] + tox = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s] + #prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s][test_set.id.to_s] + prediction[:measured] = tox[test_set.id.to_s] if tox else nr_unpredicted += 1 end -- cgit v1.2.3 From ab652ac85036c5b372e7f1a08cdb75a19db5b19a Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sun, 8 May 2016 12:57:10 +0200 Subject: regression crossvalidation fixed --- lib/compound.rb | 5 ++++- lib/leave-one-out-validation.rb | 6 +++--- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index 3af6f6c..0a9111b 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -288,7 +288,10 @@ module OpenTox training_dataset.compounds.each do |compound| candidate_fingerprint = compound.fingerprint params[:type] sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f - neighbors << {"_id" => compound.id, "toxicities" => {prediction_feature.id.to_s => {training_dataset_id.to_s => compound.toxicities[prediction_feature.id.to_s][training_dataset_id.to_s]}}, "tanimoto" => sim} if sim >= params[:min_sim] + fid = prediction_feature.id.to_s + did = params[:training_dataset_id].to_s + v = compound.toxicities[prediction_feature.id.to_s] + neighbors << {"_id" => compound.id, "toxicities" => {fid => {did => v[params[:training_dataset_id].to_s]}}, "tanimoto" => sim} if sim >= params[:min_sim] and v end neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]} end diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index 2306041..7189617 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -3,7 +3,6 @@ module OpenTox class LeaveOneOutValidation field :model_id, type: BSON::ObjectId - field :dataset_id, type: BSON::ObjectId field :nr_instances, type: Integer field :nr_unpredicted, type: Integer field :predictions, type: Hash @@ -13,13 +12,14 @@ module OpenTox $logger.debug "#{model.name}: LOO validation started" t = Time.now model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOutValidation : klass = RegressionLeaveOneOutValidation - loo = klass.new :model_id => model.id, :dataset_id => model.training_dataset_id + loo = klass.new :model_id => model.id predictions = model.predict model.training_dataset.compounds predictions.each{|cid,p| p.delete(:neighbors)} nr_unpredicted = 0 predictions.each do |cid,prediction| if prediction[:value] - prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s][dataset_id.to_s] + tox = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s] + prediction[:measured] = tox[model.training_dataset_id.to_s] if tox else nr_unpredicted += 1 end -- cgit v1.2.3 From 7794086d367fb256c3673d7578b23ec2fb83e6ed Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 9 May 2016 14:05:29 +0200 Subject: physchem crossvalidation fixed --- lib/regression.rb | 3 ++- lib/validation-statistics.rb | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/regression.rb b/lib/regression.rb index 13e1380..b8a7e5f 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -80,7 +80,7 @@ module OpenTox neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s] and n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s]} # use only neighbors with measured activities return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 - return {:value => neighbors.first["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 + return {:value => neighbors.first["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].median, :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 activities = [] weights = [] @@ -94,6 +94,7 @@ module OpenTox data_frame[0][i] = act n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? neighbor.physchem_descriptors.each do |pid,values| + values = [values] if values.is_a? Float values.uniq! warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1 j = pc_ids.index(pid)+1 diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index b7c95f6..0079bae 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -68,7 +68,7 @@ module OpenTox x = [] y = [] predictions.each do |cid,pred| - if pred[:value] and pred[:measured] #and pred[:measured] != [nil] + if pred[:value] and pred[:measured] x << pred[:measured].median y << pred[:value] error = pred[:value]-pred[:measured].median -- cgit v1.2.3 From 611bac891177f8d9185d45486dd574b6ef4d1912 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 9 May 2016 15:11:46 +0200 Subject: nanoparticle models fixed --- lib/dataset.rb | 8 ++++---- lib/import.rb | 6 +++--- lib/model.rb | 1 + lib/nanoparticle.rb | 37 +++++++++++++++++++++---------------- lib/regression.rb | 2 +- 5 files changed, 30 insertions(+), 24 deletions(-) (limited to 'lib') diff --git a/lib/dataset.rb b/lib/dataset.rb index 86800c6..9738c1f 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -62,12 +62,12 @@ module OpenTox training_cids = training_idxs.collect{|i| substance_ids[i]} chunk = [training_cids,test_cids].collect do |cids| dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id ) - dataset.compounds.each do |compound| - compound.dataset_ids << dataset.id - compound.toxicities.each do |feature_id,data| + dataset.substances.each do |substance| + substance.dataset_ids << dataset.id + substance.toxicities.each do |feature_id,data| data[dataset.id.to_s] = data[self.id.to_s] # copy data entries end - compound.save + substance.save end dataset end diff --git a/lib/import.rb b/lib/import.rb index 11cb367..dfe5e2d 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -40,10 +40,10 @@ module OpenTox datasets[bundle_uri].substance_ids << nanoparticle.id nanoparticle["dataset_ids"] << datasets[bundle_uri].id end + bundle = datasets[np["bundles"].keys.first].id if np["bundles"].size == 1 study["effects"].each do |effect| effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature # TODO parse core/coating - # TODO parse proteomics, they come as a large textValue #$logger.debug File.join(np["compound"]["URI"],"study") effect["conditions"].delete_if { |k, v| v.nil? } # parse proteomics data @@ -53,7 +53,7 @@ module OpenTox :name => identifier, :category => "Proteomics", ) - nanoparticle.parse_ambit_value feature, value + nanoparticle.parse_ambit_value feature, value, bundle end else feature = klass.find_or_create_by( @@ -62,7 +62,7 @@ module OpenTox :category => study["protocol"]["topcategory"], :conditions => effect["conditions"] ) - nanoparticle.parse_ambit_value feature, effect["result"] + nanoparticle.parse_ambit_value feature, effect["result"], bundle end end nanoparticle.save diff --git a/lib/model.rb b/lib/model.rb index 5b094fb..070248a 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -76,6 +76,7 @@ module OpenTox prediction = {} if neighbors.collect{|n| n["_id"]}.include? compound.id + me = neighbors.select{|n| n["_id"] == compound.id}.first database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s][training_dataset_id.to_s].uniq prediction[:database_activities] = database_activities prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 9bf419d..b79981d 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -9,10 +9,14 @@ module OpenTox field :proteomics, type: Hash, default: {} def nanoparticle_neighbors params - Dataset.find(params[:training_dataset_id]).nanoparticles.collect{|np| np["tanimoto"] = 1; np} + dataset = Dataset.find(params[:training_dataset_id]) + Dataset.find(params[:training_dataset_id]).nanoparticles.collect do |np| + np["tanimoto"] = 1 + np unless np.toxicities.empty? + end.compact end - def add_feature feature, value + def add_feature feature, value, dataset_id case feature.category when "P-CHEM" physchem_descriptors[feature.id.to_s] ||= [] @@ -23,51 +27,52 @@ module OpenTox proteomics[feature.id.to_s] << value proteomics[feature.id.to_s].uniq! when "TOX" - toxicities[feature.id.to_s] ||= [] + toxicities[feature.id.to_s] ||= {} + toxicities[feature.id.to_s][dataset_id.to_s] ||= [] # TODO generic way of parsing TOX values if feature.name == "7.99 Toxicity (other) ICP-AES" and feature.unit == "mL/ug(Mg)" - toxicities[feature.id.to_s] << -Math.log10(value) + toxicities[feature.id.to_s][dataset_id.to_s] << -Math.log10(value) else - toxicities[feature.id.to_s] << value + toxicities[feature.id.to_s][dataset_id.to_s] << value end - toxicities[feature.id.to_s].uniq! + toxicities[feature.id.to_s][dataset_id.to_s].uniq! else warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted." end end - def parse_ambit_value feature, v + def parse_ambit_value feature, v, dataset_id v.delete "unit" # TODO: ppm instead of weights if v.keys == ["textValue"] - add_feature feature, v["textValue"] + add_feature feature, v["textValue"], dataset_id elsif v.keys == ["loValue"] - add_feature feature, v["loValue"] + add_feature feature, v["loValue"], dataset_id elsif v.keys.size == 2 and v["errorValue"] - add_feature feature, v["loValue"] + add_feature feature, v["loValue"], dataset_id warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." elsif v.keys.size == 2 and v["loQualifier"] == "mean" - add_feature feature, v["loValue"] + add_feature feature, v["loValue"], dataset_id warn "'#{feature.name}' is a mean value. Original data is not available." elsif v.keys.size == 2 and v["loQualifier"] #== ">=" warn "Only min value available for '#{feature.name}', entry ignored" elsif v.keys.size == 2 and v["upQualifier"] #== ">=" warn "Only max value available for '#{feature.name}', entry ignored" elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil? - add_feature feature, v["loValue"] + add_feature feature, v["loValue"], dataset_id warn "loQualifier and upQualifier are empty." elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"] == "" and v["upQualifier"] == "" - add_feature feature, v["loValue"] + add_feature feature, v["loValue"], dataset_id warn "loQualifier and upQualifier are empty." elsif v.keys.size == 4 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil? - add_feature feature, v["loValue"] + add_feature feature, v["loValue"], dataset_id warn "loQualifier and upQualifier are empty." elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] and v["loValue"] and v["upValue"] - add_feature feature, [v["loValue"],v["upValue"]].mean + add_feature feature, [v["loValue"],v["upValue"]].mean, dataset_id warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available." elsif v.size == 4 and v["loQualifier"] == "mean" and v["errorValue"] warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." - add_feature feature, v["loValue"] + add_feature feature, v["loValue"], dataset_id elsif v == {} # do nothing else warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'." diff --git a/lib/regression.rb b/lib/regression.rb index b8a7e5f..691f903 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -10,7 +10,7 @@ module OpenTox neighbors.each do |row| sim = row["tanimoto"] sim ||= 1 # TODO: sim f nanoparticles - if row["toxicities"][params[:prediction_feature_id].to_s] + if row["toxicities"][params[:prediction_feature_id].to_s] and row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s] row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| weighted_sum += sim*act sim_sum += sim -- cgit v1.2.3 From c1be8fe66f640d44dbbc9bfe5212733994bfb9c5 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 9 May 2016 15:44:29 +0200 Subject: physchem crossvalidation fixed, test_compound_descriptor_parameters assertions fixed --- lib/regression.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/regression.rb b/lib/regression.rb index 691f903..2eaae73 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -94,7 +94,7 @@ module OpenTox data_frame[0][i] = act n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? neighbor.physchem_descriptors.each do |pid,values| - values = [values] if values.is_a? Float + values = [values] unless values.is_a? Array values.uniq! warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1 j = pc_ids.index(pid)+1 -- cgit v1.2.3 From b8bb12c8a163c238d7d4387c1914e2100bb660df Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 12 May 2016 15:23:01 +0200 Subject: enm study import fixed --- lib/classification.rb | 15 +++---- lib/compound.rb | 120 +++++++++++++++++++++++++------------------------ lib/crossvalidation.rb | 21 ++++++--- lib/dataset.rb | 77 +++++++++++++++---------------- lib/import.rb | 8 ++-- lib/lazar.rb | 2 + lib/model.rb | 65 +++++++++++++++++---------- lib/nanoparticle.rb | 80 ++++++++++++++++++++------------- lib/regression.rb | 102 +++++++++++++++++++---------------------- lib/substance.rb | 1 - lib/validation.rb | 4 ++ 11 files changed, 270 insertions(+), 225 deletions(-) (limited to 'lib') diff --git a/lib/classification.rb b/lib/classification.rb index 4cc9201..48ff8b3 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -3,17 +3,15 @@ module OpenTox class Classification - def self.weighted_majority_vote compound, params - neighbors = params[:neighbors] - feature_id = params[:prediction_feature_id].to_s - dataset_id = params[:training_dataset_id].to_s + def self.weighted_majority_vote substance, neighbors sims = {} - neighbors.each do |n| - sim = n["tanimoto"] - n["toxicities"][feature_id][dataset_id].each do |act| + neighbors.each do |neighbor| + sim = neighbor["similarity"] + activities = neighbor["toxicities"] + activities.each do |act| sims[act] ||= [] sims[act] << sim - end if n["toxicities"][feature_id][dataset_id] + end if activities end sim_all = sims.collect{|a,s| s}.flatten sim_sum = sim_all.sum @@ -26,7 +24,6 @@ module OpenTox p_max = probabilities.collect{|a,p| p}.max prediction = probabilities.key(p_max) {:value => prediction,:probabilities => probabilities} - end end end diff --git a/lib/compound.rb b/lib/compound.rb index 0a9111b..2554d54 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -254,67 +254,69 @@ module OpenTox self["chemblid"] end - def fingerprint_count_neighbors params - # TODO fix +# def fingerprint_count_neighbors params +# # TODO fix +# neighbors = [] +# query_fingerprint = self.fingerprint params[:type] +# training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound| +# unless self == compound +# candidate_fingerprint = compound.fingerprint params[:type] +# features = (query_fingerprint + candidate_fingerprint).uniq +# min_sum = 0 +# max_sum = 0 +# features.each do |f| +# min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax +# min_sum += min +# max_sum += max +# end +# max_sum == 0 ? sim = 0 : sim = min_sum/max_sum.to_f +# neighbors << [compound.id, sim] if sim and sim >= params[:min_sim] +# end +# end +# neighbors.sort{|a,b| b.last <=> a.last} +# end + + def fingerprint_neighbors(type:, min_sim: 0.1, dataset_id:, prediction_feature_id:) neighbors = [] - query_fingerprint = self.fingerprint params[:type] - training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound| - unless self == compound - candidate_fingerprint = compound.fingerprint params[:type] - features = (query_fingerprint + candidate_fingerprint).uniq - min_sum = 0 - max_sum = 0 - features.each do |f| - min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax - min_sum += min - max_sum += max - end - max_sum == 0 ? sim = 0 : sim = min_sum/max_sum.to_f - neighbors << [compound.id, sim] if sim and sim >= params[:min_sim] + dataset = Dataset.find(dataset_id) + if type == DEFAULT_FINGERPRINT + neighbors = db_neighbors(min_sim: min_sim, dataset_id: dataset_id) + neighbors.each do |n| + n["toxicities"] = dataset.values(n["_id"],prediction_feature_id) end - end - neighbors.sort{|a,b| b.last <=> a.last} - end - - def fingerprint_neighbors params - bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim] - neighbors = [] - if params[:type] == DEFAULT_FINGERPRINT - neighbors = db_neighbors params else - query_fingerprint = self.fingerprint params[:type] - training_dataset = Dataset.find(params[:training_dataset_id]) - prediction_feature = training_dataset.features.first - training_dataset.compounds.each do |compound| - candidate_fingerprint = compound.fingerprint params[:type] - sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f - fid = prediction_feature.id.to_s - did = params[:training_dataset_id].to_s - v = compound.toxicities[prediction_feature.id.to_s] - neighbors << {"_id" => compound.id, "toxicities" => {fid => {did => v[params[:training_dataset_id].to_s]}}, "tanimoto" => sim} if sim >= params[:min_sim] and v - end - neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]} - end - neighbors - end - - def physchem_neighbors params - feature_dataset = Dataset.find params[:feature_dataset_id] - query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors] - neighbors = [] - feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i| - # TODO implement pearson and cosine similarity separatly - R.assign "x", query_fingerprint - R.assign "y", candidate_fingerprint - sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first - if sim >= params[:min_sim] - neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming + query_fingerprint = self.fingerprint type + dataset.compounds.each do |compound| + values = dataset.values(compound,prediction_feature_id) + if values + candidate_fingerprint = compound.fingerprint type + sim = Algorithm::Similarity.tanimoto(query_fingerprint , candidate_fingerprint) + neighbors << {"_id" => compound.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim + end end + neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} end neighbors end - def db_neighbors params +# def physchem_neighbors params +# # TODO: fix, tests +# feature_dataset = Dataset.find params[:feature_dataset_id] +# query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors] +# neighbors = [] +# feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i| +# # TODO implement pearson and cosine similarity separatly +# R.assign "x", query_fingerprint +# R.assign "y", candidate_fingerprint +# sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first +# if sim >= params[:min_sim] +# neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming +# end +# end +# neighbors +# end + + def db_neighbors min_sim: 0.1, dataset_id: # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb #qn = default_fingerprint_size @@ -326,20 +328,20 @@ module OpenTox #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}}, #{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self {'$project' => { - 'tanimoto' => {'$let' => { + 'similarity' => {'$let' => { 'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}}, - #'vars' => {'common' => {'$size' => {'$setIntersection' => ["$default_fingerprint", default_fingerprint]}}}, 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]} }}, '_id' => 1, - 'toxicities' => 1, + #'toxicities' => 1, 'dataset_ids' => 1 }}, - {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}}, - {'$sort' => {'tanimoto' => -1}} + {'$match' => {'similarity' => {'$gte' => min_sim}}}, + {'$sort' => {'similarity' => -1}} ] - $mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]} + # TODO move into aggregate pipeline, see http://stackoverflow.com/questions/30537317/mongodb-aggregation-match-if-value-in-array + $mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? dataset_id} end diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 8e0c5b9..da4b731 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -77,6 +77,7 @@ module OpenTox def statistics stat = ValidationStatistics.classification(predictions, Feature.find(model.prediction_feature_id).accept_values) update_attributes(stat) + stat end def confidence_plot @@ -120,6 +121,7 @@ module OpenTox def statistics stat = ValidationStatistics.regression predictions update_attributes(stat) + stat end def misclassifications n=nil @@ -164,24 +166,29 @@ module OpenTox end def correlation_plot - unless correlation_plot_id + #unless correlation_plot_id tmpfile = "/tmp/#{id.to_s}_correlation.png" - x = predictions.collect{|p| p[1]} - y = predictions.collect{|p| p[2]} + x = [] + y = [] + predictions.each do |sid,p| + x << p["value"] + y << p["measured"].median + end attributes = Model::Lazar.find(self.model_id).attributes attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key} attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n") R.assign "measurement", x R.assign "prediction", y - R.eval "all = c(-log(measurement),-log(prediction))" + R.eval "all = c(measurement,prediction)" R.eval "range = c(min(all), max(all))" - R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)" + R.eval "image = qplot(prediction,measurement,main='#{self.name}',asp=1,xlim=range, ylim=range)" R.eval "image = image + geom_abline(intercept=0, slope=1)" - R.eval "ggsave(file='#{tmpfile}', plot=image)" + #R.eval "ggsave(file='#{tmpfile}', plot=image)" + R.eval "ggsave(file='#{tmpfile}')" file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png") plot_id = $gridfs.insert_one(file) update(:correlation_plot_id => plot_id) - end + #end $gridfs.find_one(_id: correlation_plot_id).data end end diff --git a/lib/dataset.rb b/lib/dataset.rb index 9738c1f..8c7fe68 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -7,6 +7,7 @@ module OpenTox field :substance_ids, type: Array, default: [] field :feature_ids, type: Array, default: [] + field :data_entries, type: Hash, default: {} # Readers @@ -30,6 +31,16 @@ module OpenTox @features end + def values substance,feature + substance = substance.id if substance.is_a? Substance + feature = feature.id if feature.is_a? Feature + if data_entries[substance.to_s] and data_entries[substance.to_s][feature.to_s] + data_entries[substance.to_s][feature.to_s] + else + nil + end + end + # Writers # Set compounds @@ -42,6 +53,14 @@ module OpenTox self.feature_ids = features.collect{|f| f.id} end + def add(substance,feature,value) + substance = substance.id if substance.is_a? Substance + feature = feature.id if feature.is_a? Feature + data_entries[substance.to_s] ||= {} + data_entries[substance.to_s][feature.to_s] ||= [] + data_entries[substance.to_s][feature.to_s] << value + end + # Dataset operations # Split a dataset into n folds @@ -64,11 +83,10 @@ module OpenTox dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id ) dataset.substances.each do |substance| substance.dataset_ids << dataset.id - substance.toxicities.each do |feature_id,data| - data[dataset.id.to_s] = data[self.id.to_s] # copy data entries - end substance.save + dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {} end + dataset.save dataset end start = last+1 @@ -95,7 +113,7 @@ module OpenTox else name = substance.name end - nr_measurements = features.collect{|f| substance.toxicities[f.id.to_s][self.id.to_s].size if substance.toxicities[f.id.to_s]}.compact.uniq + nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq if nr_measurements.size > 1 warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries." @@ -103,8 +121,8 @@ module OpenTox (0..nr_measurements.first-1).each do |i| row = [name] features.each do |f| - if substance.toxicities[f.id.to_s] and substance.toxicities[f.id.to_s][self.id.to_s] - row << substance.toxicities[f.id.to_s][self.id.to_s][i] + if data_entries[substance.id.to_s] and data_entries[substance.id.to_s][f.id.to_s] + row << data_entries[substance.id.to_s][f.id.to_s] else row << "" end @@ -146,8 +164,6 @@ module OpenTox # does a lot of guesswork in order to determine feature types def parse_table table - time = Time.now - # features feature_names = table.shift.collect{|f| f.strip} warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size @@ -174,39 +190,31 @@ module OpenTox feature_ids << feature.id if feature end - $logger.debug "Feature values: #{Time.now-time}" - time = Time.now - - r = -1 - compound_time = 0 - value_time = 0 - - # compounds and values + # substances and values table.each_with_index do |vals,i| - ct = Time.now identifier = vals.shift.strip warn "No feature values for compound at position #{i+2}." if vals.compact.empty? begin case compound_format when /SMILES/i - compound = OpenTox::Compound.from_smiles(identifier) + substance = OpenTox::Compound.from_smiles(identifier) when /InChI/i - compound = OpenTox::Compound.from_inchi(identifier) + substance = OpenTox::Compound.from_inchi(identifier) # TODO nanoparticle end rescue - compound = nil + substance = nil end - if compound.nil? # compound parsers may return nil + if substance.nil? # compound parsers may return nil warn "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored." next end - substance_ids << compound.id - compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id - compound_time += Time.now-ct + substance_ids << substance.id + data_entries[substance.id.to_s] = {} + substance.dataset_ids << self.id unless substance.dataset_ids.include? self.id + substance.save - r += 1 unless vals.size == feature_ids.size warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored." next @@ -214,32 +222,25 @@ module OpenTox vals.each_with_index do |v,j| if v.blank? - warn "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})." + warn "Empty value for compound '#{identifier}' and feature '#{feature_names[i]}'." next elsif numeric[j] v = v.to_f else v = v.strip end - compound.toxicities[feature_ids[j].to_s] ||= {} - compound.toxicities[feature_ids[j].to_s][self.id.to_s] ||= [] - compound.toxicities[feature_ids[j].to_s][self.id.to_s] << v - compound.save + data_entries[substance.id.to_s][feature_ids[j].to_s] ||= [] + data_entries[substance.id.to_s][feature_ids[j].to_s] << v end end - compounds.duplicates.each do |compound| + substances.duplicates.each do |substance| positions = [] - compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi} - warn "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." + substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi} + warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." end substance_ids.uniq! feature_ids.uniq! - - $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})" - time = Time.now save - $logger.debug "Saving: #{Time.now-time}" - end end diff --git a/lib/import.rb b/lib/import.rb index dfe5e2d..3c6966e 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -9,16 +9,18 @@ module OpenTox #get list of bundle URIs bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] File.open(File.join(dir,"bundles.json"),"w+"){|f| f.puts JSON.pretty_generate(bundles)} - datasets = [] bundles.each do |bundle| + p bundle["title"] nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"] + p nanoparticles.size nanoparticles.each do |nanoparticle| uuid = nanoparticle["values"]["https://data.enanomapper.net/identifier/uuid"] $logger.debug uuid File.open(File.join(dir,"nanoparticle-#{uuid}.json"),"w+"){|f| f.puts JSON.pretty_generate(nanoparticle)} studies = JSON.parse(RestClientWrapper.get(File.join(nanoparticle["compound"]["URI"],"study")))["study"] + p uuid if studies.size < 1 studies.each do |study| - File.open(File.join(dir,"study-#{uuid}.json"),"w+"){|f| f.puts JSON.pretty_generate(study)} + File.open(File.join(dir,"study-#{study["uuid"]}.json"),"w+"){|f| f.puts JSON.pretty_generate(study)} end end end @@ -37,7 +39,7 @@ module OpenTox :source => np["compound"]["URI"], ) np["bundles"].keys.each do |bundle_uri| - datasets[bundle_uri].substance_ids << nanoparticle.id + #datasets[bundle_uri].substance_ids << nanoparticle.id nanoparticle["dataset_ids"] << datasets[bundle_uri].id end bundle = datasets[np["bundles"].keys.first].id if np["bundles"].size == 1 diff --git a/lib/lazar.rb b/lib/lazar.rb index 140bca3..55de511 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -48,6 +48,7 @@ NR_CORES = `getconf _NPROCESSORS_ONLN`.to_i R = Rserve::Connection.new R.eval " suppressPackageStartupMessages({ + library(labeling,lib=\"#{rlib}\") library(iterators,lib=\"#{rlib}\") library(foreach,lib=\"#{rlib}\") library(ggplot2,lib=\"#{rlib}\") @@ -75,6 +76,7 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","Cross "nanoparticle.rb", "dataset.rb", "algorithm.rb", + "similarity", "model.rb", "classification.rb", "regression.rb", diff --git a/lib/model.rb b/lib/model.rb index 070248a..8baed41 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -30,7 +30,7 @@ module OpenTox self.training_dataset_id ||= training_dataset.id self.name ||= "#{training_dataset.name} #{prediction_feature.name}" self.neighbor_algorithm_parameters ||= {} - self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id + self.neighbor_algorithm_parameters[:dataset_id] = training_dataset.id Algorithm.run(feature_selection_algorithm, self) if feature_selection_algorithm save @@ -41,7 +41,7 @@ module OpenTox toxicities = [] substances = [] training_dataset.substances.each do |s| - s["toxicities"][prediction_feature_id][training_dataset_id.to_s].each do |act| + training_dataset.values(s,prediction_feature_id).each do |act| toxicities << act substances << s end @@ -68,24 +68,41 @@ module OpenTox relevant_features.sort!{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h end - def predict_compound compound - neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) - # remove neighbors without prediction_feature - # check for database activities (neighbors may include query compound) + def predict_substance substance + neighbors = substance.send(neighbor_algorithm, neighbor_algorithm_parameters) database_activities = nil prediction = {} - if neighbors.collect{|n| n["_id"]}.include? compound.id + # handle query substance + if neighbors.collect{|n| n["_id"]}.include? substance.id - me = neighbors.select{|n| n["_id"] == compound.id}.first - database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s][training_dataset_id.to_s].uniq + query = neighbors.select{|n| n["_id"] == substance.id}.first + database_activities = training_dataset.values(query["_id"],prediction_feature_id) prediction[:database_activities] = database_activities - prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound." - neighbors.delete_if{|n| n["_id"] == compound.id} + prediction[:warning] = "#{database_activities.size} substances have been removed from neighbors, because they are identical with the query substance." + neighbors.delete_if{|n| n["_id"] == substance.id} # remove query substance for an unbiased prediction (also useful for loo validation) end if neighbors.empty? - prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []}) + prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) + elsif neighbors.size == 1 + value = nil + tox = neighbors.first["toxicities"] + if tox.size == 1 # single measurement + value = tox + else # multiple measurement + if tox.collect{|t| t.numeric?}.uniq == [true] # numeric + value = tox.median + elsif tox.uniq.size == 1 # single value + value = tox.first + else # contradictory results + # TODO add majority vote + end + end + prediction.merge!({:value => value, :confidence => nil, :warning => "Only one similar compound in the training set. Predicting median of its experimental values."}) if value else - prediction.merge!(Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_id=> training_dataset_id,:prediction_feature_id => prediction_feature.id})) + # call prediction algorithm + klass,method = prediction_algorithm.split('.') + result = Object.const_get(klass).send(method,substance,neighbors) + prediction.merge! result prediction[:neighbors] = neighbors prediction[:neighbors] ||= [] end @@ -97,27 +114,27 @@ module OpenTox training_dataset = Dataset.find training_dataset_id # parse data - compounds = [] + substances = [] if object.is_a? Substance - compounds = [object] + substances = [object] elsif object.is_a? Array - compounds = object + substances = object elsif object.is_a? Dataset - compounds = object.compounds + substances = object.substances else bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter." end # make predictions predictions = {} - compounds.each do |c| - predictions[c.id.to_s] = predict_compound c + substances.each do |c| + predictions[c.id.to_s] = predict_substance c predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id end # serialize result if object.is_a? Substance - prediction = predictions[compounds.first.id.to_s] + prediction = predictions[substances.first.id.to_s] prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity return prediction elsif object.is_a? Array @@ -160,7 +177,8 @@ module OpenTox model.neighbor_algorithm_parameters ||= {} { :type => "MP2D", - :training_dataset_id => training_dataset.id, + :dataset_id => training_dataset.id, + :prediction_feature_id => prediction_feature.id, :min_sim => 0.1 }.each do |key,value| model.neighbor_algorithm_parameters[key] ||= value @@ -179,8 +197,9 @@ module OpenTox model.neighbor_algorithm_parameters ||= {} { :type => "MP2D", - :training_dataset_id => training_dataset.id, - :min_sim => 0.1 + :min_sim => 0.1, + :dataset_id => training_dataset.id, + :prediction_feature_id => prediction_feature.id, }.each do |key,value| model.neighbor_algorithm_parameters[key] ||= value end diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index b79981d..6527fa3 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -8,15 +8,31 @@ module OpenTox field :bundles, type: Array, default: [] field :proteomics, type: Hash, default: {} - def nanoparticle_neighbors params - dataset = Dataset.find(params[:training_dataset_id]) - Dataset.find(params[:training_dataset_id]).nanoparticles.collect do |np| - np["tanimoto"] = 1 - np unless np.toxicities.empty? - end.compact + def nanoparticle_neighbors min_sim: 0.1, type:, dataset_id:, prediction_feature_id: + dataset = Dataset.find(dataset_id) + neighbors = [] + p dataset.data_entries.size + p dataset.substance_ids.size + p dataset.substance_ids.collect{|i| i.to_s} == dataset.data_entries.keys + p dataset.substance_ids.collect{|i| i.to_s} + p dataset.data_entries.keys + dataset.nanoparticles.each do |np| + prediction_feature_id + p dataset.data_entries[np.id.to_s] + values = dataset.values(np,prediction_feature_id) + p values + if values + common_descriptors = physchem_descriptors.keys & np.physchem_descriptors.keys + sim = Algorithm::Similarity.cosine(common_descriptors.collect{|d| physchem_descriptors[d]}, common_descriptors.collect{|d| np.physchem_descriptors[d]}) + neighbors << {"_id" => np.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim + end + end + neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} + neighbors end def add_feature feature, value, dataset_id + dataset = Dataset.find(dataset_id) case feature.category when "P-CHEM" physchem_descriptors[feature.id.to_s] ||= [] @@ -27,55 +43,59 @@ module OpenTox proteomics[feature.id.to_s] << value proteomics[feature.id.to_s].uniq! when "TOX" - toxicities[feature.id.to_s] ||= {} - toxicities[feature.id.to_s][dataset_id.to_s] ||= [] # TODO generic way of parsing TOX values + p dataset.name + p self.name + p feature.name + p feature.unit + p value if feature.name == "7.99 Toxicity (other) ICP-AES" and feature.unit == "mL/ug(Mg)" - toxicities[feature.id.to_s][dataset_id.to_s] << -Math.log10(value) + dataset.add self, feature, -Math.log10(value) else - toxicities[feature.id.to_s][dataset_id.to_s] << value + dataset.add self, feature, value end - toxicities[feature.id.to_s][dataset_id.to_s].uniq! + dataset.save else warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted." end end def parse_ambit_value feature, v, dataset_id + dataset = Dataset.find(dataset_id) v.delete "unit" # TODO: ppm instead of weights if v.keys == ["textValue"] - add_feature feature, v["textValue"], dataset_id + add_feature feature, v["textValue"], dataset elsif v.keys == ["loValue"] - add_feature feature, v["loValue"], dataset_id + add_feature feature, v["loValue"], dataset elsif v.keys.size == 2 and v["errorValue"] - add_feature feature, v["loValue"], dataset_id - warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." + add_feature feature, v["loValue"], dataset + #warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." elsif v.keys.size == 2 and v["loQualifier"] == "mean" - add_feature feature, v["loValue"], dataset_id - warn "'#{feature.name}' is a mean value. Original data is not available." + add_feature feature, v["loValue"], dataset + #warn "'#{feature.name}' is a mean value. Original data is not available." elsif v.keys.size == 2 and v["loQualifier"] #== ">=" - warn "Only min value available for '#{feature.name}', entry ignored" + #warn "Only min value available for '#{feature.name}', entry ignored" elsif v.keys.size == 2 and v["upQualifier"] #== ">=" - warn "Only max value available for '#{feature.name}', entry ignored" + #warn "Only max value available for '#{feature.name}', entry ignored" elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil? - add_feature feature, v["loValue"], dataset_id - warn "loQualifier and upQualifier are empty." + add_feature feature, v["loValue"], dataset + #warn "loQualifier and upQualifier are empty." elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"] == "" and v["upQualifier"] == "" - add_feature feature, v["loValue"], dataset_id - warn "loQualifier and upQualifier are empty." + add_feature feature, v["loValue"], dataset + #warn "loQualifier and upQualifier are empty." elsif v.keys.size == 4 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil? - add_feature feature, v["loValue"], dataset_id - warn "loQualifier and upQualifier are empty." + add_feature feature, v["loValue"], dataset + #warn "loQualifier and upQualifier are empty." elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] and v["loValue"] and v["upValue"] - add_feature feature, [v["loValue"],v["upValue"]].mean, dataset_id - warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available." + add_feature feature, [v["loValue"],v["upValue"]].mean, dataset + #warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available." elsif v.size == 4 and v["loQualifier"] == "mean" and v["errorValue"] - warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." - add_feature feature, v["loValue"], dataset_id + #warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." + add_feature feature, v["loValue"], dataset elsif v == {} # do nothing else - warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'." + #warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'." end end diff --git a/lib/regression.rb b/lib/regression.rb index 2eaae73..9d305a6 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -3,49 +3,43 @@ module OpenTox class Regression - def self.local_weighted_average compound, params + def self.local_weighted_average substance, neighbors weighted_sum = 0.0 sim_sum = 0.0 - neighbors = params[:neighbors] - neighbors.each do |row| - sim = row["tanimoto"] - sim ||= 1 # TODO: sim f nanoparticles - if row["toxicities"][params[:prediction_feature_id].to_s] and row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s] - row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| - weighted_sum += sim*act - sim_sum += sim - end - end + neighbors.each do |neighbor| + sim = neighbor["similarity"] + activities = neighbor["toxicities"] + activities.each do |act| + weighted_sum += sim*act + sim_sum += sim + end if activities end sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum {:value => prediction} end - def self.local_fingerprint_regression compound, params, method='pls'#, method_params="sigma=0.05" - neighbors = params[:neighbors] - return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 - activities = [] + def self.local_fingerprint_regression substance, neighbors, method='pls'#, method_params="sigma=0.05" + values = [] fingerprints = {} weights = [] - fingerprint_ids = neighbors.collect{|row| Compound.find(row["_id"]).fingerprint}.flatten.uniq.sort - - neighbors.each_with_index do |row,i| - neighbor = Compound.find row["_id"] - fingerprint = neighbor.fingerprint - if row["toxicities"][params[:prediction_feature_id].to_s] - row["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| - activities << act - weights << row["tanimoto"] - fingerprint_ids.each_with_index do |id,j| - fingerprints[id] ||= [] - fingerprints[id] << fingerprint.include?(id) - end + fingerprint_ids = neighbors.collect{|n| Compound.find(n["_id"]).fingerprint}.flatten.uniq.sort + + neighbors.each do |n| + fingerprint = Substance.find(n["_id"]).fingerprint + activities = n["toxicities"] + activities.each do |act| + values << act + weights << n["similarity"] + fingerprint_ids.each do |id| + fingerprints[id] ||= [] + fingerprints[id] << fingerprint.include?(id) end - end + end if activities end variables = [] - data_frame = [activities] + data_frame = [values] + fingerprints.each do |k,v| unless v.uniq.size == 1 data_frame << v.collect{|m| m ? "T" : "F"} @@ -54,17 +48,16 @@ module OpenTox end if variables.empty? - result = local_weighted_average(compound, params) - result[:warning] = "No variables for regression model. Using weighted average of similar compounds." - return result - + prediction = local_weighted_average substance, neighbors + prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." + prediction else - compound_features = variables.collect{|f| compound.fingerprint.include?(f) ? "T" : "F"} - prediction = r_model_prediction method, data_frame, variables, weights, compound_features + substance_features = variables.collect{|f| substance.fingerprint.include?(f) ? "T" : "F"} + prediction = r_model_prediction method, data_frame, variables, weights, substance_features if prediction.nil? or prediction[:value].nil? - prediction = local_weighted_average(compound, params) - prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." - return prediction + prediction = local_weighted_average substance, neighbors + prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." + prediction else prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]] prediction[:value] = prediction[:value] @@ -75,13 +68,10 @@ module OpenTox end - def self.local_physchem_regression compound, params, method="pls"#, method_params="ncomp = 4" - - neighbors = params[:neighbors].select{|n| n["toxicities"][params[:prediction_feature_id].to_s] and n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s]} # use only neighbors with measured activities - - return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 - return {:value => neighbors.first["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].median, :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 + #def self.local_physchem_regression(substance:, neighbors:, feature_id:, dataset_id:, method: 'pls')#, method_params="ncomp = 4" + def self.local_physchem_regression substance, neighbors, method='pls' #, method_params="ncomp = 4" + #dataset = Dataset.find dataset_id activities = [] weights = [] pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq @@ -90,9 +80,11 @@ module OpenTox neighbors.each_with_index do |n,i| neighbor = Substance.find(n["_id"]) - n["toxicities"][params[:prediction_feature_id].to_s][params[:training_dataset_id].to_s].each do |act| + activities = neighbor["toxicities"] + activities.each do |act| data_frame[0][i] = act - n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? + # TODO: update with cosine similarity for physchem + weights << n["similarity"] neighbor.physchem_descriptors.each do |pid,values| values = [values] unless values.is_a? Array values.uniq! @@ -101,7 +93,7 @@ module OpenTox data_frame[j] ||= [] data_frame[j][i] = values.for_R end - end + end if activities (0..pc_ids.size+1).each do |j| # for R: fill empty values with NA data_frame[j] ||= [] data_frame[j][i] ||= "NA" @@ -117,12 +109,12 @@ module OpenTox end if pc_ids.empty? - result = local_weighted_average(compound, params) - result[:warning] = "No variables for regression model. Using weighted average of similar compounds." - return result + prediction = local_weighted_average substance, neighbors + prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." + prediction else query_descriptors = pc_ids.collect do |i| - compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA" + substance.physchem_descriptors[i] ? substance.physchem_descriptors[i].for_R : "NA" end remove_idx = [] query_descriptors.each_with_index do |v,i| @@ -135,9 +127,9 @@ module OpenTox end prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors if prediction.nil? - prediction = local_weighted_average(compound, params) - prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." - return prediction + prediction = local_weighted_average substance, neighbors + prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." + prediction else prediction end diff --git a/lib/substance.rb b/lib/substance.rb index 82ca65d..6768ce7 100644 --- a/lib/substance.rb +++ b/lib/substance.rb @@ -2,7 +2,6 @@ module OpenTox class Substance field :physchem_descriptors, type: Hash, default: {} - field :toxicities, type: Hash, default: {} field :dataset_ids, type: Array, default: [] end diff --git a/lib/validation.rb b/lib/validation.rb index 334efd7..015e718 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -32,9 +32,12 @@ module OpenTox predictions = validation_model.predict test_set.substances predictions.each{|cid,p| p.delete(:neighbors)} nr_unpredicted = 0 + p predictions.size predictions.each do |cid,prediction| + p prediction if prediction[:value] tox = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s] + p tox #prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s][test_set.id.to_s] prediction[:measured] = tox[test_set.id.to_s] if tox else @@ -42,6 +45,7 @@ module OpenTox end predictions.delete(cid) unless prediction[:value] and prediction[:measured] end + p predictions.size validation = self.new( :model_id => validation_model.id, :test_dataset_id => test_set.id, -- cgit v1.2.3 From c90644211e214a50f6fdb3a936bf247f45f1f4be Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 13 May 2016 13:38:24 +0200 Subject: compound tests fixed --- lib/compound.rb | 27 +++++++++++++++--------- lib/crossvalidation.rb | 26 ++++------------------- lib/dataset.rb | 41 +++++++++++++++++++----------------- lib/import.rb | 9 ++------ lib/lazar.rb | 2 +- lib/leave-one-out-validation.rb | 31 +++++---------------------- lib/nanoparticle.rb | 40 ++++++++++++++--------------------- lib/similarity.rb | 46 +++++++++++++++++++++++++++++++++++++++++ lib/validation-statistics.rb | 24 +++++++++++++++++++++ lib/validation.rb | 10 ++------- 10 files changed, 138 insertions(+), 118 deletions(-) create mode 100644 lib/similarity.rb (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index 2554d54..89e9db2 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -254,13 +254,15 @@ module OpenTox self["chemblid"] end -# def fingerprint_count_neighbors params -# # TODO fix -# neighbors = [] -# query_fingerprint = self.fingerprint params[:type] -# training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound| -# unless self == compound -# candidate_fingerprint = compound.fingerprint params[:type] +=begin + def fingerprint_neighbors(type:, min_sim: 0.1, dataset_id:, prediction_feature_id:) + neighbors = [] + dataset = Dataset.find(dataset_id) + query_fingerprint = self.fingerprint type + dataset.compounds.each do |compound| + values = dataset.values(compound,prediction_feature_id) + if values + candidate_fingerprint = compound.fingerprint type # features = (query_fingerprint + candidate_fingerprint).uniq # min_sum = 0 # max_sum = 0 @@ -274,7 +276,13 @@ module OpenTox # end # end # neighbors.sort{|a,b| b.last <=> a.last} -# end + sim = Algorithm::Similarity.tanimoto(query_fingerprint , candidate_fingerprint) + neighbors << {"_id" => compound.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim + end + end + neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]} + end +=end def fingerprint_neighbors(type:, min_sim: 0.1, dataset_id:, prediction_feature_id:) neighbors = [] @@ -294,9 +302,8 @@ module OpenTox neighbors << {"_id" => compound.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim end end - neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} end - neighbors + neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]} end # def physchem_neighbors params diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index da4b731..357f0fa 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -41,6 +41,7 @@ module OpenTox $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" t = Time.now validation = Validation.create(model, fold[0], fold[1],cv) + #p validation $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" #end end @@ -166,29 +167,10 @@ module OpenTox end def correlation_plot - #unless correlation_plot_id - tmpfile = "/tmp/#{id.to_s}_correlation.png" - x = [] - y = [] - predictions.each do |sid,p| - x << p["value"] - y << p["measured"].median - end - attributes = Model::Lazar.find(self.model_id).attributes - attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key} - attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n") - R.assign "measurement", x - R.assign "prediction", y - R.eval "all = c(measurement,prediction)" - R.eval "range = c(min(all), max(all))" - R.eval "image = qplot(prediction,measurement,main='#{self.name}',asp=1,xlim=range, ylim=range)" - R.eval "image = image + geom_abline(intercept=0, slope=1)" - #R.eval "ggsave(file='#{tmpfile}', plot=image)" - R.eval "ggsave(file='#{tmpfile}')" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png") - plot_id = $gridfs.insert_one(file) + unless correlation_plot_id + plot_id = ValidationStatistics.correlation_plot predictions update(:correlation_plot_id => plot_id) - #end + end $gridfs.find_one(_id: correlation_plot_id).data end end diff --git a/lib/dataset.rb b/lib/dataset.rb index 8c7fe68..205f640 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -5,8 +5,8 @@ module OpenTox class Dataset - field :substance_ids, type: Array, default: [] - field :feature_ids, type: Array, default: [] + #field :substance_ids, type: Array, default: [] + #field :feature_ids, type: Array, default: [] field :data_entries, type: Hash, default: {} # Readers @@ -21,13 +21,14 @@ module OpenTox # Get all substances def substances - @substances ||= substance_ids.collect{|id| OpenTox::Substance.find id} + @substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id}.uniq @substances end # Get all features def features - @features ||= feature_ids.collect{|id| OpenTox::Feature.find(id)} + #@features ||= feature_ids.collect{|id| OpenTox::Feature.find(id)} + @features ||= data_entries.collect{|sid,data| data.keys.collect{|id| OpenTox::Feature.find(id)}}.flatten.uniq @features end @@ -58,7 +59,11 @@ module OpenTox feature = feature.id if feature.is_a? Feature data_entries[substance.to_s] ||= {} data_entries[substance.to_s][feature.to_s] ||= [] - data_entries[substance.to_s][feature.to_s] << value + if value.is_a? Array + data_entries[substance.to_s][feature.to_s] += value + else + data_entries[substance.to_s][feature.to_s] << value + end end # Dataset operations @@ -67,7 +72,7 @@ module OpenTox # @param [Integer] number of folds # @return [Array] Array with folds [training_dataset,test_dataset] def folds n - len = self.substance_ids.size + len = self.substances.size indices = (0..len-1).to_a.shuffle mid = (len/n) chunks = [] @@ -76,12 +81,14 @@ module OpenTox last = start+mid last = last-1 unless len%n >= i test_idxs = indices[start..last] || [] - test_cids = test_idxs.collect{|i| substance_ids[i]} + test_substances = test_idxs.collect{|i| substances[i]} training_idxs = indices-test_idxs - training_cids = training_idxs.collect{|i| substance_ids[i]} - chunk = [training_cids,test_cids].collect do |cids| - dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id ) - dataset.substances.each do |substance| + training_substances = training_idxs.collect{|i| substances[i]} + chunk = [training_substances,test_substances].collect do |substances| + dataset = self.class.create(:source => self.id ) + substances.each do |substance| + #dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id ) + #dataset.substances.each do |substance| substance.dataset_ids << dataset.id substance.save dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {} @@ -170,6 +177,7 @@ module OpenTox compound_format = feature_names.shift.strip bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i numeric = [] + features = [] # guess feature types feature_names.each_with_index do |f,i| metadata = {:name => f} @@ -187,7 +195,7 @@ module OpenTox numeric[i] = false feature = NominalFeature.find_or_create_by(metadata) end - feature_ids << feature.id if feature + features << feature if feature end # substances and values @@ -210,12 +218,10 @@ module OpenTox warn "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored." next end - substance_ids << substance.id - data_entries[substance.id.to_s] = {} substance.dataset_ids << self.id unless substance.dataset_ids.include? self.id substance.save - unless vals.size == feature_ids.size + unless vals.size == features.size warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored." next end @@ -229,8 +235,7 @@ module OpenTox else v = v.strip end - data_entries[substance.id.to_s][feature_ids[j].to_s] ||= [] - data_entries[substance.id.to_s][feature_ids[j].to_s] << v + add substance, features[j], v end end substances.duplicates.each do |substance| @@ -238,8 +243,6 @@ module OpenTox substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi} warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." end - substance_ids.uniq! - feature_ids.uniq! save end diff --git a/lib/import.rb b/lib/import.rb index 3c6966e..2dcc361 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -39,7 +39,6 @@ module OpenTox :source => np["compound"]["URI"], ) np["bundles"].keys.each do |bundle_uri| - #datasets[bundle_uri].substance_ids << nanoparticle.id nanoparticle["dataset_ids"] << datasets[bundle_uri].id end bundle = datasets[np["bundles"].keys.first].id if np["bundles"].size == 1 @@ -59,7 +58,7 @@ module OpenTox end else feature = klass.find_or_create_by( - :name => "#{study["protocol"]["category"]["title"]} #{study["protocol"]["endpoint"]}", + :name => effect["endpoint"], :unit => effect["result"]["unit"], :category => study["protocol"]["topcategory"], :conditions => effect["conditions"] @@ -69,11 +68,7 @@ module OpenTox end nanoparticle.save end - datasets.each do |u,d| - d.feature_ids.uniq! - d.substance_ids.uniq! - d.save - end + datasets.each { |u,d| d.save } end =begin diff --git a/lib/lazar.rb b/lib/lazar.rb index 55de511..7bd87f4 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -80,10 +80,10 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","Cross "model.rb", "classification.rb", "regression.rb", + "validation-statistics.rb", "validation.rb", "crossvalidation.rb", "leave-one-out-validation.rb", - "validation-statistics.rb", "experiment.rb", "import.rb", ].each{ |f| require_relative f } diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index 7189617..b8deae9 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -13,18 +13,18 @@ module OpenTox t = Time.now model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOutValidation : klass = RegressionLeaveOneOutValidation loo = klass.new :model_id => model.id - predictions = model.predict model.training_dataset.compounds + predictions = model.predict model.training_dataset.substances predictions.each{|cid,p| p.delete(:neighbors)} nr_unpredicted = 0 predictions.each do |cid,prediction| if prediction[:value] - tox = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s] - prediction[:measured] = tox[model.training_dataset_id.to_s] if tox + prediction[:measured] = model.training_dataset.values(cid, prediction[:prediction_feature_id]) else nr_unpredicted += 1 end predictions.delete(cid) unless prediction[:value] and prediction[:measured] end + predictions.select!{|cid,p| p[:value] and p[:measured]} loo.nr_instances = predictions.size loo.nr_unpredicted = nr_unpredicted loo.predictions = predictions @@ -86,6 +86,7 @@ module OpenTox class RegressionLeaveOneOutValidation < LeaveOneOutValidation + include Plot field :rmse, type: Float, default: 0 field :mae, type: Float, default: 0 @@ -100,29 +101,7 @@ module OpenTox def correlation_plot unless correlation_plot_id - tmpfile = "/tmp/#{id.to_s}_correlation.svg" - predicted_values = [] - measured_values = [] - predictions.each do |pred| - pred[:database_activities].each do |activity| - if pred[:value] - predicted_values << pred[:value] - measured_values << activity - end - end - end - attributes = Model::Lazar.find(self.model_id).attributes - attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key} - attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n") - R.assign "measurement", measured_values - R.assign "prediction", predicted_values - R.eval "all = c(-log(measurement),-log(prediction))" - R.eval "range = c(min(all), max(all))" - R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)" - R.eval "image = image + geom_abline(intercept=0, slope=1)" - R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg") - plot_id = $gridfs.insert_one(file) + #plot_id = correlation_plot update(:correlation_plot_id => plot_id) end $gridfs.find_one(_id: correlation_plot_id).data diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 6527fa3..7890a19 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -11,19 +11,14 @@ module OpenTox def nanoparticle_neighbors min_sim: 0.1, type:, dataset_id:, prediction_feature_id: dataset = Dataset.find(dataset_id) neighbors = [] - p dataset.data_entries.size - p dataset.substance_ids.size - p dataset.substance_ids.collect{|i| i.to_s} == dataset.data_entries.keys - p dataset.substance_ids.collect{|i| i.to_s} - p dataset.data_entries.keys dataset.nanoparticles.each do |np| - prediction_feature_id - p dataset.data_entries[np.id.to_s] values = dataset.values(np,prediction_feature_id) - p values if values common_descriptors = physchem_descriptors.keys & np.physchem_descriptors.keys - sim = Algorithm::Similarity.cosine(common_descriptors.collect{|d| physchem_descriptors[d]}, common_descriptors.collect{|d| np.physchem_descriptors[d]}) + common_descriptors.select!{|id| NumericFeature.find(id) } + query_descriptors = common_descriptors.collect{|d| physchem_descriptors[d].first} + neighbor_descriptors = common_descriptors.collect{|d| np.physchem_descriptors[d].first} + sim = Algorithm::Similarity.cosine(query_descriptors,neighbor_descriptors) neighbors << {"_id" => np.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim end end @@ -44,12 +39,7 @@ module OpenTox proteomics[feature.id.to_s].uniq! when "TOX" # TODO generic way of parsing TOX values - p dataset.name - p self.name - p feature.name - p feature.unit - p value - if feature.name == "7.99 Toxicity (other) ICP-AES" and feature.unit == "mL/ug(Mg)" + if feature.name == "Net cell association" and feature.unit == "mL/ug(Mg)" dataset.add self, feature, -Math.log10(value) else dataset.add self, feature, value @@ -70,32 +60,32 @@ module OpenTox add_feature feature, v["loValue"], dataset elsif v.keys.size == 2 and v["errorValue"] add_feature feature, v["loValue"], dataset - #warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." + warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." elsif v.keys.size == 2 and v["loQualifier"] == "mean" add_feature feature, v["loValue"], dataset - #warn "'#{feature.name}' is a mean value. Original data is not available." + warn "'#{feature.name}' is a mean value. Original data is not available." elsif v.keys.size == 2 and v["loQualifier"] #== ">=" - #warn "Only min value available for '#{feature.name}', entry ignored" + warn "Only min value available for '#{feature.name}', entry ignored" elsif v.keys.size == 2 and v["upQualifier"] #== ">=" - #warn "Only max value available for '#{feature.name}', entry ignored" + warn "Only max value available for '#{feature.name}', entry ignored" elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil? add_feature feature, v["loValue"], dataset - #warn "loQualifier and upQualifier are empty." + warn "loQualifier and upQualifier are empty." elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"] == "" and v["upQualifier"] == "" add_feature feature, v["loValue"], dataset - #warn "loQualifier and upQualifier are empty." + warn "loQualifier and upQualifier are empty." elsif v.keys.size == 4 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil? add_feature feature, v["loValue"], dataset - #warn "loQualifier and upQualifier are empty." + warn "loQualifier and upQualifier are empty." elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] and v["loValue"] and v["upValue"] add_feature feature, [v["loValue"],v["upValue"]].mean, dataset - #warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available." + warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available." elsif v.size == 4 and v["loQualifier"] == "mean" and v["errorValue"] - #warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." + warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." add_feature feature, v["loValue"], dataset elsif v == {} # do nothing else - #warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'." + warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'." end end diff --git a/lib/similarity.rb b/lib/similarity.rb new file mode 100644 index 0000000..f25d4c3 --- /dev/null +++ b/lib/similarity.rb @@ -0,0 +1,46 @@ +module OpenTox + module Algorithm + + class Vector + def self.dot_product(a, b) + products = a.zip(b).map{|a, b| a * b} + products.inject(0) {|s,p| s + p} + end + + def self.magnitude(point) + squares = point.map{|x| x ** 2} + Math.sqrt(squares.inject(0) {|s, c| s + c}) + end + end + + class Similarity + + def self.tanimoto a, b + ( a & b).size/(a|b).size.to_f + end + + def self.euclid a, b + sq = a.zip(b).map{|a,b| (a - b) ** 2} + Math.sqrt(sq.inject(0) {|s,c| s + c}) + end + + # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity + def self.cosine a, b + Algorithm::Vector.dot_product(a, b) / (Algorithm::Vector.magnitude(a) * Algorithm::Vector.magnitude(b)) + end + + def self.weighted_cosine(a, b, w) + dot_product = 0 + magnitude_a = 0 + magnitude_b = 0 + (0..a.size-1).each do |i| + dot_product += w[i].abs*a[i]*b[i] + magnitude_a += w[i].abs*a[i]**2 + magnitude_b += w[i].abs*b[i]**2 + end + dot_product/Math.sqrt(magnitude_a*magnitude_b) + end + + end + end +end diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 0079bae..2d6b56e 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -96,5 +96,29 @@ module OpenTox :finished_at => Time.now } end + + end + + module Plot + + def plot_id + tmpfile = "/tmp/#{id.to_s}_correlation.png" + x = [] + y = [] + predictions.each do |sid,p| + x << p["value"] + y << p["measured"].median + end + R.assign "measurement", x + R.assign "prediction", y + R.eval "all = c(measurement,prediction)" + R.eval "range = c(min(all), max(all))" + R.eval "image = qplot(prediction,measurement,main='',asp=1,xlim=range, ylim=range)" + R.eval "image = image + geom_abline(intercept=0, slope=1)" + R.eval "ggsave(file='#{tmpfile}', plot=image)" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png") + plot_id = $gridfs.insert_one(file) + plot_id + end end end diff --git a/lib/validation.rb b/lib/validation.rb index 015e718..9122df1 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -32,20 +32,14 @@ module OpenTox predictions = validation_model.predict test_set.substances predictions.each{|cid,p| p.delete(:neighbors)} nr_unpredicted = 0 - p predictions.size predictions.each do |cid,prediction| - p prediction if prediction[:value] - tox = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s] - p tox - #prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s][test_set.id.to_s] - prediction[:measured] = tox[test_set.id.to_s] if tox + prediction[:measured] = test_set.values(cid, prediction[:prediction_feature_id]) else nr_unpredicted += 1 end - predictions.delete(cid) unless prediction[:value] and prediction[:measured] end - p predictions.size + predictions.select!{|cid,p| p[:value] and p[:measured]} validation = self.new( :model_id => validation_model.id, :test_dataset_id => test_set.id, -- cgit v1.2.3 From b2d80ad2e470fcb41af4b747142e5693f2fa4615 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 24 May 2016 13:05:53 +0200 Subject: dataset tests fixed --- lib/dataset.rb | 43 +++++++++++++------------------------------ lib/validation-statistics.rb | 1 + 2 files changed, 14 insertions(+), 30 deletions(-) (limited to 'lib') diff --git a/lib/dataset.rb b/lib/dataset.rb index 205f640..38a55a8 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -5,8 +5,6 @@ module OpenTox class Dataset - #field :substance_ids, type: Array, default: [] - #field :feature_ids, type: Array, default: [] field :data_entries, type: Hash, default: {} # Readers @@ -27,7 +25,6 @@ module OpenTox # Get all features def features - #@features ||= feature_ids.collect{|id| OpenTox::Feature.find(id)} @features ||= data_entries.collect{|sid,data| data.keys.collect{|id| OpenTox::Feature.find(id)}}.flatten.uniq @features end @@ -44,16 +41,6 @@ module OpenTox # Writers - # Set compounds - def compounds=(compounds) - self.substance_ids = compounds.collect{|c| c.id}.uniq - end - - # Set features - def features=(features) - self.feature_ids = features.collect{|f| f.id} - end - def add(substance,feature,value) substance = substance.id if substance.is_a? Substance feature = feature.id if feature.is_a? Feature @@ -87,8 +74,6 @@ module OpenTox chunk = [training_substances,test_substances].collect do |substances| dataset = self.class.create(:source => self.id ) substances.each do |substance| - #dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id ) - #dataset.substances.each do |substance| substance.dataset_ids << dataset.id substance.save dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {} @@ -108,7 +93,7 @@ module OpenTox # @return [String] def to_csv(inchi=false) CSV.generate() do |csv| - compound = Substance.find(substance_ids.first).is_a? Compound + compound = substances.first.is_a? Compound if compound csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} else @@ -128,11 +113,7 @@ module OpenTox (0..nr_measurements.first-1).each do |i| row = [name] features.each do |f| - if data_entries[substance.id.to_s] and data_entries[substance.id.to_s][f.id.to_s] - row << data_entries[substance.id.to_s][f.id.to_s] - else - row << "" - end + values(substance,f) ? row << values(substance,f)[i] : row << "" end csv << row end @@ -152,8 +133,8 @@ module OpenTox # Create a dataset from CSV file # TODO: document structure - def self.from_csv_file file, source=nil - source ||= file + def self.from_csv_file file, accept_empty_values=false + source = file name = File.basename(file,".*") dataset = self.find_by(:source => source, :name => name) if dataset @@ -162,14 +143,14 @@ module OpenTox $logger.debug "Parsing #{file}." table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8' dataset = self.new(:source => source, :name => name) - dataset.parse_table table + dataset.parse_table table, accept_empty_values end dataset end # parse data in tabular format (e.g. from csv) # does a lot of guesswork in order to determine feature types - def parse_table table + def parse_table table, accept_empty_values # features feature_names = table.shift.collect{|f| f.strip} @@ -200,24 +181,25 @@ module OpenTox # substances and values + all_substances = [] table.each_with_index do |vals,i| identifier = vals.shift.strip - warn "No feature values for compound at position #{i+2}." if vals.compact.empty? + warn "No feature values for compound at line #{i+2} of #{source}." if vals.compact.empty? and !accept_empty_values begin case compound_format when /SMILES/i substance = OpenTox::Compound.from_smiles(identifier) when /InChI/i substance = OpenTox::Compound.from_inchi(identifier) - # TODO nanoparticle end rescue substance = nil end if substance.nil? # compound parsers may return nil - warn "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored." + warn "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored." next end + all_substances << substance substance.dataset_ids << self.id unless substance.dataset_ids.include? self.id substance.save @@ -237,10 +219,11 @@ module OpenTox end add substance, features[j], v end + data_entries[substance.id.to_s] = {} if vals.empty? and accept_empty_values end - substances.duplicates.each do |substance| + all_substances.duplicates.each do |substance| positions = [] - substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi} + all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi} warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." end save diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 2d6b56e..3c52b15 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -68,6 +68,7 @@ module OpenTox x = [] y = [] predictions.each do |cid,pred| + p pred if pred[:value] and pred[:measured] x << pred[:measured].median y << pred[:value] -- cgit v1.2.3 From cc08e6beda7f7d70ebf6c6929a22d1a0cd7c1a20 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 24 May 2016 15:41:24 +0200 Subject: tests fixed. DescriptorTest#test_compound_all may fail within all.rb --- lib/dataset.rb | 5 +++++ lib/model.rb | 9 +++++---- lib/validation-statistics.rb | 9 +++++---- 3 files changed, 15 insertions(+), 8 deletions(-) (limited to 'lib') diff --git a/lib/dataset.rb b/lib/dataset.rb index 38a55a8..9138452 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -229,6 +229,11 @@ module OpenTox save end + def delete + compounds.each{|c| c.dataset_ids.delete id.to_s} + super + end + end # Dataset for lazar predictions diff --git a/lib/model.rb b/lib/model.rb index 8baed41..3a178a1 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -69,6 +69,7 @@ module OpenTox end def predict_substance substance + neighbor_algorithm_parameters = Hash[self.neighbor_algorithm_parameters.map{ |k, v| [k.to_sym, v] }] # convert string keys to symbols neighbors = substance.send(neighbor_algorithm, neighbor_algorithm_parameters) database_activities = nil prediction = {} @@ -82,22 +83,22 @@ module OpenTox neighbors.delete_if{|n| n["_id"] == substance.id} # remove query substance for an unbiased prediction (also useful for loo validation) end if neighbors.empty? - prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) + prediction.merge!({:value => nil,:probabilities => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) elsif neighbors.size == 1 value = nil tox = neighbors.first["toxicities"] if tox.size == 1 # single measurement - value = tox + value = tox.first else # multiple measurement if tox.collect{|t| t.numeric?}.uniq == [true] # numeric value = tox.median elsif tox.uniq.size == 1 # single value value = tox.first else # contradictory results - # TODO add majority vote + # TODO add majority vote?? end end - prediction.merge!({:value => value, :confidence => nil, :warning => "Only one similar compound in the training set. Predicting median of its experimental values."}) if value + prediction.merge!({:value => value, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting median of its experimental values.", :neighbors => neighbors}) if value else # call prediction algorithm klass,method = prediction_algorithm.split('.') diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 3c52b15..156353a 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -8,10 +8,11 @@ module OpenTox predictivity = {} nr_instances = 0 predictions.each do |cid,pred| - # TODO use measured majority class - if pred[:measured].uniq.size == 1 + # TODO + # use predictions without probabilities (single neighbor)?? + # use measured majority class?? + if pred[:measured].uniq.size == 1 and pred[:probabilities] m = pred[:measured].first - #pred[:measured].each do |m| if pred[:value] == m if pred[:value] == accept_values[0] confusion_matrix[0][0] += 1 @@ -63,12 +64,12 @@ module OpenTox end def self.regression predictions + # TODO: predictions within prediction_interval rmse = 0 mae = 0 x = [] y = [] predictions.each do |cid,pred| - p pred if pred[:value] and pred[:measured] x << pred[:measured].median y << pred[:value] -- cgit v1.2.3 From f46ba3b7262f5b551c81fc9396c5b7f0cac7f030 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 27 May 2016 19:16:16 +0200 Subject: first correlation of nanoparticle predictions --- lib/compound.rb | 30 ----------- lib/crossvalidation.rb | 2 +- lib/dataset.rb | 11 ++-- lib/import.rb | 57 +++++++++++++++------ lib/leave-one-out-validation.rb | 3 +- lib/model.rb | 16 +++--- lib/nanoparticle.rb | 110 +++++++++++++++++++++++++++++++--------- lib/regression.rb | 2 - lib/similarity.rb | 2 +- lib/validation-statistics.rb | 13 ++--- 10 files changed, 149 insertions(+), 97 deletions(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index 89e9db2..a87678e 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -254,36 +254,6 @@ module OpenTox self["chemblid"] end -=begin - def fingerprint_neighbors(type:, min_sim: 0.1, dataset_id:, prediction_feature_id:) - neighbors = [] - dataset = Dataset.find(dataset_id) - query_fingerprint = self.fingerprint type - dataset.compounds.each do |compound| - values = dataset.values(compound,prediction_feature_id) - if values - candidate_fingerprint = compound.fingerprint type -# features = (query_fingerprint + candidate_fingerprint).uniq -# min_sum = 0 -# max_sum = 0 -# features.each do |f| -# min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax -# min_sum += min -# max_sum += max -# end -# max_sum == 0 ? sim = 0 : sim = min_sum/max_sum.to_f -# neighbors << [compound.id, sim] if sim and sim >= params[:min_sim] -# end -# end -# neighbors.sort{|a,b| b.last <=> a.last} - sim = Algorithm::Similarity.tanimoto(query_fingerprint , candidate_fingerprint) - neighbors << {"_id" => compound.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim - end - end - neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]} - end -=end - def fingerprint_neighbors(type:, min_sim: 0.1, dataset_id:, prediction_feature_id:) neighbors = [] dataset = Dataset.find(dataset_id) diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 357f0fa..420dd8c 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -168,7 +168,7 @@ module OpenTox def correlation_plot unless correlation_plot_id - plot_id = ValidationStatistics.correlation_plot predictions + plot_id = ValidationStatistics.correlation_plot id, predictions update(:correlation_plot_id => plot_id) end $gridfs.find_one(_id: correlation_plot_id).data diff --git a/lib/dataset.rb b/lib/dataset.rb index 9138452..0c65d61 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -46,11 +46,8 @@ module OpenTox feature = feature.id if feature.is_a? Feature data_entries[substance.to_s] ||= {} data_entries[substance.to_s][feature.to_s] ||= [] - if value.is_a? Array - data_entries[substance.to_s][feature.to_s] += value - else - data_entries[substance.to_s][feature.to_s] << value - end + data_entries[substance.to_s][feature.to_s] << value + #data_entries[substance.to_s][feature.to_s].uniq! if value.numeric? # assuming that identical values come from the same source end # Dataset operations @@ -75,6 +72,7 @@ module OpenTox dataset = self.class.create(:source => self.id ) substances.each do |substance| substance.dataset_ids << dataset.id + substance.dataset_ids.uniq! substance.save dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {} end @@ -200,7 +198,8 @@ module OpenTox next end all_substances << substance - substance.dataset_ids << self.id unless substance.dataset_ids.include? self.id + substance.dataset_ids << self.id + substance.dataset_ids.uniq! substance.save unless vals.size == features.size diff --git a/lib/import.rb b/lib/import.rb index 2dcc361..80d4579 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -10,15 +10,15 @@ module OpenTox bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] File.open(File.join(dir,"bundles.json"),"w+"){|f| f.puts JSON.pretty_generate(bundles)} bundles.each do |bundle| - p bundle["title"] + $logger.debug bundle["title"] nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"] - p nanoparticles.size + $logger.debug nanoparticles.size nanoparticles.each do |nanoparticle| uuid = nanoparticle["values"]["https://data.enanomapper.net/identifier/uuid"] $logger.debug uuid File.open(File.join(dir,"nanoparticle-#{uuid}.json"),"w+"){|f| f.puts JSON.pretty_generate(nanoparticle)} studies = JSON.parse(RestClientWrapper.get(File.join(nanoparticle["compound"]["URI"],"study")))["study"] - p uuid if studies.size < 1 + $logger.debug uuid if studies.size < 1 studies.each do |study| File.open(File.join(dir,"study-#{study["uuid"]}.json"),"w+"){|f| f.puts JSON.pretty_generate(study)} end @@ -27,35 +27,58 @@ module OpenTox end def self.import dir="." + start_time = Time.now + t1 = 0 + t2 = 0 datasets = {} JSON.parse(File.read(File.join(dir,"bundles.json"))).each do |bundle| datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"]) end Dir[File.join(dir,"study*.json")].each do |s| + t = Time.now study = JSON.parse(File.read(s)) np = JSON.parse(File.read(File.join(dir,"nanoparticle-#{study['owner']['substance']['uuid']}.json"))) + core = {} + coating = [] + np["composition"].each do |c| + if c["relation"] == "HAS_CORE" + core = { + :uri => c["component"]["compound"]["URI"], + :name => c["component"]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] + } + elsif c["relation"] == "HAS_COATING" + coating << { + :uri => c["component"]["compound"]["URI"], + :name => c["component"]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] + } + end + end if np["composition"] nanoparticle = Nanoparticle.find_or_create_by( :name => np["values"]["https://data.enanomapper.net/identifier/name"], :source => np["compound"]["URI"], + :core => core, + :coating => coating ) np["bundles"].keys.each do |bundle_uri| - nanoparticle["dataset_ids"] << datasets[bundle_uri].id + nanoparticle.dataset_ids << datasets[bundle_uri].id end - bundle = datasets[np["bundles"].keys.first].id if np["bundles"].size == 1 + dataset = datasets[np["bundles"].keys.first] + proteomics_features = {} study["effects"].each do |effect| effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature - # TODO parse core/coating - #$logger.debug File.join(np["compound"]["URI"],"study") effect["conditions"].delete_if { |k, v| v.nil? } - # parse proteomics data - if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 + if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data +=begin JSON.parse(effect["result"]["textValue"]).each do |identifier, value| - feature = klass.find_or_create_by( - :name => identifier, - :category => "Proteomics", - ) - nanoparticle.parse_ambit_value feature, value, bundle + # time critical step + t = Time.now + proteomics_features[identifier] ||= klass.find_or_create_by(:name => identifier, :category => "Proteomics") + t1 += Time.now - t + t = Time.now + nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset + t2 += Time.now - t end +=end else feature = klass.find_or_create_by( :name => effect["endpoint"], @@ -63,10 +86,14 @@ module OpenTox :category => study["protocol"]["topcategory"], :conditions => effect["conditions"] ) - nanoparticle.parse_ambit_value feature, effect["result"], bundle + nanoparticle.parse_ambit_value feature, effect["result"], dataset end end nanoparticle.save + #p "Total time: #{Time.now - start_time}" + #p "Proteomics features: #{t1}" + #p "Proteomics values: #{t2}" + #p "Time2: #{t2}" end datasets.each { |u,d| d.save } end diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index b8deae9..9698e05 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -86,7 +86,6 @@ module OpenTox class RegressionLeaveOneOutValidation < LeaveOneOutValidation - include Plot field :rmse, type: Float, default: 0 field :mae, type: Float, default: 0 @@ -101,7 +100,7 @@ module OpenTox def correlation_plot unless correlation_plot_id - #plot_id = correlation_plot + plot_id = ValidationStatistics.correlation_plot id, predictions update(:correlation_plot_id => plot_id) end $gridfs.find_one(_id: correlation_plot_id).data diff --git a/lib/model.rb b/lib/model.rb index 3a178a1..18d621b 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -32,12 +32,13 @@ module OpenTox self.neighbor_algorithm_parameters ||= {} self.neighbor_algorithm_parameters[:dataset_id] = training_dataset.id - Algorithm.run(feature_selection_algorithm, self) if feature_selection_algorithm + #send(feature_selection_algorithm.to_sym) if feature_selection_algorithm save self end def correlation_filter + self.relevant_features = {} toxicities = [] substances = [] training_dataset.substances.each do |s| @@ -49,23 +50,22 @@ module OpenTox R.assign "tox", toxicities feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq feature_ids.each do |feature_id| - feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id]} + feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]} R.assign "feature", feature_values begin - #R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')" - R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='complete')" + R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')" pvalue = R.eval("cor$p.value").to_ruby if pvalue <= 0.05 r = R.eval("cor$estimate").to_ruby - relevant_features[feature] = {} - relevant_features[feature]["pvalue"] = pvalue - relevant_features[feature]["r"] = r + self.relevant_features[feature_id] = {} + self.relevant_features[feature_id]["pvalue"] = pvalue + self.relevant_features[feature_id]["r"] = r end rescue warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{toxicities}) failed." end end - relevant_features.sort!{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h + self.relevant_features = self.relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h end def predict_substance substance diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 7890a19..5c6d944 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -3,12 +3,11 @@ module OpenTox class Nanoparticle < Substance include OpenTox - field :core, type: String + field :core, type: Hash, default: {} field :coating, type: Array, default: [] - field :bundles, type: Array, default: [] field :proteomics, type: Hash, default: {} - def nanoparticle_neighbors min_sim: 0.1, type:, dataset_id:, prediction_feature_id: + def nanoparticle_neighbors_old min_sim: 0.9, type:, dataset_id:, prediction_feature_id: dataset = Dataset.find(dataset_id) neighbors = [] dataset.nanoparticles.each do |np| @@ -25,33 +24,96 @@ module OpenTox neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} neighbors end - - def add_feature feature, value, dataset_id + + def nanoparticle_neighbors min_sim: 0.9, type:, dataset_id:, prediction_feature_id: + p self.name + #p self.physchem_descriptors.keys.size dataset = Dataset.find(dataset_id) - case feature.category - when "P-CHEM" - physchem_descriptors[feature.id.to_s] ||= [] - physchem_descriptors[feature.id.to_s] << value - physchem_descriptors[feature.id.to_s].uniq! - when "Proteomics" - proteomics[feature.id.to_s] ||= [] - proteomics[feature.id.to_s] << value - proteomics[feature.id.to_s].uniq! - when "TOX" - # TODO generic way of parsing TOX values - if feature.name == "Net cell association" and feature.unit == "mL/ug(Mg)" - dataset.add self, feature, -Math.log10(value) + relevant_features = {} + toxicities = [] + substances = [] + # TODO: exclude query activities!!! + dataset.substances.each do |s| + dataset.values(s,prediction_feature_id).each do |act| + toxicities << act + substances << s + end + end + R.assign "tox", toxicities + feature_ids = physchem_descriptors.keys.select{|fid| Feature.find(fid).is_a? NumericFeature} + # identify relevant features + feature_ids.each do |feature_id| + feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]} + R.assign "feature", feature_values + begin + R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')" + pvalue = R.eval("cor$p.value").to_ruby + if pvalue <= 0.05 + r = R.eval("cor$estimate").to_ruby + relevant_features[feature_id] = {} + relevant_features[feature_id]["pvalue"] = pvalue + relevant_features[feature_id]["r"] = r + relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby + relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby + end + rescue + warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{toxicities}) failed." + end + end + neighbors = [] + substances.each do |substance| + values = dataset.values(substance,prediction_feature_id) + if values + common_descriptors = relevant_features.keys & substance.physchem_descriptors.keys + # scale values + query_descriptors = common_descriptors.collect{|d| (physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} + neighbor_descriptors = common_descriptors.collect{|d| (substance.physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} + #weights = common_descriptors.collect{|d| 1-relevant_features[d]["pvalue"]} + weights = common_descriptors.collect{|d| relevant_features[d]["r"]**2} + #p weights + sim = Algorithm::Similarity.weighted_cosine(query_descriptors,neighbor_descriptors,weights) + ##p "SIM" + #p [sim, Algorithm::Similarity.cosine(query_descriptors,neighbor_descriptors)] + neighbors << {"_id" => substance.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim + end + end + p neighbors.size + neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} + neighbors + end + + def add_feature feature, value, dataset + unless feature.name == "ATOMIC COMPOSITION" or feature.name == "FUNCTIONAL GROUP" # redundand + case feature.category + when "P-CHEM" + physchem_descriptors[feature.id.to_s] ||= [] + physchem_descriptors[feature.id.to_s] << value + physchem_descriptors[feature.id.to_s].uniq! + when "Proteomics" + proteomics[feature.id.to_s] ||= [] + proteomics[feature.id.to_s] << value + proteomics[feature.id.to_s].uniq! + when "TOX" + # TODO generic way of parsing TOX values + if feature.name == "Net cell association" and feature.unit == "mL/ug(Mg)" + dataset.add self, feature, Math.log2(value) + elsif feature.name == "Total protein (BCA assay)" + physchem_descriptors[feature.id.to_s] ||= [] + physchem_descriptors[feature.id.to_s] << value + physchem_descriptors[feature.id.to_s].uniq! + else + dataset.add self, feature, value + end + dataset.save + dataset_ids << dataset.id + dataset_ids.uniq! else - dataset.add self, feature, value + warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted." end - dataset.save - else - warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted." end end - def parse_ambit_value feature, v, dataset_id - dataset = Dataset.find(dataset_id) + def parse_ambit_value feature, v, dataset v.delete "unit" # TODO: ppm instead of weights if v.keys == ["textValue"] diff --git a/lib/regression.rb b/lib/regression.rb index 9d305a6..6487557 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -71,7 +71,6 @@ module OpenTox #def self.local_physchem_regression(substance:, neighbors:, feature_id:, dataset_id:, method: 'pls')#, method_params="ncomp = 4" def self.local_physchem_regression substance, neighbors, method='pls' #, method_params="ncomp = 4" - #dataset = Dataset.find dataset_id activities = [] weights = [] pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq @@ -83,7 +82,6 @@ module OpenTox activities = neighbor["toxicities"] activities.each do |act| data_frame[0][i] = act - # TODO: update with cosine similarity for physchem weights << n["similarity"] neighbor.physchem_descriptors.each do |pid,values| values = [values] unless values.is_a? Array diff --git a/lib/similarity.rb b/lib/similarity.rb index f25d4c3..00179c1 100644 --- a/lib/similarity.rb +++ b/lib/similarity.rb @@ -38,7 +38,7 @@ module OpenTox magnitude_a += w[i].abs*a[i]**2 magnitude_b += w[i].abs*b[i]**2 end - dot_product/Math.sqrt(magnitude_a*magnitude_b) + dot_product/(Math.sqrt(magnitude_a)*Math.sqrt(magnitude_b)) end end diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 156353a..e61543b 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -83,7 +83,7 @@ module OpenTox end R.assign "measurement", x R.assign "prediction", y - R.eval "r <- cor(measurement,prediction,use='complete')" + R.eval "r <- cor(measurement,prediction,use='pairwise')" r = R.eval("r").to_ruby mae = mae/predictions.size @@ -99,11 +99,7 @@ module OpenTox } end - end - - module Plot - - def plot_id + def self.correlation_plot id, predictions tmpfile = "/tmp/#{id.to_s}_correlation.png" x = [] y = [] @@ -115,10 +111,11 @@ module OpenTox R.assign "prediction", y R.eval "all = c(measurement,prediction)" R.eval "range = c(min(all), max(all))" - R.eval "image = qplot(prediction,measurement,main='',asp=1,xlim=range, ylim=range)" + # TODO units + R.eval "image = qplot(prediction,measurement,main='',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" R.eval "image = image + geom_abline(intercept=0, slope=1)" R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png") + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.png") plot_id = $gridfs.insert_one(file) plot_id end -- cgit v1.2.3 From b515a0cfedb887a2af753db6e4a08ae1af430cad Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 31 May 2016 18:08:08 +0200 Subject: cleanup of validation modules/classes --- lib/classification.rb | 2 +- lib/compound.rb | 6 +- lib/crossvalidation.rb | 251 +++++++++++----------------------- lib/dataset.rb | 2 +- lib/lazar.rb | 7 +- lib/leave-one-out-validation.rb | 141 ++++++------------- lib/model.rb | 26 ++-- lib/nanoparticle.rb | 80 +++++------ lib/regression.rb | 6 +- lib/train-test-validation.rb | 58 ++++++++ lib/validation-statistics.rb | 292 +++++++++++++++++++++++++--------------- lib/validation.rb | 72 +++------- 12 files changed, 441 insertions(+), 502 deletions(-) create mode 100644 lib/train-test-validation.rb (limited to 'lib') diff --git a/lib/classification.rb b/lib/classification.rb index 48ff8b3..0f3c6d9 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -7,7 +7,7 @@ module OpenTox sims = {} neighbors.each do |neighbor| sim = neighbor["similarity"] - activities = neighbor["toxicities"] + activities = neighbor["measurements"] activities.each do |act| sims[act] ||= [] sims[act] << sim diff --git a/lib/compound.rb b/lib/compound.rb index a87678e..4541816 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -260,7 +260,7 @@ module OpenTox if type == DEFAULT_FINGERPRINT neighbors = db_neighbors(min_sim: min_sim, dataset_id: dataset_id) neighbors.each do |n| - n["toxicities"] = dataset.values(n["_id"],prediction_feature_id) + n["measurements"] = dataset.values(n["_id"],prediction_feature_id) end else query_fingerprint = self.fingerprint type @@ -269,7 +269,7 @@ module OpenTox if values candidate_fingerprint = compound.fingerprint type sim = Algorithm::Similarity.tanimoto(query_fingerprint , candidate_fingerprint) - neighbors << {"_id" => compound.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim + neighbors << {"_id" => compound.id, "measurements" => values, "similarity" => sim} if sim >= min_sim end end end @@ -310,7 +310,7 @@ module OpenTox 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]} }}, '_id' => 1, - #'toxicities' => 1, + #'measurements' => 1, 'dataset_ids' => 1 }}, {'$match' => {'similarity' => {'$gte' => min_sim}}}, diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 420dd8c..22071d8 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -1,193 +1,96 @@ module OpenTox - class CrossValidation - field :validation_ids, type: Array, default: [] - field :model_id, type: BSON::ObjectId - field :folds, type: Integer - field :nr_instances, type: Integer - field :nr_unpredicted, type: Integer - field :predictions, type: Hash, default: {} - field :finished_at, type: Time - - def time - finished_at - created_at - end - - def validations - validation_ids.collect{|vid| Validation.find vid} - end - - def model - Model::Lazar.find model_id - end - - def self.create model, n=10 - klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification - klass = RegressionCrossValidation if model.is_a? Model::LazarRegression - bad_request_error "Unknown model class #{model.class}." unless klass - - cv = klass.new( - name: model.name, - model_id: model.id, - folds: n - ) - cv.save # set created_at - nr_instances = 0 - nr_unpredicted = 0 - predictions = {} - training_dataset = Dataset.find model.training_dataset_id - training_dataset.folds(n).each_with_index do |fold,fold_nr| - #fork do # parallel execution of validations can lead to Rserve and memory problems - $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" - t = Time.now - validation = Validation.create(model, fold[0], fold[1],cv) - #p validation - $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" - #end - end - #Process.waitall - cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id) - cv.validations.each do |validation| - nr_instances += validation.nr_instances - nr_unpredicted += validation.nr_unpredicted - predictions.merge! validation.predictions + module Validation + class CrossValidation < Validation + field :validation_ids, type: Array, default: [] + field :model_id, type: BSON::ObjectId + field :folds, type: Integer, default: 10 + field :nr_instances, type: Integer, default: 0 + field :nr_unpredicted, type: Integer, default: 0 + field :predictions, type: Hash, default: {} + + def time + finished_at - created_at end - cv.update_attributes( - nr_instances: nr_instances, - nr_unpredicted: nr_unpredicted, - predictions: predictions - ) - $logger.debug "Nr unpredicted: #{nr_unpredicted}" - cv.statistics - cv - end - end - class ClassificationCrossValidation < CrossValidation - - field :accept_values, type: Array - field :confusion_matrix, type: Array - field :weighted_confusion_matrix, type: Array - field :accuracy, type: Float - field :weighted_accuracy, type: Float - field :true_rate, type: Hash - field :predictivity, type: Hash - field :confidence_plot_id, type: BSON::ObjectId - # TODO auc, f-measure (usability??) - - def statistics - stat = ValidationStatistics.classification(predictions, Feature.find(model.prediction_feature_id).accept_values) - update_attributes(stat) - stat - end + def validations + validation_ids.collect{|vid| TrainTest.find vid} + end - def confidence_plot - unless confidence_plot_id - tmpfile = "/tmp/#{id.to_s}_confidence.png" - accuracies = [] - confidences = [] - correct_predictions = 0 - incorrect_predictions = 0 - predictions.each do |p| - if p[1] and p[2] - p[1] == p[2] ? correct_predictions += 1 : incorrect_predictions += 1 - accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f - confidences << p[3] + def model + Model::Lazar.find model_id + end - end + def self.create model, n=10 + klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification + klass = RegressionCrossValidation if model.is_a? Model::LazarRegression + bad_request_error "Unknown model class #{model.class}." unless klass + + cv = klass.new( + name: model.name, + model_id: model.id, + folds: n + ) + cv.save # set created_at + nr_instances = 0 + nr_unpredicted = 0 + predictions = {} + training_dataset = Dataset.find model.training_dataset_id + training_dataset.folds(n).each_with_index do |fold,fold_nr| + #fork do # parallel execution of validations can lead to Rserve and memory problems + $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" + t = Time.now + validation = TrainTest.create(model, fold[0], fold[1]) + cv.validation_ids << validation.id + cv.nr_instances += validation.nr_instances + cv.nr_unpredicted += validation.nr_unpredicted + cv.predictions.merge! validation.predictions + $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" + #end end - R.assign "accuracy", accuracies - R.assign "confidence", confidences - R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()" - R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png") - plot_id = $gridfs.insert_one(file) - update(:confidence_plot_id => plot_id) + #Process.waitall + cv.save + $logger.debug "Nr unpredicted: #{nr_unpredicted}" + cv.statistics + cv.update_attributes(finished_at: Time.now) + cv end - $gridfs.find_one(_id: confidence_plot_id).data - end - - #Average area under roc 0.646 - #Area under roc 0.646 - #F measure carcinogen: 0.769, noncarcinogen: 0.348 - end - - class RegressionCrossValidation < CrossValidation - - field :rmse, type: Float - field :mae, type: Float - field :r_squared, type: Float - field :correlation_plot_id, type: BSON::ObjectId - - def statistics - stat = ValidationStatistics.regression predictions - update_attributes(stat) - stat end - def misclassifications n=nil - n ||= 10 - model = Model::Lazar.find(self.model_id) - training_dataset = Dataset.find(model.training_dataset_id) - prediction_feature = training_dataset.features.first - predictions.collect do |p| - unless p.include? nil - compound = Compound.find(p[0]) - neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters) - neighbors.collect! do |n| - neighbor = Compound.find(n[0]) - { :smiles => neighbor.smiles, :similarity => n[1], :measurements => neighbor.toxicities[prediction_feature.id.to_s][training_dataset.id.to_s]} - end - { - :smiles => compound.smiles, - :measured => p[1], - :predicted => p[2], - :error => (p[1]-p[2]).abs, - :relative_error => (p[1]-p[2]).abs/p[1], - :confidence => p[3], - :neighbors => neighbors - } - end - end.compact.sort{|a,b| b[:relative_error] <=> a[:relative_error]}[0..n-1] + class ClassificationCrossValidation < CrossValidation + include ClassificationStatistics + field :accept_values, type: Array + field :confusion_matrix, type: Array + field :weighted_confusion_matrix, type: Array + field :accuracy, type: Float + field :weighted_accuracy, type: Float + field :true_rate, type: Hash + field :predictivity, type: Hash + field :confidence_plot_id, type: BSON::ObjectId end - def confidence_plot - tmpfile = "/tmp/#{id.to_s}_confidence.png" - sorted_predictions = predictions.collect{|p| [(p[1]-p[2]).abs,p[3]] if p[1] and p[2]}.compact - R.assign "error", sorted_predictions.collect{|p| p[0]} - R.assign "confidence", sorted_predictions.collect{|p| p[1]} - # TODO fix axis names - R.eval "image = qplot(confidence,error)" - R.eval "image = image + stat_smooth(method='lm', se=FALSE)" - R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png") - plot_id = $gridfs.insert_one(file) - update(:confidence_plot_id => plot_id) - $gridfs.find_one(_id: confidence_plot_id).data + class RegressionCrossValidation < CrossValidation + include RegressionStatistics + field :rmse, type: Float + field :mae, type: Float + field :r_squared, type: Float + field :correlation_plot_id, type: BSON::ObjectId end - def correlation_plot - unless correlation_plot_id - plot_id = ValidationStatistics.correlation_plot id, predictions - update(:correlation_plot_id => plot_id) + class RepeatedCrossValidation < Validation + field :crossvalidation_ids, type: Array, default: [] + def self.create model, folds=10, repeats=3 + repeated_cross_validation = self.new + repeats.times do |n| + $logger.debug "Crossvalidation #{n+1} for #{model.name}" + repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id + end + repeated_cross_validation.save + repeated_cross_validation end - $gridfs.find_one(_id: correlation_plot_id).data - end - end - - class RepeatedCrossValidation - field :crossvalidation_ids, type: Array, default: [] - def self.create model, folds=10, repeats=3 - repeated_cross_validation = self.new - repeats.times do |n| - $logger.debug "Crossvalidation #{n+1} for #{model.name}" - repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id + def crossvalidations + crossvalidation_ids.collect{|id| CrossValidation.find(id)} end - repeated_cross_validation.save - repeated_cross_validation - end - def crossvalidations - crossvalidation_ids.collect{|id| CrossValidation.find(id)} end end diff --git a/lib/dataset.rb b/lib/dataset.rb index 0c65d61..2e21e5b 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -69,7 +69,7 @@ module OpenTox training_idxs = indices-test_idxs training_substances = training_idxs.collect{|i| substances[i]} chunk = [training_substances,test_substances].collect do |substances| - dataset = self.class.create(:source => self.id ) + dataset = self.class.create(:name => "#{self.name} (Fold #{i-1})",:source => self.id ) substances.each do |substance| substance.dataset_ids << dataset.id substance.dataset_ids.uniq! diff --git a/lib/lazar.rb b/lib/lazar.rb index 7bd87f4..1853aba 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -62,7 +62,7 @@ suppressPackageStartupMessages({ " # OpenTox classes and includes -CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules +CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules [ # be aware of the require sequence as it affects class/method overwrites "overwrite.rb", @@ -82,8 +82,9 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","Cross "regression.rb", "validation-statistics.rb", "validation.rb", - "crossvalidation.rb", + "train-test-validation.rb", "leave-one-out-validation.rb", - "experiment.rb", + "crossvalidation.rb", + #"experiment.rb", "import.rb", ].each{ |f| require_relative f } diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index 9698e05..7ff65ff 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -1,110 +1,57 @@ module OpenTox - class LeaveOneOutValidation - - field :model_id, type: BSON::ObjectId - field :nr_instances, type: Integer - field :nr_unpredicted, type: Integer - field :predictions, type: Hash - field :finished_at, type: Time - - def self.create model - $logger.debug "#{model.name}: LOO validation started" - t = Time.now - model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOutValidation : klass = RegressionLeaveOneOutValidation - loo = klass.new :model_id => model.id - predictions = model.predict model.training_dataset.substances - predictions.each{|cid,p| p.delete(:neighbors)} - nr_unpredicted = 0 - predictions.each do |cid,prediction| - if prediction[:value] - prediction[:measured] = model.training_dataset.values(cid, prediction[:prediction_feature_id]) - else - nr_unpredicted += 1 + module Validation + + class LeaveOneOut < Validation + + def self.create model + $logger.debug "#{model.name}: LOO validation started" + t = Time.now + model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOut : klass = RegressionLeaveOneOut + loo = klass.new :model_id => model.id + predictions = model.predict model.training_dataset.substances + predictions.each{|cid,p| p.delete(:neighbors)} + nr_unpredicted = 0 + predictions.each do |cid,prediction| + if prediction[:value] + prediction[:measurements] = model.training_dataset.values(cid, prediction[:prediction_feature_id]) + else + nr_unpredicted += 1 + end + predictions.delete(cid) unless prediction[:value] and prediction[:measurements] end - predictions.delete(cid) unless prediction[:value] and prediction[:measured] + predictions.select!{|cid,p| p[:value] and p[:measurements]} + loo.nr_instances = predictions.size + loo.nr_unpredicted = nr_unpredicted + loo.predictions = predictions + loo.statistics + $logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds" + loo end - predictions.select!{|cid,p| p[:value] and p[:measured]} - loo.nr_instances = predictions.size - loo.nr_unpredicted = nr_unpredicted - loo.predictions = predictions - loo.statistics - loo.save - $logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds" - loo - end - def model - Model::Lazar.find model_id end - end - class ClassificationLeaveOneOutValidation < LeaveOneOutValidation - - field :accept_values, type: Array - field :confusion_matrix, type: Array, default: [] - field :weighted_confusion_matrix, type: Array, default: [] - field :accuracy, type: Float - field :weighted_accuracy, type: Float - field :true_rate, type: Hash, default: {} - field :predictivity, type: Hash, default: {} - field :confidence_plot_id, type: BSON::ObjectId - - def statistics - stat = ValidationStatistics.classification(predictions, Feature.find(model.prediction_feature_id).accept_values) - update_attributes(stat) + class ClassificationLeaveOneOut < LeaveOneOut + include ClassificationStatistics + field :accept_values, type: Array + field :confusion_matrix, type: Array, default: [] + field :weighted_confusion_matrix, type: Array, default: [] + field :accuracy, type: Float + field :weighted_accuracy, type: Float + field :true_rate, type: Hash, default: {} + field :predictivity, type: Hash, default: {} + field :confidence_plot_id, type: BSON::ObjectId end - - def confidence_plot - unless confidence_plot_id - tmpfile = "/tmp/#{id.to_s}_confidence.svg" - accuracies = [] - confidences = [] - correct_predictions = 0 - incorrect_predictions = 0 - predictions.each do |p| - p[:database_activities].each do |db_act| - if p[:value] - p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1 - accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f - confidences << p[:confidence] - - end - end - end - R.assign "accuracy", accuracies - R.assign "confidence", confidences - R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()" - R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") - plot_id = $gridfs.insert_one(file) - update(:confidence_plot_id => plot_id) - end - $gridfs.find_one(_id: confidence_plot_id).data + + class RegressionLeaveOneOut < LeaveOneOut + include RegressionStatistics + field :rmse, type: Float, default: 0 + field :mae, type: Float, default: 0 + field :r_squared, type: Float + field :correlation_plot_id, type: BSON::ObjectId + field :confidence_plot_id, type: BSON::ObjectId end - end - - - class RegressionLeaveOneOutValidation < LeaveOneOutValidation - - field :rmse, type: Float, default: 0 - field :mae, type: Float, default: 0 - field :r_squared, type: Float - field :correlation_plot_id, type: BSON::ObjectId - field :confidence_plot_id, type: BSON::ObjectId - def statistics - stat = ValidationStatistics.regression predictions - update_attributes(stat) - end - - def correlation_plot - unless correlation_plot_id - plot_id = ValidationStatistics.correlation_plot id, predictions - update(:correlation_plot_id => plot_id) - end - $gridfs.find_one(_id: correlation_plot_id).data - end end end diff --git a/lib/model.rb b/lib/model.rb index 18d621b..988cac9 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -22,7 +22,6 @@ module OpenTox # @param [OpenTox::Dataset] training_dataset # @return [OpenTox::Model::Lazar] Regression or classification model def initialize prediction_feature, training_dataset, params={} - super params # set defaults for empty parameters @@ -39,15 +38,15 @@ module OpenTox def correlation_filter self.relevant_features = {} - toxicities = [] + measurements = [] substances = [] training_dataset.substances.each do |s| training_dataset.values(s,prediction_feature_id).each do |act| - toxicities << act + measurements << act substances << s end end - R.assign "tox", toxicities + R.assign "tox", measurements feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq feature_ids.each do |feature_id| feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]} @@ -62,7 +61,7 @@ module OpenTox self.relevant_features[feature_id]["r"] = r end rescue - warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{toxicities}) failed." + warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed." end end self.relevant_features = self.relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h @@ -71,22 +70,22 @@ module OpenTox def predict_substance substance neighbor_algorithm_parameters = Hash[self.neighbor_algorithm_parameters.map{ |k, v| [k.to_sym, v] }] # convert string keys to symbols neighbors = substance.send(neighbor_algorithm, neighbor_algorithm_parameters) - database_activities = nil + measurements = nil prediction = {} # handle query substance if neighbors.collect{|n| n["_id"]}.include? substance.id query = neighbors.select{|n| n["_id"] == substance.id}.first - database_activities = training_dataset.values(query["_id"],prediction_feature_id) - prediction[:database_activities] = database_activities - prediction[:warning] = "#{database_activities.size} substances have been removed from neighbors, because they are identical with the query substance." + measurements = training_dataset.values(query["_id"],prediction_feature_id) + prediction[:measurements] = measurements + prediction[:warning] = "#{measurements.size} substances have been removed from neighbors, because they are identical with the query substance." neighbors.delete_if{|n| n["_id"] == substance.id} # remove query substance for an unbiased prediction (also useful for loo validation) end if neighbors.empty? prediction.merge!({:value => nil,:probabilities => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) elsif neighbors.size == 1 value = nil - tox = neighbors.first["toxicities"] + tox = neighbors.first["measurements"] if tox.size == 1 # single measurement value = tox.first else # multiple measurement @@ -141,7 +140,7 @@ module OpenTox elsif object.is_a? Array return predictions elsif object.is_a? Dataset - predictions.each{|cid,p| p.delete(:neighbors)} + #predictions.each{|cid,p| p.delete(:neighbors)} # prepare prediction dataset measurement_feature = Feature.find prediction_feature_id @@ -187,6 +186,7 @@ module OpenTox model.save model end + end class LazarRegression < Lazar @@ -197,19 +197,21 @@ module OpenTox model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression" model.neighbor_algorithm_parameters ||= {} { - :type => "MP2D", :min_sim => 0.1, :dataset_id => training_dataset.id, :prediction_feature_id => prediction_feature.id, }.each do |key,value| model.neighbor_algorithm_parameters[key] ||= value end + model.neighbor_algorithm_parameters[:type] = "MP2D" if training_dataset.substances.first.is_a? Compound model.save model end + end class Prediction + include OpenTox include Mongoid::Document include Mongoid::Timestamps diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 5c6d944..d0f8f51 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -6,58 +6,43 @@ module OpenTox field :core, type: Hash, default: {} field :coating, type: Array, default: [] field :proteomics, type: Hash, default: {} - - def nanoparticle_neighbors_old min_sim: 0.9, type:, dataset_id:, prediction_feature_id: - dataset = Dataset.find(dataset_id) - neighbors = [] - dataset.nanoparticles.each do |np| - values = dataset.values(np,prediction_feature_id) - if values - common_descriptors = physchem_descriptors.keys & np.physchem_descriptors.keys - common_descriptors.select!{|id| NumericFeature.find(id) } - query_descriptors = common_descriptors.collect{|d| physchem_descriptors[d].first} - neighbor_descriptors = common_descriptors.collect{|d| np.physchem_descriptors[d].first} - sim = Algorithm::Similarity.cosine(query_descriptors,neighbor_descriptors) - neighbors << {"_id" => np.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim - end - end - neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} - neighbors - end - def nanoparticle_neighbors min_sim: 0.9, type:, dataset_id:, prediction_feature_id: + def physchem_neighbors min_sim: 0.9, dataset_id:, prediction_feature_id: p self.name - #p self.physchem_descriptors.keys.size dataset = Dataset.find(dataset_id) relevant_features = {} - toxicities = [] + measurements = [] substances = [] # TODO: exclude query activities!!! dataset.substances.each do |s| - dataset.values(s,prediction_feature_id).each do |act| - toxicities << act - substances << s + if s.core == self.core # exclude nanoparticles with different core + dataset.values(s,prediction_feature_id).each do |act| + measurements << act + substances << s + end end end - R.assign "tox", toxicities + R.assign "tox", measurements feature_ids = physchem_descriptors.keys.select{|fid| Feature.find(fid).is_a? NumericFeature} # identify relevant features feature_ids.each do |feature_id| feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]} - R.assign "feature", feature_values - begin - R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')" - pvalue = R.eval("cor$p.value").to_ruby - if pvalue <= 0.05 - r = R.eval("cor$estimate").to_ruby - relevant_features[feature_id] = {} - relevant_features[feature_id]["pvalue"] = pvalue - relevant_features[feature_id]["r"] = r - relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby - relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby + unless feature_values.uniq.size == 1 + R.assign "feature", feature_values + begin + R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')" + p_value = R.eval("cor$p.value").to_ruby + if p_value <= 0.05 + r = R.eval("cor$estimate").to_ruby + relevant_features[feature_id] = {} + relevant_features[feature_id]["p_value"] = p_value + relevant_features[feature_id]["r"] = r + relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby + relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby + end + rescue + warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed." end - rescue - warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{toxicities}) failed." end end neighbors = [] @@ -68,13 +53,17 @@ module OpenTox # scale values query_descriptors = common_descriptors.collect{|d| (physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} neighbor_descriptors = common_descriptors.collect{|d| (substance.physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} - #weights = common_descriptors.collect{|d| 1-relevant_features[d]["pvalue"]} + #weights = common_descriptors.collect{|d| 1-relevant_features[d]["p_value"]} weights = common_descriptors.collect{|d| relevant_features[d]["r"]**2} - #p weights sim = Algorithm::Similarity.weighted_cosine(query_descriptors,neighbor_descriptors,weights) - ##p "SIM" - #p [sim, Algorithm::Similarity.cosine(query_descriptors,neighbor_descriptors)] - neighbors << {"_id" => substance.id, "toxicities" => values, "similarity" => sim} if sim >= min_sim + neighbors << { + "_id" => substance.id, + "measurements" => values, + "similarity" => sim, + "common_descriptors" => common_descriptors.collect do |id| + {:id => id, :p_value => relevant_features[id]["p_value"], :r_squared => relevant_features[id]["r"]**2} + end + } if sim >= min_sim end end p neighbors.size @@ -94,10 +83,7 @@ module OpenTox proteomics[feature.id.to_s] << value proteomics[feature.id.to_s].uniq! when "TOX" - # TODO generic way of parsing TOX values - if feature.name == "Net cell association" and feature.unit == "mL/ug(Mg)" - dataset.add self, feature, Math.log2(value) - elsif feature.name == "Total protein (BCA assay)" + if feature.name == "Total protein (BCA assay)" physchem_descriptors[feature.id.to_s] ||= [] physchem_descriptors[feature.id.to_s] << value physchem_descriptors[feature.id.to_s].uniq! diff --git a/lib/regression.rb b/lib/regression.rb index 6487557..cffcbbf 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -8,7 +8,7 @@ module OpenTox sim_sum = 0.0 neighbors.each do |neighbor| sim = neighbor["similarity"] - activities = neighbor["toxicities"] + activities = neighbor["measurements"] activities.each do |act| weighted_sum += sim*act sim_sum += sim @@ -26,7 +26,7 @@ module OpenTox neighbors.each do |n| fingerprint = Substance.find(n["_id"]).fingerprint - activities = n["toxicities"] + activities = n["measurements"] activities.each do |act| values << act weights << n["similarity"] @@ -79,7 +79,7 @@ module OpenTox neighbors.each_with_index do |n,i| neighbor = Substance.find(n["_id"]) - activities = neighbor["toxicities"] + activities = neighbor["measurements"] activities.each do |act| data_frame[0][i] = act weights << n["similarity"] diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb new file mode 100644 index 0000000..286614a --- /dev/null +++ b/lib/train-test-validation.rb @@ -0,0 +1,58 @@ +module OpenTox + + module Validation + + class TrainTest < Validation + + field :training_dataset_id, type: BSON::ObjectId + field :test_dataset_id, type: BSON::ObjectId + + def self.create model, training_set, test_set + + atts = model.attributes.dup # do not modify attributes of the original model + atts["_id"] = BSON::ObjectId.new + atts[:training_dataset_id] = training_set.id + validation_model = model.class.create model.prediction_feature, training_set, atts + validation_model.save + predictions = validation_model.predict test_set.substances + nr_unpredicted = 0 + predictions.each do |cid,prediction| + if prediction[:value] + prediction[:measurements] = test_set.values(cid, prediction[:prediction_feature_id]) + else + nr_unpredicted += 1 + end + end + predictions.select!{|cid,p| p[:value] and p[:measurements]} + validation = self.new( + :model_id => validation_model.id, + :test_dataset_id => test_set.id, + :nr_instances => test_set.substances.size, + :nr_unpredicted => nr_unpredicted, + :predictions => predictions + ) + validation.save + validation + end + + def test_dataset + Dataset.find test_dataset_id + end + + def training_dataset + Dataset.find training_dataset_id + end + + end + + class ClassificationTrainTest < TrainTest + include ClassificationStatistics + end + + class RegressionTrainTest < TrainTest + include RegressionStatistics + end + + end + +end diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index e61543b..816824b 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -1,123 +1,203 @@ module OpenTox - class ValidationStatistics - include OpenTox - def self.classification predictions, accept_values - confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} - weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} - true_rate = {} - predictivity = {} - nr_instances = 0 - predictions.each do |cid,pred| - # TODO - # use predictions without probabilities (single neighbor)?? - # use measured majority class?? - if pred[:measured].uniq.size == 1 and pred[:probabilities] - m = pred[:measured].first - if pred[:value] == m - if pred[:value] == accept_values[0] - confusion_matrix[0][0] += 1 - weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]] - nr_instances += 1 - elsif pred[:value] == accept_values[1] - confusion_matrix[1][1] += 1 - weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]] - nr_instances += 1 - end - elsif pred[:value] != m - if pred[:value] == accept_values[0] - confusion_matrix[0][1] += 1 - weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]] - nr_instances += 1 - elsif pred[:value] == accept_values[1] - confusion_matrix[1][0] += 1 - weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]] - nr_instances += 1 + module Validation + module ClassificationStatistics + + def statistics + self.accept_values = model.prediction_feature.accept_values + self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} + self.weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} + true_rate = {} + predictivity = {} + nr_instances = 0 + predictions.each do |cid,pred| + # TODO + # use predictions without probabilities (single neighbor)?? + # use measured majority class?? + if pred[:measurements].uniq.size == 1 and pred[:probabilities] + m = pred[:measurements].first + if pred[:value] == m + if pred[:value] == accept_values[0] + confusion_matrix[0][0] += 1 + weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]] + nr_instances += 1 + elsif pred[:value] == accept_values[1] + confusion_matrix[1][1] += 1 + weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]] + nr_instances += 1 + end + elsif pred[:value] != m + if pred[:value] == accept_values[0] + confusion_matrix[0][1] += 1 + weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]] + nr_instances += 1 + elsif pred[:value] == accept_values[1] + confusion_matrix[1][0] += 1 + weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]] + nr_instances += 1 + end end end end + true_rate = {} + predictivity = {} + accept_values.each_with_index do |v,i| + true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f + predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f + end + confidence_sum = 0 + weighted_confusion_matrix.each do |r| + r.each do |c| + confidence_sum += c + end + end + self.accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f + self.weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f + $logger.debug "Accuracy #{accuracy}" + save + { + :accept_values => accept_values, + :confusion_matrix => confusion_matrix, + :weighted_confusion_matrix => weighted_confusion_matrix, + :accuracy => accuracy, + :weighted_accuracy => weighted_accuracy, + :true_rate => true_rate, + :predictivity => predictivity, + } end - true_rate = {} - predictivity = {} - accept_values.each_with_index do |v,i| - true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f - predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f - end - confidence_sum = 0 - weighted_confusion_matrix.each do |r| - r.each do |c| - confidence_sum += c + + def confidence_plot + unless confidence_plot_id + tmpfile = "/tmp/#{id.to_s}_confidence.svg" + accuracies = [] + confidences = [] + correct_predictions = 0 + incorrect_predictions = 0 + predictions.each do |p| + p[:measurements].each do |db_act| + if p[:value] + p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1 + accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f + confidences << p[:confidence] + + end + end + end + R.assign "accuracy", accuracies + R.assign "confidence", confidences + R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()" + R.eval "ggsave(file='#{tmpfile}', plot=image)" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") + plot_id = $gridfs.insert_one(file) + update(:confidence_plot_id => plot_id) end + $gridfs.find_one(_id: confidence_plot_id).data end - accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f - weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f - $logger.debug "Accuracy #{accuracy}" - { - :accept_values => accept_values, - :confusion_matrix => confusion_matrix, - :weighted_confusion_matrix => weighted_confusion_matrix, - :accuracy => accuracy, - :weighted_accuracy => weighted_accuracy, - :true_rate => true_rate, - :predictivity => predictivity, - :finished_at => Time.now - } end - def self.regression predictions - # TODO: predictions within prediction_interval - rmse = 0 - mae = 0 - x = [] - y = [] - predictions.each do |cid,pred| - if pred[:value] and pred[:measured] - x << pred[:measured].median - y << pred[:value] - error = pred[:value]-pred[:measured].median - rmse += error**2 - mae += error.abs - else - warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." - $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + module RegressionStatistics + + def statistics + # TODO: predictions within prediction_interval + rmse = 0 + mae = 0 + x = [] + y = [] + predictions.each do |cid,pred| + if pred[:value] and pred[:measurements] + x << pred[:measurements].median + y << pred[:value] + error = pred[:value]-pred[:measurements].median + rmse += error**2 + mae += error.abs + else + warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + end end + R.assign "measurement", x + R.assign "prediction", y + R.eval "r <- cor(measurement,prediction,use='pairwise')" + r = R.eval("r").to_ruby + + mae = mae/predictions.size + rmse = Math.sqrt(rmse/predictions.size) + $logger.debug "R^2 #{r**2}" + $logger.debug "RMSE #{rmse}" + $logger.debug "MAE #{mae}" + { + :mae => mae, + :rmse => rmse, + :r_squared => r**2, + } end - R.assign "measurement", x - R.assign "prediction", y - R.eval "r <- cor(measurement,prediction,use='pairwise')" - r = R.eval("r").to_ruby - mae = mae/predictions.size - rmse = Math.sqrt(rmse/predictions.size) - $logger.debug "R^2 #{r**2}" - $logger.debug "RMSE #{rmse}" - $logger.debug "MAE #{mae}" - { - :mae => mae, - :rmse => rmse, - :r_squared => r**2, - :finished_at => Time.now - } - end + def correlation_plot + unless correlation_plot_id + tmpfile = "/tmp/#{id.to_s}_correlation.pdf" + x = [] + y = [] + feature = Feature.find(predictions.first.last["prediction_feature_id"]) + predictions.each do |sid,p| + x << p["value"] + y << p["measurements"].median + end + R.assign "measurement", x + R.assign "prediction", y + R.eval "all = c(measurement,prediction)" + R.eval "range = c(min(all), max(all))" + title = feature.name + title += "[#{feature.unit}]" if feature.unit and !feature.unit.blank? + R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" + R.eval "image = image + geom_abline(intercept=0, slope=1)" + R.eval "ggsave(file='#{tmpfile}', plot=image)" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.png") + plot_id = $gridfs.insert_one(file) + update(:correlation_plot_id => plot_id) + end + $gridfs.find_one(_id: correlation_plot_id).data + end - def self.correlation_plot id, predictions - tmpfile = "/tmp/#{id.to_s}_correlation.png" - x = [] - y = [] - predictions.each do |sid,p| - x << p["value"] - y << p["measured"].median + def worst_predictions n: 5, show_neigbors: true, show_common_descriptors: false + worst_predictions = predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}[0,n] + worst_predictions.collect do |p| + substance = Substance.find(p.first) + prediction = p[1] + if show_neigbors + neighbors = prediction["neighbors"].collect do |n| + common_descriptors = [] + if show_common_descriptors + common_descriptors = n["common_descriptors"].collect do |d| + f=Feature.find(d) + { + :id => f.id.to_s, + :name => "#{f.name} (#{f.conditions})", + :p_value => d[:p_value], + :r_squared => d[:r_squared], + } + end + else + common_descriptors = n["common_descriptors"].size + end + { + :name => Substance.find(n["_id"]).name, + :id => n["_id"].to_s, + :common_descriptors => common_descriptors + } + end + else + neighbors = prediction["neighbors"].size + end + { + :id => substance.id.to_s, + :name => substance.name, + :feature => Feature.find(prediction["prediction_feature_id"]).name, + :error => (prediction["value"] - prediction["measurements"].median).abs, + :prediction => prediction["value"], + :measurements => prediction["measurements"], + :neighbors => neighbors + } + end end - R.assign "measurement", x - R.assign "prediction", y - R.eval "all = c(measurement,prediction)" - R.eval "range = c(min(all), max(all))" - # TODO units - R.eval "image = qplot(prediction,measurement,main='',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" - R.eval "image = image + geom_abline(intercept=0, slope=1)" - R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.png") - plot_id = $gridfs.insert_one(file) - plot_id end end end diff --git a/lib/validation.rb b/lib/validation.rb index 9122df1..ff9a971 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -1,63 +1,25 @@ module OpenTox - class Validation - - field :model_id, type: BSON::ObjectId - field :prediction_dataset_id, type: BSON::ObjectId - field :crossvalidation_id, type: BSON::ObjectId - field :test_dataset_id, type: BSON::ObjectId - field :nr_instances, type: Integer - field :nr_unpredicted, type: Integer - field :predictions, type: Hash - - def prediction_dataset - Dataset.find prediction_dataset_id - end - - def test_dataset - Dataset.find test_dataset_id - end - - def model - Model::Lazar.find model_id - end - - def self.create model, training_set, test_set, crossvalidation=nil - - atts = model.attributes.dup # do not modify attributes of the original model - atts["_id"] = BSON::ObjectId.new - atts[:training_dataset_id] = training_set.id - validation_model = model.class.create model.prediction_feature, training_set, atts - validation_model.save - predictions = validation_model.predict test_set.substances - predictions.each{|cid,p| p.delete(:neighbors)} - nr_unpredicted = 0 - predictions.each do |cid,prediction| - if prediction[:value] - prediction[:measured] = test_set.values(cid, prediction[:prediction_feature_id]) - else - nr_unpredicted += 1 - end + module Validation + + class Validation + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + store_in collection: "validations" + field :name, type: String + field :model_id, type: BSON::ObjectId + field :nr_instances, type: Integer + field :nr_unpredicted, type: Integer + field :predictions, type: Hash + field :finished_at, type: Time + + def model + Model::Lazar.find model_id end - predictions.select!{|cid,p| p[:value] and p[:measured]} - validation = self.new( - :model_id => validation_model.id, - :test_dataset_id => test_set.id, - :nr_instances => test_set.substances.size, - :nr_unpredicted => nr_unpredicted, - :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence - ) - validation.crossvalidation_id = crossvalidation.id if crossvalidation - validation.save - validation - end - - end - class ClassificationValidation < Validation - end + end - class RegressionValidation < Validation end end -- cgit v1.2.3 From 65b69d4c35890a7a2d2992108f0cf4eb5202dd1b Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 1 Jun 2016 10:37:00 +0200 Subject: validation tests fixed --- lib/crossvalidation.rb | 24 ++++++++---------------- lib/leave-one-out-validation.rb | 1 - lib/model.rb | 3 +-- lib/validation-statistics.rb | 19 ++++++++++--------- lib/validation.rb | 6 +++--- 5 files changed, 22 insertions(+), 31 deletions(-) (limited to 'lib') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 22071d8..15e25a5 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -3,23 +3,7 @@ module OpenTox module Validation class CrossValidation < Validation field :validation_ids, type: Array, default: [] - field :model_id, type: BSON::ObjectId field :folds, type: Integer, default: 10 - field :nr_instances, type: Integer, default: 0 - field :nr_unpredicted, type: Integer, default: 0 - field :predictions, type: Hash, default: {} - - def time - finished_at - created_at - end - - def validations - validation_ids.collect{|vid| TrainTest.find vid} - end - - def model - Model::Lazar.find model_id - end def self.create model, n=10 klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification @@ -55,6 +39,14 @@ module OpenTox cv.update_attributes(finished_at: Time.now) cv end + + def time + finished_at - created_at + end + + def validations + validation_ids.collect{|vid| TrainTest.find vid} + end end class ClassificationCrossValidation < CrossValidation diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index 7ff65ff..59f43c5 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -49,7 +49,6 @@ module OpenTox field :mae, type: Float, default: 0 field :r_squared, type: Float field :correlation_plot_id, type: BSON::ObjectId - field :confidence_plot_id, type: BSON::ObjectId end end diff --git a/lib/model.rb b/lib/model.rb index 988cac9..81f9629 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -33,7 +33,6 @@ module OpenTox #send(feature_selection_algorithm.to_sym) if feature_selection_algorithm save - self end def correlation_filter @@ -203,7 +202,7 @@ module OpenTox }.each do |key,value| model.neighbor_algorithm_parameters[key] ||= value end - model.neighbor_algorithm_parameters[:type] = "MP2D" if training_dataset.substances.first.is_a? Compound + model.neighbor_algorithm_parameters[:type] ||= "MP2D" if training_dataset.substances.first.is_a? Compound model.save model end diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 816824b..e42d298 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -98,8 +98,8 @@ module OpenTox def statistics # TODO: predictions within prediction_interval - rmse = 0 - mae = 0 + self.rmse = 0 + self.mae = 0 x = [] y = [] predictions.each do |cid,pred| @@ -107,8 +107,8 @@ module OpenTox x << pred[:measurements].median y << pred[:value] error = pred[:value]-pred[:measurements].median - rmse += error**2 - mae += error.abs + self.rmse += error**2 + self.mae += error.abs else warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." @@ -117,17 +117,18 @@ module OpenTox R.assign "measurement", x R.assign "prediction", y R.eval "r <- cor(measurement,prediction,use='pairwise')" - r = R.eval("r").to_ruby + self.r_squared = R.eval("r").to_ruby**2 - mae = mae/predictions.size - rmse = Math.sqrt(rmse/predictions.size) - $logger.debug "R^2 #{r**2}" + self.mae = self.mae/predictions.size + self.rmse = Math.sqrt(self.rmse/predictions.size) + $logger.debug "R^2 #{r_squared}" $logger.debug "RMSE #{rmse}" $logger.debug "MAE #{mae}" + save { :mae => mae, :rmse => rmse, - :r_squared => r**2, + :r_squared => r_squared, } end diff --git a/lib/validation.rb b/lib/validation.rb index ff9a971..ced9596 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -9,9 +9,9 @@ module OpenTox store_in collection: "validations" field :name, type: String field :model_id, type: BSON::ObjectId - field :nr_instances, type: Integer - field :nr_unpredicted, type: Integer - field :predictions, type: Hash + field :nr_instances, type: Integer, default: 0 + field :nr_unpredicted, type: Integer, default: 0 + field :predictions, type: Hash, default: {} field :finished_at, type: Time def model -- cgit v1.2.3 From 458a2d753551ea607f2ed5efdd0ac0a02d55d673 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 1 Jun 2016 12:46:03 +0200 Subject: all tests fixed --- lib/model.rb | 8 ++++---- lib/nanoparticle.rb | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/model.rb b/lib/model.rb index 81f9629..3482aee 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -236,7 +236,7 @@ module OpenTox end def repeated_crossvalidation - RepeatedCrossValidation.find repeated_crossvalidation_id + Validation::RepeatedCrossValidation.find repeated_crossvalidation_id end def crossvalidations @@ -244,7 +244,7 @@ module OpenTox end def leave_one_out_validation - LeaveOneOutValidation.find leave_one_out_validation_id + Validation::LeaveOneOut.find leave_one_out_validation_id end def regression? @@ -269,8 +269,8 @@ module OpenTox end prediction_model[:model_id] = model.id prediction_model[:prediction_feature_id] = prediction_feature.id - prediction_model[:repeated_crossvalidation_id] = RepeatedCrossValidation.create(model).id - prediction_model[:leave_one_out_validation_id] = LeaveOneOutValidation.create(model).id + prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id + prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id prediction_model.save prediction_model end diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index d0f8f51..ca79a3d 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -100,6 +100,8 @@ module OpenTox end def parse_ambit_value feature, v, dataset + #p dataset + #p feature v.delete "unit" # TODO: ppm instead of weights if v.keys == ["textValue"] -- cgit v1.2.3 From 85f2308c101b4778508c2d767e08af4cfd671b7b Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 2 Jun 2016 12:22:39 +0200 Subject: local pls regression for nanoparticles --- lib/nanoparticle.rb | 13 ++++++++++--- lib/regression.rb | 34 ++++++++++++++++++---------------- lib/validation-statistics.rb | 6 +++++- 3 files changed, 33 insertions(+), 20 deletions(-) (limited to 'lib') diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index ca79a3d..65aab23 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -6,9 +6,10 @@ module OpenTox field :core, type: Hash, default: {} field :coating, type: Array, default: [] field :proteomics, type: Hash, default: {} + + attr_accessor :scaled_values def physchem_neighbors min_sim: 0.9, dataset_id:, prediction_feature_id: - p self.name dataset = Dataset.find(dataset_id) relevant_features = {} measurements = [] @@ -52,7 +53,9 @@ module OpenTox common_descriptors = relevant_features.keys & substance.physchem_descriptors.keys # scale values query_descriptors = common_descriptors.collect{|d| (physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} + @scaled_values = common_descriptors.collect{|d| [d,(physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h neighbor_descriptors = common_descriptors.collect{|d| (substance.physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} + neighbor_scaled_values = common_descriptors.collect{|d| [d,(substance.physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h #weights = common_descriptors.collect{|d| 1-relevant_features[d]["p_value"]} weights = common_descriptors.collect{|d| relevant_features[d]["r"]**2} sim = Algorithm::Similarity.weighted_cosine(query_descriptors,neighbor_descriptors,weights) @@ -61,12 +64,16 @@ module OpenTox "measurements" => values, "similarity" => sim, "common_descriptors" => common_descriptors.collect do |id| - {:id => id, :p_value => relevant_features[id]["p_value"], :r_squared => relevant_features[id]["r"]**2} + { + :id => id, + :scaled_value => neighbor_scaled_values[id], + :p_value => relevant_features[id]["p_value"], + :r_squared => relevant_features[id]["r"]**2} end } if sim >= min_sim end end - p neighbors.size + $logger.debug "#{self.name}: #{neighbors.size} neighbors" neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} neighbors end diff --git a/lib/regression.rb b/lib/regression.rb index cffcbbf..5028c78 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -73,23 +73,19 @@ module OpenTox activities = [] weights = [] - pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq + pc_ids = neighbors.collect{|n| n["common_descriptors"].collect{|d| d[:id]}}.flatten.uniq.sort data_frame = [] data_frame[0] = [] neighbors.each_with_index do |n,i| - neighbor = Substance.find(n["_id"]) - activities = neighbor["measurements"] + activities = n["measurements"] activities.each do |act| data_frame[0][i] = act weights << n["similarity"] - neighbor.physchem_descriptors.each do |pid,values| - values = [values] unless values.is_a? Array - values.uniq! - warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1 - j = pc_ids.index(pid)+1 + n["common_descriptors"].each do |d| + j = pc_ids.index(d[:id])+1 data_frame[j] ||= [] - data_frame[j][i] = values.for_R + data_frame[j][i] = d[:scaled_value] end end if activities (0..pc_ids.size+1).each do |j| # for R: fill empty values with NA @@ -97,10 +93,12 @@ module OpenTox data_frame[j][i] ||= "NA" end end + remove_idx = [] data_frame.each_with_index do |r,i| remove_idx << i if r.uniq.size == 1 # remove properties with a single value end + remove_idx.reverse.each do |i| data_frame.delete_at i pc_ids.delete_at i @@ -112,7 +110,7 @@ module OpenTox prediction else query_descriptors = pc_ids.collect do |i| - substance.physchem_descriptors[i] ? substance.physchem_descriptors[i].for_R : "NA" + substance.scaled_values[i] ? substance.scaled_values[i] : "NA" end remove_idx = [] query_descriptors.each_with_index do |v,i| @@ -127,10 +125,9 @@ module OpenTox if prediction.nil? prediction = local_weighted_average substance, neighbors prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." - prediction - else - prediction end + p prediction + prediction end end @@ -172,10 +169,15 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" R.eval "names(fingerprint) <- features" R.eval "prediction <- predict(model,fingerprint)" + value = R.eval("prediction").to_f + rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f + r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f + prediction_interval = value-1.96*rmse, value+1.96*rmse { - :value => R.eval("prediction").to_f, - :rmse => R.eval("getTrainPerf(model)$TrainRMSE").to_f, - :r_squared => R.eval("getTrainPerf(model)$TrainRsquared").to_f, + :value => value, + :rmse => rmse, + :r_squared => r_squared, + :prediction_interval => prediction_interval } rescue return nil diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index e42d298..6b252b1 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -100,6 +100,8 @@ module OpenTox # TODO: predictions within prediction_interval self.rmse = 0 self.mae = 0 + #self.within_prediction_interval = 0 + #self.outside_prediction_interval = 0 x = [] y = [] predictions.each do |cid,pred| @@ -109,6 +111,9 @@ module OpenTox error = pred[:value]-pred[:measurements].median self.rmse += error**2 self.mae += error.abs + #if pred[:prediction_interval] + #if pred[:measurements] + #end else warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." @@ -118,7 +123,6 @@ module OpenTox R.assign "prediction", y R.eval "r <- cor(measurement,prediction,use='pairwise')" self.r_squared = R.eval("r").to_ruby**2 - self.mae = self.mae/predictions.size self.rmse = Math.sqrt(self.rmse/predictions.size) $logger.debug "R^2 #{r_squared}" -- cgit v1.2.3 From eec5bddbd35c9ecee8021128508d8718bccb4fe3 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 2 Jun 2016 17:54:48 +0200 Subject: local pls regression for nanoparticle proteomics --- lib/import.rb | 15 ++------------- lib/nanoparticle.rb | 12 +++++++++--- lib/regression.rb | 41 +++++++++++++++++++++++++---------------- 3 files changed, 36 insertions(+), 32 deletions(-) (limited to 'lib') diff --git a/lib/import.rb b/lib/import.rb index 80d4579..4c49e5e 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -68,17 +68,10 @@ module OpenTox effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature effect["conditions"].delete_if { |k, v| v.nil? } if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data -=begin - JSON.parse(effect["result"]["textValue"]).each do |identifier, value| - # time critical step - t = Time.now - proteomics_features[identifier] ||= klass.find_or_create_by(:name => identifier, :category => "Proteomics") - t1 += Time.now - t - t = Time.now + JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step + proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics") nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset - t2 += Time.now - t end -=end else feature = klass.find_or_create_by( :name => effect["endpoint"], @@ -90,10 +83,6 @@ module OpenTox end end nanoparticle.save - #p "Total time: #{Time.now - start_time}" - #p "Proteomics features: #{t1}" - #p "Proteomics values: #{t2}" - #p "Time2: #{t2}" end datasets.each { |u,d| d.save } end diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 65aab23..3e29ae1 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -10,6 +10,7 @@ module OpenTox attr_accessor :scaled_values def physchem_neighbors min_sim: 0.9, dataset_id:, prediction_feature_id: + p name dataset = Dataset.find(dataset_id) relevant_features = {} measurements = [] @@ -46,6 +47,7 @@ module OpenTox end end end + #p relevant_features.keys.collect{|i| Feature.find(i).name} neighbors = [] substances.each do |substance| values = dataset.values(substance,prediction_feature_id) @@ -86,9 +88,12 @@ module OpenTox physchem_descriptors[feature.id.to_s] << value physchem_descriptors[feature.id.to_s].uniq! when "Proteomics" - proteomics[feature.id.to_s] ||= [] - proteomics[feature.id.to_s] << value - proteomics[feature.id.to_s].uniq! + #proteomics[feature.id.to_s] ||= [] + #proteomics[feature.id.to_s] << value + #proteomics[feature.id.to_s].uniq! + physchem_descriptors[feature.id.to_s] ||= [] + physchem_descriptors[feature.id.to_s] << value + physchem_descriptors[feature.id.to_s].uniq! when "TOX" if feature.name == "Total protein (BCA assay)" physchem_descriptors[feature.id.to_s] ||= [] @@ -109,6 +114,7 @@ module OpenTox def parse_ambit_value feature, v, dataset #p dataset #p feature + # TODO add study id to warnings v.delete "unit" # TODO: ppm instead of weights if v.keys == ["textValue"] diff --git a/lib/regression.rb b/lib/regression.rb index 5028c78..b9067c6 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -88,35 +88,42 @@ module OpenTox data_frame[j][i] = d[:scaled_value] end end if activities - (0..pc_ids.size+1).each do |j| # for R: fill empty values with NA + #(0..pc_ids.size+1).each do |j| # for R: fill empty values with NA + (0..pc_ids.size).each do |j| # for R: fill empty values with NA data_frame[j] ||= [] data_frame[j][i] ||= "NA" end end - remove_idx = [] - data_frame.each_with_index do |r,i| - remove_idx << i if r.uniq.size == 1 # remove properties with a single value - end + #remove_idx = [] + #data_frame.each_with_index do |r,i| + #remove_idx << i if r.uniq.size == 1 # remove properties with a single value TODO: don't break R names assignment + #end - remove_idx.reverse.each do |i| - data_frame.delete_at i - pc_ids.delete_at i - end + #p data_frame.size + #p pc_ids.size + #data_frame.delete_if.with_index { |_, index| remove_idx.include? index } + #pc_ids.delete_if.with_index { |_, index| remove_idx.include? index-1 } + #remove_idx.sort.reverse.each do |i| + #p i + #data_frame.delete_at i + #pc_ids.delete_at i + #end + #p data_frame.size + #p pc_ids.size if pc_ids.empty? prediction = local_weighted_average substance, neighbors prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." prediction else - query_descriptors = pc_ids.collect do |i| - substance.scaled_values[i] ? substance.scaled_values[i] : "NA" - end + query_descriptors = pc_ids.collect { |i| substance.scaled_values[i] } remove_idx = [] query_descriptors.each_with_index do |v,i| - remove_idx << i if v == "NA" + #remove_idx << i if v == "NA" + remove_idx << i unless v end - remove_idx.reverse.each do |i| + remove_idx.sort.reverse.each do |i| data_frame.delete_at i pc_ids.delete_at i query_descriptors.delete_at i @@ -135,8 +142,9 @@ module OpenTox def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values R.assign "weights", training_weights r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" -rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) =begin +=end +rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) File.open("tmp.R","w+"){|f| f.puts "suppressPackageStartupMessages({ library(iterators,lib=\"#{rlib}\") @@ -159,10 +167,11 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) f.puts "names(fingerprint) <- features" f.puts "prediction <- predict(model,fingerprint)" } -=end R.eval "data <- #{r_data_frame}" R.assign "features", training_features + p training_features.size + p R.eval("names(data)").to_ruby.size begin R.eval "names(data) <- append(c('activities'),features)" # R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)" -- cgit v1.2.3 From 128fd36b2531756c15a93776871e80eb44e524f1 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 2 Jun 2016 19:01:18 +0200 Subject: proteomics regression validation --- lib/model.rb | 30 ++++++++++++++++++------------ lib/nanoparticle.rb | 28 ++-------------------------- 2 files changed, 20 insertions(+), 38 deletions(-) (limited to 'lib') diff --git a/lib/model.rb b/lib/model.rb index 3482aee..277bca3 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -31,7 +31,7 @@ module OpenTox self.neighbor_algorithm_parameters ||= {} self.neighbor_algorithm_parameters[:dataset_id] = training_dataset.id - #send(feature_selection_algorithm.to_sym) if feature_selection_algorithm + send(feature_selection_algorithm.to_sym) if feature_selection_algorithm save end @@ -49,25 +49,31 @@ module OpenTox feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq feature_ids.each do |feature_id| feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]} - R.assign "feature", feature_values - begin - R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')" - pvalue = R.eval("cor$p.value").to_ruby - if pvalue <= 0.05 - r = R.eval("cor$estimate").to_ruby - self.relevant_features[feature_id] = {} - self.relevant_features[feature_id]["pvalue"] = pvalue - self.relevant_features[feature_id]["r"] = r + unless feature_values.uniq.size == 1 + R.assign "feature", feature_values + begin + R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')" + pvalue = R.eval("cor$p.value").to_ruby + if pvalue <= 0.05 + r = R.eval("cor$estimate").to_ruby + self.relevant_features[feature_id] = {} + self.relevant_features[feature_id]["pvalue"] = pvalue + self.relevant_features[feature_id]["r"] = r + self.relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby + self.relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby + end + rescue + warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed." end - rescue - warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed." end end self.relevant_features = self.relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h + p self.relevant_features end def predict_substance substance neighbor_algorithm_parameters = Hash[self.neighbor_algorithm_parameters.map{ |k, v| [k.to_sym, v] }] # convert string keys to symbols + neighbor_algorithm_parameters[:relevant_features] = self.relevant_features if self.relevant_features neighbors = substance.send(neighbor_algorithm, neighbor_algorithm_parameters) measurements = nil prediction = {} diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 3e29ae1..c1bf1b5 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -9,10 +9,10 @@ module OpenTox attr_accessor :scaled_values - def physchem_neighbors min_sim: 0.9, dataset_id:, prediction_feature_id: + def physchem_neighbors min_sim: 0.9, dataset_id:, prediction_feature_id:, relevant_features: p name dataset = Dataset.find(dataset_id) - relevant_features = {} + #relevant_features = {} measurements = [] substances = [] # TODO: exclude query activities!!! @@ -24,30 +24,6 @@ module OpenTox end end end - R.assign "tox", measurements - feature_ids = physchem_descriptors.keys.select{|fid| Feature.find(fid).is_a? NumericFeature} - # identify relevant features - feature_ids.each do |feature_id| - feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]} - unless feature_values.uniq.size == 1 - R.assign "feature", feature_values - begin - R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')" - p_value = R.eval("cor$p.value").to_ruby - if p_value <= 0.05 - r = R.eval("cor$estimate").to_ruby - relevant_features[feature_id] = {} - relevant_features[feature_id]["p_value"] = p_value - relevant_features[feature_id]["r"] = r - relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby - relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby - end - rescue - warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed." - end - end - end - #p relevant_features.keys.collect{|i| Feature.find(i).name} neighbors = [] substances.each do |substance| values = dataset.values(substance,prediction_feature_id) -- cgit v1.2.3 From 290c7f86950c4051d018b8019ff4e72ec406c58c Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 3 Jun 2016 19:15:36 +0200 Subject: random forest regression --- lib/lazar.rb | 2 ++ lib/model.rb | 29 +++++++++++++++---------- lib/regression.rb | 63 +++++++++++++++++++++++++------------------------------ 3 files changed, 48 insertions(+), 46 deletions(-) (limited to 'lib') diff --git a/lib/lazar.rb b/lib/lazar.rb index 1853aba..46605d3 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -57,6 +57,8 @@ suppressPackageStartupMessages({ library(pls,lib=\"#{rlib}\") library(caret,lib=\"#{rlib}\") library(doMC,lib=\"#{rlib}\") + library(randomForest,lib=\"#{rlib}\") + library(plyr,lib=\"#{rlib}\") registerDoMC(#{NR_CORES}) }) " diff --git a/lib/model.rb b/lib/model.rb index 277bca3..0432c56 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -3,6 +3,7 @@ module OpenTox module Model class Lazar + include OpenTox include Mongoid::Document include Mongoid::Timestamps @@ -11,11 +12,15 @@ module OpenTox field :name, type: String field :creator, type: String, default: __FILE__ field :training_dataset_id, type: BSON::ObjectId - field :prediction_algorithm, type: String field :prediction_feature_id, type: BSON::ObjectId + + field :prediction_algorithm, type: String + field :prediction_algorithm_parameters, type: Hash, default: {} + field :neighbor_algorithm, type: String field :neighbor_algorithm_parameters, type: Hash, default: {} field :feature_selection_algorithm, type: String + field :feature_selection_algorithm_parameters, type: Hash, default: {} field :relevant_features, type: Hash # Create a lazar model from a training_dataset and a feature_dataset @@ -35,7 +40,8 @@ module OpenTox save end - def correlation_filter + def correlation_filter + # TODO: speedup, single assignment of all features to R+ parallel computation of significance? self.relevant_features = {} measurements = [] substances = [] @@ -47,6 +53,7 @@ module OpenTox end R.assign "tox", measurements feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq + feature_ids.select!{|fid| Feature.find(fid).category == feature_selection_algorithm_parameters[:category]} if feature_selection_algorithm_parameters[:category] feature_ids.each do |feature_id| feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]} unless feature_values.uniq.size == 1 @@ -68,7 +75,6 @@ module OpenTox end end self.relevant_features = self.relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h - p self.relevant_features end def predict_substance substance @@ -90,14 +96,14 @@ module OpenTox prediction.merge!({:value => nil,:probabilities => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) elsif neighbors.size == 1 value = nil - tox = neighbors.first["measurements"] - if tox.size == 1 # single measurement - value = tox.first + m = neighbors.first["measurements"] + if m.size == 1 # single measurement + value = m.first else # multiple measurement - if tox.collect{|t| t.numeric?}.uniq == [true] # numeric - value = tox.median - elsif tox.uniq.size == 1 # single value - value = tox.first + if m.collect{|t| t.numeric?}.uniq == [true] # numeric + value = m.median + elsif m.uniq.size == 1 # single value + value = m.first else # contradictory results # TODO add majority vote?? end @@ -106,7 +112,8 @@ module OpenTox else # call prediction algorithm klass,method = prediction_algorithm.split('.') - result = Object.const_get(klass).send(method,substance,neighbors) + params = prediction_algorithm_parameters.merge({:substance => substance, :neighbors => neighbors}) + result = Object.const_get(klass).send(method,params) prediction.merge! result prediction[:neighbors] = neighbors prediction[:neighbors] ||= [] diff --git a/lib/regression.rb b/lib/regression.rb index b9067c6..c4c83d2 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -3,7 +3,7 @@ module OpenTox class Regression - def self.local_weighted_average substance, neighbors + def self.local_weighted_average substance:, neighbors: weighted_sum = 0.0 sim_sum = 0.0 neighbors.each do |neighbor| @@ -18,7 +18,7 @@ module OpenTox {:value => prediction} end - def self.local_fingerprint_regression substance, neighbors, method='pls'#, method_params="sigma=0.05" + def self.local_fingerprint_regression substance:, neighbors:, method: pls#, method_params="sigma=0.05" values = [] fingerprints = {} weights = [] @@ -68,8 +68,7 @@ module OpenTox end - #def self.local_physchem_regression(substance:, neighbors:, feature_id:, dataset_id:, method: 'pls')#, method_params="ncomp = 4" - def self.local_physchem_regression substance, neighbors, method='pls' #, method_params="ncomp = 4" + def self.local_physchem_regression substance:, neighbors:, method: pls activities = [] weights = [] @@ -88,46 +87,39 @@ module OpenTox data_frame[j][i] = d[:scaled_value] end end if activities - #(0..pc_ids.size+1).each do |j| # for R: fill empty values with NA (0..pc_ids.size).each do |j| # for R: fill empty values with NA data_frame[j] ||= [] data_frame[j][i] ||= "NA" end end - #remove_idx = [] - #data_frame.each_with_index do |r,i| - #remove_idx << i if r.uniq.size == 1 # remove properties with a single value TODO: don't break R names assignment - #end - - #p data_frame.size - #p pc_ids.size - #data_frame.delete_if.with_index { |_, index| remove_idx.include? index } - #pc_ids.delete_if.with_index { |_, index| remove_idx.include? index-1 } - #remove_idx.sort.reverse.each do |i| - #p i - #data_frame.delete_at i - #pc_ids.delete_at i - #end - #p data_frame.size - #p pc_ids.size + data_frame = data_frame.each_with_index.collect do |r,i| + if r.uniq.size == 1 # remove properties with a single value + r = nil + pc_ids[i-1] = nil # data_frame frame has additional activity entry + end + r + end + data_frame.compact! + pc_ids.compact! if pc_ids.empty? prediction = local_weighted_average substance, neighbors - prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." + prediction[:warning] = "No relevant variables for regression model. Using weighted average of similar substances." prediction else query_descriptors = pc_ids.collect { |i| substance.scaled_values[i] } - remove_idx = [] - query_descriptors.each_with_index do |v,i| - #remove_idx << i if v == "NA" - remove_idx << i unless v - end - remove_idx.sort.reverse.each do |i| - data_frame.delete_at i - pc_ids.delete_at i - query_descriptors.delete_at i + query_descriptors = query_descriptors.each_with_index.collect do |v,i| + unless v + v = nil + data_frame[i] = nil + pc_ids[i] = nil + end + v end + query_descriptors.compact! + data_frame.compact! + pc_ids.compact! prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors if prediction.nil? prediction = local_weighted_average substance, neighbors @@ -143,7 +135,6 @@ module OpenTox R.assign "weights", training_weights r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" =begin -=end rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) File.open("tmp.R","w+"){|f| f.puts "suppressPackageStartupMessages({ @@ -162,19 +153,21 @@ rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) f.puts "weights <- c(#{training_weights.join(', ')})" f.puts "features <- c(#{training_features.join(', ')})" f.puts "names(data) <- append(c('activities'),features)" # + f.puts "ctrl <- rfeControl(functions = #{method}, method = 'repeatedcv', repeats = 5, verbose = T)" + f.puts "lmProfile <- rfe(activities ~ ., data = data, rfeControl = ctrl)" + f.puts "model <- train(activities ~ ., data = data, method = '#{method}')" f.puts "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" f.puts "names(fingerprint) <- features" f.puts "prediction <- predict(model,fingerprint)" } +=end R.eval "data <- #{r_data_frame}" R.assign "features", training_features - p training_features.size - p R.eval("names(data)").to_ruby.size begin R.eval "names(data) <- append(c('activities'),features)" # - R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass)" + R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)" R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" R.eval "names(fingerprint) <- features" R.eval "prediction <- predict(model,fingerprint)" -- cgit v1.2.3 From f7e87b45f15083e5fcdea64821f06ed93ece4c4e Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 7 Jun 2016 18:07:28 +0200 Subject: (repeated)crossvalidation plots --- lib/crossvalidation.rb | 35 +++++++++++++++++++++++++++++++++++ lib/nanoparticle.rb | 1 - lib/regression.rb | 2 +- lib/validation-statistics.rb | 6 +++--- 4 files changed, 39 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 15e25a5..7aae3d2 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -71,6 +71,8 @@ module OpenTox class RepeatedCrossValidation < Validation field :crossvalidation_ids, type: Array, default: [] + field :correlation_plot_id, type: BSON::ObjectId + def self.create model, folds=10, repeats=3 repeated_cross_validation = self.new repeats.times do |n| @@ -80,9 +82,42 @@ module OpenTox repeated_cross_validation.save repeated_cross_validation end + def crossvalidations crossvalidation_ids.collect{|id| CrossValidation.find(id)} end + + def correlation_plot format: "png" + #unless correlation_plot_id + feature = Feature.find(crossvalidations.first.model.prediction_feature) + title = feature.name + title += "[#{feature.unit}]" if feature.unit and !feature.unit.blank? + tmpfile = "/tmp/#{id.to_s}_correlation.#{format}" + images = [] + crossvalidations.each_with_index do |cv,i| + x = [] + y = [] + cv.predictions.each do |sid,p| + x << p["value"] + y << p["measurements"].median + end + R.assign "measurement", x + R.assign "prediction", y + R.eval "all = c(measurement,prediction)" + R.eval "range = c(min(all), max(all))" + R.eval "image#{i} = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" + R.eval "image#{i} = image#{i} + geom_abline(intercept=0, slope=1)" + images << "image#{i}" + end + R.eval "pdf('#{tmpfile}')" + R.eval "grid.arrange(#{images.join ","},ncol=#{images.size})" + R.eval "dev.off()" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}") + correlation_plot_id = $gridfs.insert_one(file) + update(:correlation_plot_id => correlation_plot_id) + #end + $gridfs.find_one(_id: correlation_plot_id).data + end end end diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index c1bf1b5..d6261ee 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -10,7 +10,6 @@ module OpenTox attr_accessor :scaled_values def physchem_neighbors min_sim: 0.9, dataset_id:, prediction_feature_id:, relevant_features: - p name dataset = Dataset.find(dataset_id) #relevant_features = {} measurements = [] diff --git a/lib/regression.rb b/lib/regression.rb index c4c83d2..51317ac 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -122,7 +122,7 @@ module OpenTox pc_ids.compact! prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors if prediction.nil? - prediction = local_weighted_average substance, neighbors + prediction = local_weighted_average(substance: substance, neighbors: neighbors) prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." end p prediction diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 6b252b1..9aa9cff 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -136,9 +136,9 @@ module OpenTox } end - def correlation_plot + def correlation_plot format: "png" unless correlation_plot_id - tmpfile = "/tmp/#{id.to_s}_correlation.pdf" + tmpfile = "/tmp/#{id.to_s}_correlation.#{format}" x = [] y = [] feature = Feature.find(predictions.first.last["prediction_feature_id"]) @@ -155,7 +155,7 @@ module OpenTox R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" R.eval "image = image + geom_abline(intercept=0, slope=1)" R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.png") + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}") plot_id = $gridfs.insert_one(file) update(:correlation_plot_id => plot_id) end -- cgit v1.2.3 From 0f31c884d1bcfa448a1bf43a41d8fd6cf88bfc52 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 8 Jun 2016 18:26:07 +0200 Subject: compound tests fixed --- lib/classification.rb | 2 +- lib/compound.rb | 16 +++++++++------- lib/regression.rb | 6 +++--- 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'lib') diff --git a/lib/classification.rb b/lib/classification.rb index 0f3c6d9..2ccd7d1 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -3,7 +3,7 @@ module OpenTox class Classification - def self.weighted_majority_vote substance, neighbors + def self.weighted_majority_vote substance:, neighbors: sims = {} neighbors.each do |neighbor| sim = neighbor["similarity"] diff --git a/lib/compound.rb b/lib/compound.rb index 4541816..17cc240 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -257,12 +257,13 @@ module OpenTox def fingerprint_neighbors(type:, min_sim: 0.1, dataset_id:, prediction_feature_id:) neighbors = [] dataset = Dataset.find(dataset_id) - if type == DEFAULT_FINGERPRINT - neighbors = db_neighbors(min_sim: min_sim, dataset_id: dataset_id) - neighbors.each do |n| - n["measurements"] = dataset.values(n["_id"],prediction_feature_id) - end - else + # TODO: fix db_neighbors +# if type == DEFAULT_FINGERPRINT +# neighbors = db_neighbors(min_sim: min_sim, dataset_id: dataset_id) +# neighbors.each do |n| +# n["measurements"] = dataset.values(n["_id"],prediction_feature_id) +# end +# else query_fingerprint = self.fingerprint type dataset.compounds.each do |compound| values = dataset.values(compound,prediction_feature_id) @@ -271,7 +272,7 @@ module OpenTox sim = Algorithm::Similarity.tanimoto(query_fingerprint , candidate_fingerprint) neighbors << {"_id" => compound.id, "measurements" => values, "similarity" => sim} if sim >= min_sim end - end +# end end neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]} end @@ -294,6 +295,7 @@ module OpenTox # end def db_neighbors min_sim: 0.1, dataset_id: + p fingerprints[DEFAULT_FINGERPRINT] # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb #qn = default_fingerprint_size diff --git a/lib/regression.rb b/lib/regression.rb index 51317ac..d034d0b 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -18,7 +18,7 @@ module OpenTox {:value => prediction} end - def self.local_fingerprint_regression substance:, neighbors:, method: pls#, method_params="sigma=0.05" + def self.local_fingerprint_regression substance:, neighbors:, method: "pls" #, method_params="sigma=0.05" values = [] fingerprints = {} weights = [] @@ -55,7 +55,7 @@ module OpenTox substance_features = variables.collect{|f| substance.fingerprint.include?(f) ? "T" : "F"} prediction = r_model_prediction method, data_frame, variables, weights, substance_features if prediction.nil? or prediction[:value].nil? - prediction = local_weighted_average substance, neighbors + prediction = local_weighted_average(substance: substance, neighbors: neighbors) prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." prediction else @@ -68,7 +68,7 @@ module OpenTox end - def self.local_physchem_regression substance:, neighbors:, method: pls + def self.local_physchem_regression substance:, neighbors:, method: "pls" activities = [] weights = [] -- cgit v1.2.3 From f93aad7227c7bb3702fd28aab2d289f1ca9ce7e9 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 21 Jul 2016 17:35:20 +0200 Subject: correlation plot fixed --- lib/import.rb | 2 ++ lib/validation-statistics.rb | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/import.rb b/lib/import.rb index 4c49e5e..e187e3c 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -73,6 +73,8 @@ module OpenTox nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset end else + name = effect["endpoint"] + name = "log2(Net cell association)" if name == "Log2 transformed" # use a sensible name feature = klass.find_or_create_by( :name => effect["endpoint"], :unit => effect["result"]["unit"], diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 9aa9cff..3582c71 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -143,8 +143,8 @@ module OpenTox y = [] feature = Feature.find(predictions.first.last["prediction_feature_id"]) predictions.each do |sid,p| - x << p["value"] - y << p["measurements"].median + x << p["measurements"].median + y << p["value"] end R.assign "measurement", x R.assign "prediction", y -- cgit v1.2.3 From 46c628f1757ce8274a0b277b3ec3306609b38c14 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 25 Jul 2016 15:53:22 +0200 Subject: local_weighted_average fallback fixed, cv predictions pulled from validations to avoid mongo document size errors --- lib/crossvalidation.rb | 10 ++++++++-- lib/regression.rb | 4 ++-- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 7aae3d2..d7a1f08 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -18,7 +18,7 @@ module OpenTox cv.save # set created_at nr_instances = 0 nr_unpredicted = 0 - predictions = {} + #predictions = {} training_dataset = Dataset.find model.training_dataset_id training_dataset.folds(n).each_with_index do |fold,fold_nr| #fork do # parallel execution of validations can lead to Rserve and memory problems @@ -28,7 +28,7 @@ module OpenTox cv.validation_ids << validation.id cv.nr_instances += validation.nr_instances cv.nr_unpredicted += validation.nr_unpredicted - cv.predictions.merge! validation.predictions + #cv.predictions.merge! validation.predictions $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" #end end @@ -47,6 +47,12 @@ module OpenTox def validations validation_ids.collect{|vid| TrainTest.find vid} end + + def predictions + predictions = {} + validations.each{|v| predictions.merge!(v.predictions)} + predictions + end end class ClassificationCrossValidation < CrossValidation diff --git a/lib/regression.rb b/lib/regression.rb index d034d0b..269a743 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -48,7 +48,7 @@ module OpenTox end if variables.empty? - prediction = local_weighted_average substance, neighbors + prediction = local_weighted_average(substance: substance, neighbors: neighbors) prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." prediction else @@ -104,7 +104,7 @@ module OpenTox pc_ids.compact! if pc_ids.empty? - prediction = local_weighted_average substance, neighbors + prediction = local_weighted_average(substance: substance, neighbors: neighbors) prediction[:warning] = "No relevant variables for regression model. Using weighted average of similar substances." prediction else -- cgit v1.2.3 From 7313c5d26b5f3a672dac0494f16cdf0185f6a39f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 26 Jul 2016 13:21:57 +0200 Subject: NanoPrediction model --- lib/model.rb | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/model.rb b/lib/model.rb index 0432c56..5cf2cdb 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -283,10 +283,45 @@ module OpenTox prediction_model[:model_id] = model.id prediction_model[:prediction_feature_id] = prediction_feature.id prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id - prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id + #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id prediction_model.save prediction_model end + + end + + class NanoPrediction < Prediction + + def self.from_json_dump dir, category + Import::Enanomapper.import dir + + prediction_model = self.new( + :endpoint => "log2(Net cell association)", + :source => "https://data.enanomapper.net/", + :species => "A549 human lung epithelial carcinoma cells", + :unit => "log2(ug/Mg)" + ) + params = { + :feature_selection_algorithm => :correlation_filter, + :feature_selection_algorithm_parameters => {:category => category}, + :neighbor_algorithm => "physchem_neighbors", + :neighbor_algorithm_parameters => {:min_sim => 0.5}, + :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", + :prediction_algorithm_parameters => {:method => 'rf'}, # random forests + } + training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") + prediction_feature = Feature.find_or_create_by(name: "log2(Net cell association)", category: "TOX") + #prediction_feature = Feature.find("579621b84de73e267b414e55") + prediction_model[:prediction_feature_id] = prediction_feature.id + model = Model::LazarRegression.create(prediction_feature, training_dataset, params) + prediction_model[:model_id] = model.id + repeated_cv = Validation::RepeatedCrossValidation.create model + prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id + #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id + prediction_model.save + prediction_model + end + end end -- cgit v1.2.3 From 22eed169b6f156dc5a65c395f04866f349094f3e Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 26 Sep 2016 15:11:38 +0200 Subject: CACTUS_URI updated --- lib/compound.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index 17cc240..54a0364 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -1,4 +1,4 @@ -CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/" +CACTUS_URI="https://cactus.nci.nih.gov/chemical/structure/" module OpenTox -- cgit v1.2.3 From 96ca0eec8bfce8f95ea1d36de7ede61f7c12e517 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 26 Sep 2016 15:16:59 +0200 Subject: Chembl URI fixed --- lib/compound.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index 54a0364..deaace0 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -249,7 +249,7 @@ module OpenTox # @return [String] ChEMBL database compound id, derieved via restcall to chembl def chemblid # https://www.ebi.ac.uk/chembldb/ws#individualCompoundByInChiKey - uri = "http://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json" + uri = "https://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json" update(:chemblid => JSON.parse(RestClientWrapper.get(uri))["compounds"].first["chemblId"]) unless self["chemblid"] self["chemblid"] end -- cgit v1.2.3 From 2c54492126a501bd67ad59ef34792d0676396805 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 26 Sep 2016 16:20:37 +0200 Subject: pubchem uri fixed --- lib/compound.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index deaace0..4689d7a 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -241,7 +241,7 @@ module OpenTox # @return [String] PubChem Compound Identifier (CID), derieved via restcall to pubchem def cid - pug_uri = "http://pubchem.ncbi.nlm.nih.gov/rest/pug/" + pug_uri = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/" update(:cid => RestClientWrapper.post(File.join(pug_uri, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"] self["cid"] end -- cgit v1.2.3 From 9e8537997d84e78e6545a66a0d09c33e76c8b7cf Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 30 Sep 2016 17:11:30 +0200 Subject: npo uri as source, spectral count unit f proteomics features --- lib/import.rb | 31 +++++++++++++++++++++++++------ lib/nanoparticle.rb | 18 ++++-------------- 2 files changed, 29 insertions(+), 20 deletions(-) (limited to 'lib') diff --git a/lib/import.rb b/lib/import.rb index e187e3c..17894a9 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -62,24 +62,43 @@ module OpenTox np["bundles"].keys.each do |bundle_uri| nanoparticle.dataset_ids << datasets[bundle_uri].id end + dataset = datasets[np["bundles"].keys.first] proteomics_features = {} + category = study["protocol"]["topcategory"] + source = study["protocol"]["category"]["term"] + study["effects"].each do |effect| + effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature effect["conditions"].delete_if { |k, v| v.nil? } + if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data + JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step - proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics") + proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source) nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset end else name = effect["endpoint"] - name = "log2(Net cell association)" if name == "Log2 transformed" # use a sensible name + unit = effect["result"]["unit"] + warnings = [] + case name + when "Log2 transformed" # use a sensible name + name = "log2(Net cell association)" + warnings = ["Original name was 'Log2 transformed'"] + unit = "log2(mL/ug(Mg))" + when "Total protein (BCA assay)" + category = "P-CHEM" + warnings = ["Category changed from TOX to P-CHEM"] + end feature = klass.find_or_create_by( - :name => effect["endpoint"], - :unit => effect["result"]["unit"], - :category => study["protocol"]["topcategory"], - :conditions => effect["conditions"] + :name => name, + :unit => unit, + :category => category, + :conditions => effect["conditions"], + :source => study["protocol"]["category"]["term"], + :warnings => warnings ) nanoparticle.parse_ambit_value feature, effect["result"], dataset end diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index d6261ee..b1a3835 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -5,7 +5,7 @@ module OpenTox field :core, type: Hash, default: {} field :coating, type: Array, default: [] - field :proteomics, type: Hash, default: {} + #field :proteomics, type: Hash, default: {} attr_accessor :scaled_values @@ -63,26 +63,16 @@ module OpenTox physchem_descriptors[feature.id.to_s] << value physchem_descriptors[feature.id.to_s].uniq! when "Proteomics" - #proteomics[feature.id.to_s] ||= [] - #proteomics[feature.id.to_s] << value - #proteomics[feature.id.to_s].uniq! physchem_descriptors[feature.id.to_s] ||= [] physchem_descriptors[feature.id.to_s] << value physchem_descriptors[feature.id.to_s].uniq! when "TOX" - if feature.name == "Total protein (BCA assay)" - physchem_descriptors[feature.id.to_s] ||= [] - physchem_descriptors[feature.id.to_s] << value - physchem_descriptors[feature.id.to_s].uniq! - else - dataset.add self, feature, value - end - dataset.save - dataset_ids << dataset.id - dataset_ids.uniq! + dataset.add self, feature, value else warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted." end + dataset_ids << dataset.id + dataset_ids.uniq! end end -- cgit v1.2.3 From adefea0e78a4f05a2c9537e643873ad61fc22a0a Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 3 Oct 2016 19:49:55 +0200 Subject: initial model creation tests --- lib/classification.rb | 2 + lib/model.rb | 120 +++++++++++++++++++++++++++----------------------- lib/opentox.rb | 5 +-- 3 files changed, 67 insertions(+), 60 deletions(-) (limited to 'lib') diff --git a/lib/classification.rb b/lib/classification.rb index 2ccd7d1..03c32c4 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -25,7 +25,9 @@ module OpenTox prediction = probabilities.key(p_max) {:value => prediction,:probabilities => probabilities} end + end + end end diff --git a/lib/model.rb b/lib/model.rb index 5cf2cdb..749611e 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -13,31 +13,73 @@ module OpenTox field :creator, type: String, default: __FILE__ field :training_dataset_id, type: BSON::ObjectId field :prediction_feature_id, type: BSON::ObjectId - - field :prediction_algorithm, type: String - field :prediction_algorithm_parameters, type: Hash, default: {} - - field :neighbor_algorithm, type: String - field :neighbor_algorithm_parameters, type: Hash, default: {} - field :feature_selection_algorithm, type: String - field :feature_selection_algorithm_parameters, type: Hash, default: {} + field :algorithms, type: Hash field :relevant_features, type: Hash - - # Create a lazar model from a training_dataset and a feature_dataset - # @param [OpenTox::Dataset] training_dataset - # @return [OpenTox::Model::Lazar] Regression or classification model - def initialize prediction_feature, training_dataset, params={} - super params + + def self.create prediction_feature:nil, training_dataset:nil, algorithms:{} + bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset + prediction_feature = training_dataset.features.first unless prediction_feature + # TODO: prediction_feature without training_dataset: use all available data + # explicit prediction algorithm + if algorithms[:prediction] and algorithms[:prediction][:method] + case algorithms[:prediction][:method] + when /Classifiction/ + model = LazarClassification.new + when /Regression/ + model = LazarRegression.new + end + # guess model type + elsif prediction_feature.numeric? + model = LazarRegression.new + else + model = LazarClassification.new + end + # set defaults + if model.class == LazarClassification + model.algorithms = { + :similarity => { + :descriptors => "fingerprint['MP2D']", + :method => "Algorithm::Similarity.tanimoto", + :min => 0.1 + }, + :prediction => { + :descriptors => "fingerprint['MP2D']", + :method => "Algorithm::Classification.weighted_majority_vote", + }, + :feature_selection => nil, + } + elsif model.class == LazarRegression + model.algorithms = { + :similarity => { + :descriptors => "fingerprint['MP2D']", + :method => "Algorithm::Similarity.tanimoto", + :min => 0.1 + }, + :prediction => { + :descriptors => "fingerprint['MP2D']", + :method => "Algorithm::Regression.local_caret", + :parameters => "pls", + }, + :feature_selection => nil, + } + end + + # overwrite defaults + algorithms.each do |type,parameters| + parameters.each do |p,v| + model.algorithms[type][p] = v + end if parameters + end # set defaults for empty parameters - self.prediction_feature_id ||= prediction_feature.id - self.training_dataset_id ||= training_dataset.id - self.name ||= "#{training_dataset.name} #{prediction_feature.name}" - self.neighbor_algorithm_parameters ||= {} - self.neighbor_algorithm_parameters[:dataset_id] = training_dataset.id - - send(feature_selection_algorithm.to_sym) if feature_selection_algorithm - save + model.prediction_feature_id = prediction_feature.id + model.training_dataset_id = training_dataset.id + model.name = "#{training_dataset.name} #{prediction_feature.name}" + + #send(feature_selection_algorithm.to_sym) if feature_selection_algorithm + model.save + p model + model end def correlation_filter @@ -181,45 +223,11 @@ module OpenTox end class LazarClassification < Lazar - - def self.create prediction_feature, training_dataset, params={} - model = self.new prediction_feature, training_dataset, params - model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm - model.neighbor_algorithm ||= "fingerprint_neighbors" - model.neighbor_algorithm_parameters ||= {} - { - :type => "MP2D", - :dataset_id => training_dataset.id, - :prediction_feature_id => prediction_feature.id, - :min_sim => 0.1 - }.each do |key,value| - model.neighbor_algorithm_parameters[key] ||= value - end - model.save - model - end end class LazarRegression < Lazar - def self.create prediction_feature, training_dataset, params={} - model = self.new prediction_feature, training_dataset, params - model.neighbor_algorithm ||= "fingerprint_neighbors" - model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression" - model.neighbor_algorithm_parameters ||= {} - { - :min_sim => 0.1, - :dataset_id => training_dataset.id, - :prediction_feature_id => prediction_feature.id, - }.each do |key,value| - model.neighbor_algorithm_parameters[key] ||= value - end - model.neighbor_algorithm_parameters[:type] ||= "MP2D" if training_dataset.substances.first.is_a? Compound - model.save - model - end - end class Prediction diff --git a/lib/opentox.rb b/lib/opentox.rb index 7d8a8a2..5c300cf 100644 --- a/lib/opentox.rb +++ b/lib/opentox.rb @@ -1,8 +1,6 @@ module OpenTox - # Ruby interface - - # create default OpenTox classes (defined in opentox-client.rb) + # create default OpenTox classes # provides Mongoid's query and persistence methods # http://mongoid.org/en/mongoid/docs/persistence.html # http://mongoid.org/en/mongoid/docs/querying.html @@ -25,4 +23,3 @@ module OpenTox end end - -- cgit v1.2.3 From 5d4e5e463c2b87241bbb56e4658e1e26c0ed084f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 5 Oct 2016 13:22:12 +0200 Subject: substance and nanoparticle model creation and predictions --- lib/algorithm.rb | 13 +---- lib/classification.rb | 2 +- lib/compound.rb | 12 +++-- lib/feature_selection.rb | 46 ++++++++++++++++ lib/lazar.rb | 3 +- lib/model.rb | 135 +++++++++++++++++++++++------------------------ lib/nanoparticle.rb | 25 ++++----- lib/regression.rb | 67 ++++++++++++++++++++--- lib/similarity.rb | 15 +++--- lib/substance.rb | 63 +++++++++++++++++++++- 10 files changed, 265 insertions(+), 116 deletions(-) create mode 100644 lib/feature_selection.rb (limited to 'lib') diff --git a/lib/algorithm.rb b/lib/algorithm.rb index 113f847..0e4b93a 100644 --- a/lib/algorithm.rb +++ b/lib/algorithm.rb @@ -2,18 +2,9 @@ module OpenTox module Algorithm - # Generic method to execute algorithms - # Algorithms should: - # - accept a Compound, an Array of Compounds or a Dataset as first argument - # - optional parameters as second argument - # - return an object corresponding to the input type as result (eg. Compound -> value, Array of Compounds -> Array of values, Dataset -> Dataset with values - # @param [OpenTox::Compound,Array,OpenTox::Dataset] Input object - # @param [Hash] Algorithm parameters - # @return Algorithm result - def self.run algorithm, object, parameters=nil - bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm." unless algorithm =~ /^OpenTox::Algorithm/ + def self.run algorithm, parameters=nil klass,method = algorithm.split('.') - parameters.nil? ? Object.const_get(klass).send(method,object) : Object.const_get(klass).send(method,object, parameters) + Object.const_get(klass).send(method,parameters) end end diff --git a/lib/classification.rb b/lib/classification.rb index 03c32c4..01ba878 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -3,7 +3,7 @@ module OpenTox class Classification - def self.weighted_majority_vote substance:, neighbors: + def self.weighted_majority_vote descriptors:nil, neighbors: sims = {} neighbors.each do |neighbor| sim = neighbor["similarity"] diff --git a/lib/compound.rb b/lib/compound.rb index 4689d7a..4d62c53 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -75,9 +75,9 @@ module OpenTox fingerprints[type] end - def physchem descriptors=PhysChem.openbabel_descriptors + def calculated_physchem descriptors=PhysChem.openbabel_descriptors # TODO: speedup java descriptors - calculated_ids = physchem_descriptors.keys + calculated_ids = descriptors.keys # BSON::ObjectId instances are not allowed as keys in a BSON document. new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids descs = {} @@ -90,11 +90,11 @@ module OpenTox # avoid recalculating Cdk features with multiple values descs.keys.uniq.each do |k| descs[k].send(k[0].downcase,k[1],self).each do |n,v| - physchem_descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. + descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. end end save - physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} + descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} end def smarts_match smarts, count=false @@ -254,6 +254,7 @@ module OpenTox self["chemblid"] end +=begin def fingerprint_neighbors(type:, min_sim: 0.1, dataset_id:, prediction_feature_id:) neighbors = [] dataset = Dataset.find(dataset_id) @@ -276,6 +277,7 @@ module OpenTox end neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]} end +=end # def physchem_neighbors params # # TODO: fix, tests @@ -340,7 +342,7 @@ module OpenTox # @return [Float] molecular weight def molecular_weight mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW") - physchem([mw_feature])[mw_feature.id.to_s] + calculated_physchem([mw_feature])[mw_feature.id.to_s] end private diff --git a/lib/feature_selection.rb b/lib/feature_selection.rb new file mode 100644 index 0000000..43e3bea --- /dev/null +++ b/lib/feature_selection.rb @@ -0,0 +1,46 @@ +module OpenTox + module Algorithm + + class FeatureSelection + + def self.correlation_filter dataset:, prediction_feature:, types:nil + # TODO: speedup, single assignment of all features to R+ parallel computation of significance? + relevant_features = {} + measurements = [] + substances = [] + dataset.substances.each do |s| + dataset.values(s,prediction_feature).each do |act| + measurements << act + substances << s + end + end + R.assign "tox", measurements + feature_ids = dataset.substances.collect{ |s| s["properties"].keys}.flatten.uniq + feature_ids.select!{|fid| types.include? Feature.find(fid).category} if types + feature_ids.each do |feature_id| + feature_values = substances.collect{|s| s["properties"][feature_id].first if s["properties"][feature_id]} + unless feature_values.uniq.size == 1 + R.assign "feature", feature_values + begin + R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')" + pvalue = R.eval("cor$p.value").to_ruby + if pvalue <= 0.05 + r = R.eval("cor$estimate").to_ruby + relevant_features[feature_id] = {} + relevant_features[feature_id]["pvalue"] = pvalue + relevant_features[feature_id]["r"] = r + relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby + relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby + end + rescue + warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed." + end + end + end + relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h + end + + end + + end +end diff --git a/lib/lazar.rb b/lib/lazar.rb index 46605d3..d0f05c0 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -78,7 +78,8 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation"," "nanoparticle.rb", "dataset.rb", "algorithm.rb", - "similarity", + "similarity.rb", + "feature_selection.rb", "model.rb", "classification.rb", "regression.rb", diff --git a/lib/model.rb b/lib/model.rb index 749611e..a272580 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -28,101 +28,91 @@ module OpenTox when /Regression/ model = LazarRegression.new end + # guess model type elsif prediction_feature.numeric? model = LazarRegression.new else model = LazarClassification.new end + # set defaults - if model.class == LazarClassification + substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq + bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1 + + if substance_classes.first == "OpenTox::Compound" + model.algorithms = { + :descriptors => { + :method => "fingerprint", + :type => 'MP2D', + }, :similarity => { - :descriptors => "fingerprint['MP2D']", :method => "Algorithm::Similarity.tanimoto", :min => 0.1 }, - :prediction => { - :descriptors => "fingerprint['MP2D']", - :method => "Algorithm::Classification.weighted_majority_vote", - }, - :feature_selection => nil, + :feature_selection => nil } - elsif model.class == LazarRegression + + if model.class == LazarClassification + model.algorithms[:prediction] = { + :method => "Algorithm::Classification.weighted_majority_vote", + } + elsif model.class == LazarRegression + model.algorithms[:prediction] = { + :method => "Algorithm::Regression.caret", + :parameters => "pls", + } + end + + elsif substance_classes.first == "OpenTox::Nanoparticle" model.algorithms = { + :descriptors => { + :method => "properties", + #:types => ["P-CHEM","Proteomics"], + :types => ["P-CHEM"], + }, :similarity => { - :descriptors => "fingerprint['MP2D']", - :method => "Algorithm::Similarity.tanimoto", - :min => 0.1 + :method => "Algorithm::Similarity.weighted_cosine", + :min => 0.5 }, :prediction => { - :descriptors => "fingerprint['MP2D']", - :method => "Algorithm::Regression.local_caret", - :parameters => "pls", + :method => "Algorithm::Regression.caret", + :parameters => "rf", + }, + :feature_selection => { + :method => "Algorithm::FeatureSelection.correlation_filter", }, - :feature_selection => nil, } + else + bad_request_error "Cannot create models for #{substance_classes.first}." end - # overwrite defaults + # overwrite defaults with explicit parameters algorithms.each do |type,parameters| - parameters.each do |p,v| - model.algorithms[type][p] = v - end if parameters + if parameters and parameters.is_a? Hash + parameters.each do |p,v| + model.algorithms[type] ||= {} + model.algorithms[type][p] = v + end + else + model.algorithms[type] = parameters + end end - # set defaults for empty parameters model.prediction_feature_id = prediction_feature.id model.training_dataset_id = training_dataset.id model.name = "#{training_dataset.name} #{prediction_feature.name}" - #send(feature_selection_algorithm.to_sym) if feature_selection_algorithm + if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method] + model.relevant_features = Algorithm.run model.algorithms[:feature_selection][:method], dataset: training_dataset, prediction_feature: prediction_feature, types: model.algorithms[:descriptors][:types] + end model.save - p model model end - def correlation_filter - # TODO: speedup, single assignment of all features to R+ parallel computation of significance? - self.relevant_features = {} - measurements = [] - substances = [] - training_dataset.substances.each do |s| - training_dataset.values(s,prediction_feature_id).each do |act| - measurements << act - substances << s - end - end - R.assign "tox", measurements - feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq - feature_ids.select!{|fid| Feature.find(fid).category == feature_selection_algorithm_parameters[:category]} if feature_selection_algorithm_parameters[:category] - feature_ids.each do |feature_id| - feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id].first if s["physchem_descriptors"][feature_id]} - unless feature_values.uniq.size == 1 - R.assign "feature", feature_values - begin - R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')" - pvalue = R.eval("cor$p.value").to_ruby - if pvalue <= 0.05 - r = R.eval("cor$estimate").to_ruby - self.relevant_features[feature_id] = {} - self.relevant_features[feature_id]["pvalue"] = pvalue - self.relevant_features[feature_id]["r"] = r - self.relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby - self.relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby - end - rescue - warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed." - end - end - end - self.relevant_features = self.relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h - end - def predict_substance substance - neighbor_algorithm_parameters = Hash[self.neighbor_algorithm_parameters.map{ |k, v| [k.to_sym, v] }] # convert string keys to symbols - neighbor_algorithm_parameters[:relevant_features] = self.relevant_features if self.relevant_features - neighbors = substance.send(neighbor_algorithm, neighbor_algorithm_parameters) + neighbors = substance.neighbors dataset_id: training_dataset_id, prediction_feature_id: prediction_feature_id, descriptors: algorithms[:descriptors], similarity: algorithms[:similarity], relevant_features: relevant_features measurements = nil prediction = {} # handle query substance @@ -153,9 +143,17 @@ module OpenTox prediction.merge!({:value => value, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting median of its experimental values.", :neighbors => neighbors}) if value else # call prediction algorithm - klass,method = prediction_algorithm.split('.') - params = prediction_algorithm_parameters.merge({:substance => substance, :neighbors => neighbors}) - result = Object.const_get(klass).send(method,params) + case algorithms[:descriptors][:method] + when "fingerprint" + descriptors = substance.fingerprints[algorithms[:descriptors][:type]] + when "properties" + descriptors = substance.properties + else + bad_request_error "Descriptor method '#{algorithms[:descriptors][:method]}' not available." + end + params = algorithms[:prediction].merge({:descriptors => descriptors, :neighbors => neighbors}) + params.delete :method + result = Algorithm.run algorithms[:prediction][:method], params prediction.merge! result prediction[:neighbors] = neighbors prediction[:neighbors] ||= [] @@ -176,7 +174,7 @@ module OpenTox elsif object.is_a? Dataset substances = object.substances else - bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter." + bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter." end # make predictions @@ -194,7 +192,6 @@ module OpenTox elsif object.is_a? Array return predictions elsif object.is_a? Dataset - #predictions.each{|cid,p| p.delete(:neighbors)} # prepare prediction dataset measurement_feature = Feature.find prediction_feature_id @@ -205,8 +202,6 @@ module OpenTox :prediction_feature_id => prediction_feature.id, :predictions => predictions ) - - #prediction_dataset.save return prediction_dataset end @@ -314,7 +309,7 @@ module OpenTox :feature_selection_algorithm_parameters => {:category => category}, :neighbor_algorithm => "physchem_neighbors", :neighbor_algorithm_parameters => {:min_sim => 0.5}, - :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", + :prediction_algorithm => "OpenTox::Algorithm::Regression.physchem_regression", :prediction_algorithm_parameters => {:method => 'rf'}, # random forests } training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index b1a3835..6905f6f 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -5,10 +5,10 @@ module OpenTox field :core, type: Hash, default: {} field :coating, type: Array, default: [] - #field :proteomics, type: Hash, default: {} attr_accessor :scaled_values +=begin def physchem_neighbors min_sim: 0.9, dataset_id:, prediction_feature_id:, relevant_features: dataset = Dataset.find(dataset_id) #relevant_features = {} @@ -27,12 +27,12 @@ module OpenTox substances.each do |substance| values = dataset.values(substance,prediction_feature_id) if values - common_descriptors = relevant_features.keys & substance.physchem_descriptors.keys + common_descriptors = relevant_features.keys & substance.descriptors.keys # scale values - query_descriptors = common_descriptors.collect{|d| (physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} - @scaled_values = common_descriptors.collect{|d| [d,(physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h - neighbor_descriptors = common_descriptors.collect{|d| (substance.physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} - neighbor_scaled_values = common_descriptors.collect{|d| [d,(substance.physchem_descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h + query_descriptors = common_descriptors.collect{|d| (descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} + @scaled_values = common_descriptors.collect{|d| [d,(descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h + neighbor_descriptors = common_descriptors.collect{|d| (substance.descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} + neighbor_scaled_values = common_descriptors.collect{|d| [d,(substance.descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h #weights = common_descriptors.collect{|d| 1-relevant_features[d]["p_value"]} weights = common_descriptors.collect{|d| relevant_features[d]["r"]**2} sim = Algorithm::Similarity.weighted_cosine(query_descriptors,neighbor_descriptors,weights) @@ -54,18 +54,19 @@ module OpenTox neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} neighbors end +=end def add_feature feature, value, dataset unless feature.name == "ATOMIC COMPOSITION" or feature.name == "FUNCTIONAL GROUP" # redundand case feature.category when "P-CHEM" - physchem_descriptors[feature.id.to_s] ||= [] - physchem_descriptors[feature.id.to_s] << value - physchem_descriptors[feature.id.to_s].uniq! + properties[feature.id.to_s] ||= [] + properties[feature.id.to_s] << value + properties[feature.id.to_s].uniq! when "Proteomics" - physchem_descriptors[feature.id.to_s] ||= [] - physchem_descriptors[feature.id.to_s] << value - physchem_descriptors[feature.id.to_s].uniq! + properties[feature.id.to_s] ||= [] + properties[feature.id.to_s] << value + properties[feature.id.to_s].uniq! when "TOX" dataset.add self, feature, value else diff --git a/lib/regression.rb b/lib/regression.rb index 269a743..396c9e4 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -3,7 +3,8 @@ module OpenTox class Regression - def self.local_weighted_average substance:, neighbors: + def self.weighted_average descriptors:nil, neighbors:, parameters:nil + # TODO: prediction_interval weighted_sum = 0.0 sim_sum = 0.0 neighbors.each do |neighbor| @@ -18,7 +19,57 @@ module OpenTox {:value => prediction} end - def self.local_fingerprint_regression substance:, neighbors:, method: "pls" #, method_params="sigma=0.05" + def self.caret descriptors:, neighbors:, method: "pls", parameters:nil + values = [] + descriptors = {} + weights = [] + descriptor_ids = neighbors.collect{|n| n["descriptors"]}.flatten.uniq.sort + + neighbors.each do |n| + activities = n["measurements"] + activities.each do |act| + values << act + weights << n["similarity"] + descriptor_ids.each do |id| + descriptors[id] ||= [] + descriptors[id] << n["descriptors"].include?(id) + end + end if activities + end + + variables = [] + data_frame = [values] + + descriptors.each do |k,v| + unless v.uniq.size == 1 + data_frame << v.collect{|m| m ? "T" : "F"} + variables << k + end + end + + if variables.empty? + prediction = weighted_average(descriptors: descriptors, neighbors: neighbors) + prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." + prediction + else + substance_features = variables.collect{|f| descriptors.include?(f) ? "T" : "F"} + #puts data_frame.to_yaml + prediction = r_model_prediction method, data_frame, variables, weights, substance_features + if prediction.nil? or prediction[:value].nil? + prediction = weighted_average(descriptors: descriptors, neighbors: neighbors) + prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances." + prediction + else + prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]] + prediction[:value] = prediction[:value] + prediction[:rmse] = prediction[:rmse] + prediction + end + end + + end + + def self.fingerprint_regression substance:, neighbors:, method: "pls" #, method_params="sigma=0.05" values = [] fingerprints = {} weights = [] @@ -48,14 +99,14 @@ module OpenTox end if variables.empty? - prediction = local_weighted_average(substance: substance, neighbors: neighbors) + prediction = weighted_average(substance: substance, neighbors: neighbors) prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." prediction else substance_features = variables.collect{|f| substance.fingerprint.include?(f) ? "T" : "F"} prediction = r_model_prediction method, data_frame, variables, weights, substance_features if prediction.nil? or prediction[:value].nil? - prediction = local_weighted_average(substance: substance, neighbors: neighbors) + prediction = weighted_average(substance: substance, neighbors: neighbors) prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." prediction else @@ -68,7 +119,8 @@ module OpenTox end - def self.local_physchem_regression substance:, neighbors:, method: "pls" +=begin + def self.physchem_regression substance:, neighbors:, method: "pls" activities = [] weights = [] @@ -104,7 +156,7 @@ module OpenTox pc_ids.compact! if pc_ids.empty? - prediction = local_weighted_average(substance: substance, neighbors: neighbors) + prediction = weighted_average(substance: substance, neighbors: neighbors) prediction[:warning] = "No relevant variables for regression model. Using weighted average of similar substances." prediction else @@ -122,7 +174,7 @@ module OpenTox pc_ids.compact! prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors if prediction.nil? - prediction = local_weighted_average(substance: substance, neighbors: neighbors) + prediction = weighted_average(substance: substance, neighbors: neighbors) prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." end p prediction @@ -130,6 +182,7 @@ module OpenTox end end +=end def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values R.assign "weights", training_weights diff --git a/lib/similarity.rb b/lib/similarity.rb index 00179c1..b9b4571 100644 --- a/lib/similarity.rb +++ b/lib/similarity.rb @@ -15,21 +15,22 @@ module OpenTox class Similarity - def self.tanimoto a, b - ( a & b).size/(a|b).size.to_f + def self.tanimoto fingerprints + ( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f end - def self.euclid a, b - sq = a.zip(b).map{|a,b| (a - b) ** 2} + def self.euclid fingerprints + sq = fingerprints[0].zip(fingerprints[1]).map{|a,b| (a - b) ** 2} Math.sqrt(sq.inject(0) {|s,c| s + c}) end # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity - def self.cosine a, b - Algorithm::Vector.dot_product(a, b) / (Algorithm::Vector.magnitude(a) * Algorithm::Vector.magnitude(b)) + def self.cosine fingerprints + Algorithm::Vector.dot_product(fingerprints[0], fingerprints[1]) / (Algorithm::Vector.magnitude(fingerprints[0]) * Algorithm::Vector.magnitude(fingerprints[1])) end - def self.weighted_cosine(a, b, w) + def self.weighted_cosine fingerprints # [a,b,weights] + a, b, w = fingerprints dot_product = 0 magnitude_a = 0 magnitude_b = 0 diff --git a/lib/substance.rb b/lib/substance.rb index 6768ce7..d271327 100644 --- a/lib/substance.rb +++ b/lib/substance.rb @@ -1,9 +1,68 @@ module OpenTox class Substance - field :physchem_descriptors, type: Hash, default: {} + field :properties, type: Hash, default: {} field :dataset_ids, type: Array, default: [] end -end + def neighbors dataset_id:,prediction_feature_id:,descriptors:,similarity:,relevant_features:nil + # TODO enable empty dataset_id -> use complete db + case descriptors[:method] + when "fingerprint" + fingerprint_neighbors dataset_id:dataset_id, prediction_feature_id:prediction_feature_id, descriptors:descriptors, similarity:similarity + when "properties" + properties_neighbors dataset_id:dataset_id, prediction_feature_id:prediction_feature_id, descriptors:descriptors, similarity:similarity, relevant_features: relevant_features + else + bad_request_error "Descriptor method '#{descriptors[:method]}' not implemented." + end + end + + def fingerprint_neighbors dataset_id:,prediction_feature_id:,descriptors:,similarity: + neighbors = [] + dataset = Dataset.find(dataset_id) + dataset.substances.each do |substance| + values = dataset.values(substance,prediction_feature_id) + if values + query_descriptors = self.send(descriptors[:method].to_sym, descriptors[:type]) + candidate_descriptors = substance.send(descriptors[:method].to_sym, descriptors[:type]) + sim = Algorithm.run similarity[:method], [query_descriptors, candidate_descriptors] + neighbors << {"_id" => substance.id, "measurements" => values, "descriptors" => candidate_descriptors, "similarity" => sim} if sim >= similarity[:min] + end + end + neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]} + end + def properties_neighbors dataset_id:,prediction_feature_id:,descriptors:,similarity:,relevant_features: + neighbors = [] + dataset = Dataset.find(dataset_id) + weights = relevant_features.collect{|k,v| v["r"]**2} + means = relevant_features.collect{|k,v| v["mean"]} + standard_deviations = relevant_features.collect{|k,v| v["sd"]} + query_descriptors = relevant_features.keys.collect{|i| properties[i].is_a?(Array) ? properties[i].median : nil } + dataset.substances.each do |substance| + values = dataset.values(substance,prediction_feature_id) + # exclude nanoparticles with different core + # TODO validate exclusion + next if substance.is_a? Nanoparticle and substance.core != self.core + if values + candidate_descriptors = relevant_features.keys.collect{|i| substance.properties[i].is_a?(Array) ? substance.properties[i].median : nil } + q = [] + c = [] + w = [] + (0..relevant_features.size-1).each do |i| + # add only complete pairs + if query_descriptors[i] and candidate_descriptors[i] + w << weights[i] + # scale values + q << (query_descriptors[i] - means[i])/standard_deviations[i] + c << (candidate_descriptors[i] - means[i])/standard_deviations[i] + end + end + sim = Algorithm.run similarity[:method], [q, c, w] + neighbors << {"_id" => substance.id, "measurements" => values, "descriptors" => candidate_descriptors, "similarity" => sim} if sim >= similarity[:min] + end + end + neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]} + end + +end -- cgit v1.2.3 From 016403f7db0dedf8237f29af41312b5ff2720c30 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 5 Oct 2016 14:10:25 +0200 Subject: compound and descriptor tests fixed --- lib/compound.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index 4d62c53..93cfc03 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -77,7 +77,7 @@ module OpenTox def calculated_physchem descriptors=PhysChem.openbabel_descriptors # TODO: speedup java descriptors - calculated_ids = descriptors.keys + calculated_ids = properties.keys # BSON::ObjectId instances are not allowed as keys in a BSON document. new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids descs = {} @@ -90,11 +90,11 @@ module OpenTox # avoid recalculating Cdk features with multiple values descs.keys.uniq.each do |k| descs[k].send(k[0].downcase,k[1],self).each do |n,v| - descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. + properties[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. end end save - descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} + properties.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} end def smarts_match smarts, count=false -- cgit v1.2.3 From 4348eec89033e6677c9f628646fc67bd03c73fe6 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 6 Oct 2016 19:14:10 +0200 Subject: nano caret regression fixed --- lib/lazar.rb | 1 + lib/model.rb | 64 ++++++------- lib/regression.rb | 220 ------------------------------------------- lib/train-test-validation.rb | 5 +- 4 files changed, 31 insertions(+), 259 deletions(-) (limited to 'lib') diff --git a/lib/lazar.rb b/lib/lazar.rb index d0f05c0..f251379 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -83,6 +83,7 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation"," "model.rb", "classification.rb", "regression.rb", + "caret.rb", "validation-statistics.rb", "validation.rb", "train-test-validation.rb", diff --git a/lib/model.rb b/lib/model.rb index a272580..290309a 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -23,10 +23,12 @@ module OpenTox # explicit prediction algorithm if algorithms[:prediction] and algorithms[:prediction][:method] case algorithms[:prediction][:method] - when /Classifiction/ + when /Classification/i model = LazarClassification.new - when /Regression/ + when /Regression/i model = LazarRegression.new + else + bad_request_error "Prediction method '#{algorithms[:prediction][:method]}' not implemented." end # guess model type @@ -36,6 +38,10 @@ module OpenTox model = LazarClassification.new end + model.prediction_feature_id = prediction_feature.id + model.training_dataset_id = training_dataset.id + model.name = "#{training_dataset.name} #{prediction_feature.name}" + # set defaults substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1 @@ -60,7 +66,7 @@ module OpenTox } elsif model.class == LazarRegression model.algorithms[:prediction] = { - :method => "Algorithm::Regression.caret", + :method => "Algorithm::Caret.regression", :parameters => "pls", } end @@ -77,7 +83,7 @@ module OpenTox :min => 0.5 }, :prediction => { - :method => "Algorithm::Regression.caret", + :method => "Algorithm::Caret.regression", :parameters => "rf", }, :feature_selection => { @@ -100,10 +106,6 @@ module OpenTox end end - model.prediction_feature_id = prediction_feature.id - model.training_dataset_id = training_dataset.id - model.name = "#{training_dataset.name} #{prediction_feature.name}" - if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method] model.relevant_features = Algorithm.run model.algorithms[:feature_selection][:method], dataset: training_dataset, prediction_feature: prediction_feature, types: model.algorithms[:descriptors][:types] end @@ -151,8 +153,12 @@ module OpenTox else bad_request_error "Descriptor method '#{algorithms[:descriptors][:method]}' not available." end - params = algorithms[:prediction].merge({:descriptors => descriptors, :neighbors => neighbors}) - params.delete :method + params = { + :method => algorithms[:prediction][:parameters], + :descriptors => descriptors, + :neighbors => neighbors, + :relevant_features => relevant_features + } result = Algorithm.run algorithms[:prediction][:method], params prediction.merge! result prediction[:neighbors] = neighbors @@ -218,11 +224,9 @@ module OpenTox end class LazarClassification < Lazar - end class LazarRegression < Lazar - end class Prediction @@ -240,7 +244,7 @@ module OpenTox field :leave_one_out_validation_id, type: BSON::ObjectId def predict object - Lazar.find(model_id).predict object + model.predict object end def training_dataset @@ -251,6 +255,10 @@ module OpenTox Lazar.find model_id end + def prediction_feature + model.prediction_feature + end + def repeated_crossvalidation Validation::RepeatedCrossValidation.find repeated_crossvalidation_id end @@ -276,15 +284,8 @@ module OpenTox bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file prediction_model = self.new JSON.parse(File.read(metadata_file)) training_dataset = Dataset.from_csv_file file - prediction_feature = training_dataset.features.first - model = nil - if prediction_feature.nominal? - model = LazarClassification.create prediction_feature, training_dataset - elsif prediction_feature.numeric? - model = LazarRegression.create prediction_feature, training_dataset - end + model = Lazar.create training_dataset: training_dataset prediction_model[:model_id] = model.id - prediction_model[:prediction_feature_id] = prediction_feature.id prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id prediction_model.save @@ -297,26 +298,19 @@ module OpenTox def self.from_json_dump dir, category Import::Enanomapper.import dir - + training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + unless training_dataset + Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm") + training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + end prediction_model = self.new( :endpoint => "log2(Net cell association)", :source => "https://data.enanomapper.net/", :species => "A549 human lung epithelial carcinoma cells", :unit => "log2(ug/Mg)" ) - params = { - :feature_selection_algorithm => :correlation_filter, - :feature_selection_algorithm_parameters => {:category => category}, - :neighbor_algorithm => "physchem_neighbors", - :neighbor_algorithm_parameters => {:min_sim => 0.5}, - :prediction_algorithm => "OpenTox::Algorithm::Regression.physchem_regression", - :prediction_algorithm_parameters => {:method => 'rf'}, # random forests - } - training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles") - prediction_feature = Feature.find_or_create_by(name: "log2(Net cell association)", category: "TOX") - #prediction_feature = Feature.find("579621b84de73e267b414e55") - prediction_model[:prediction_feature_id] = prediction_feature.id - model = Model::LazarRegression.create(prediction_feature, training_dataset, params) + prediction_feature = Feature.where(name: "log2(Net cell association)", category: "TOX").first + model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset) prediction_model[:model_id] = model.id repeated_cv = Validation::RepeatedCrossValidation.create model prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id diff --git a/lib/regression.rb b/lib/regression.rb index 396c9e4..cf6d9cb 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -19,226 +19,6 @@ module OpenTox {:value => prediction} end - def self.caret descriptors:, neighbors:, method: "pls", parameters:nil - values = [] - descriptors = {} - weights = [] - descriptor_ids = neighbors.collect{|n| n["descriptors"]}.flatten.uniq.sort - - neighbors.each do |n| - activities = n["measurements"] - activities.each do |act| - values << act - weights << n["similarity"] - descriptor_ids.each do |id| - descriptors[id] ||= [] - descriptors[id] << n["descriptors"].include?(id) - end - end if activities - end - - variables = [] - data_frame = [values] - - descriptors.each do |k,v| - unless v.uniq.size == 1 - data_frame << v.collect{|m| m ? "T" : "F"} - variables << k - end - end - - if variables.empty? - prediction = weighted_average(descriptors: descriptors, neighbors: neighbors) - prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." - prediction - else - substance_features = variables.collect{|f| descriptors.include?(f) ? "T" : "F"} - #puts data_frame.to_yaml - prediction = r_model_prediction method, data_frame, variables, weights, substance_features - if prediction.nil? or prediction[:value].nil? - prediction = weighted_average(descriptors: descriptors, neighbors: neighbors) - prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances." - prediction - else - prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]] - prediction[:value] = prediction[:value] - prediction[:rmse] = prediction[:rmse] - prediction - end - end - - end - - def self.fingerprint_regression substance:, neighbors:, method: "pls" #, method_params="sigma=0.05" - values = [] - fingerprints = {} - weights = [] - fingerprint_ids = neighbors.collect{|n| Compound.find(n["_id"]).fingerprint}.flatten.uniq.sort - - neighbors.each do |n| - fingerprint = Substance.find(n["_id"]).fingerprint - activities = n["measurements"] - activities.each do |act| - values << act - weights << n["similarity"] - fingerprint_ids.each do |id| - fingerprints[id] ||= [] - fingerprints[id] << fingerprint.include?(id) - end - end if activities - end - - variables = [] - data_frame = [values] - - fingerprints.each do |k,v| - unless v.uniq.size == 1 - data_frame << v.collect{|m| m ? "T" : "F"} - variables << k - end - end - - if variables.empty? - prediction = weighted_average(substance: substance, neighbors: neighbors) - prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." - prediction - else - substance_features = variables.collect{|f| substance.fingerprint.include?(f) ? "T" : "F"} - prediction = r_model_prediction method, data_frame, variables, weights, substance_features - if prediction.nil? or prediction[:value].nil? - prediction = weighted_average(substance: substance, neighbors: neighbors) - prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." - prediction - else - prediction[:prediction_interval] = [prediction[:value]-1.96*prediction[:rmse], prediction[:value]+1.96*prediction[:rmse]] - prediction[:value] = prediction[:value] - prediction[:rmse] = prediction[:rmse] - prediction - end - end - - end - -=begin - def self.physchem_regression substance:, neighbors:, method: "pls" - - activities = [] - weights = [] - pc_ids = neighbors.collect{|n| n["common_descriptors"].collect{|d| d[:id]}}.flatten.uniq.sort - data_frame = [] - data_frame[0] = [] - - neighbors.each_with_index do |n,i| - activities = n["measurements"] - activities.each do |act| - data_frame[0][i] = act - weights << n["similarity"] - n["common_descriptors"].each do |d| - j = pc_ids.index(d[:id])+1 - data_frame[j] ||= [] - data_frame[j][i] = d[:scaled_value] - end - end if activities - (0..pc_ids.size).each do |j| # for R: fill empty values with NA - data_frame[j] ||= [] - data_frame[j][i] ||= "NA" - end - end - - data_frame = data_frame.each_with_index.collect do |r,i| - if r.uniq.size == 1 # remove properties with a single value - r = nil - pc_ids[i-1] = nil # data_frame frame has additional activity entry - end - r - end - data_frame.compact! - pc_ids.compact! - - if pc_ids.empty? - prediction = weighted_average(substance: substance, neighbors: neighbors) - prediction[:warning] = "No relevant variables for regression model. Using weighted average of similar substances." - prediction - else - query_descriptors = pc_ids.collect { |i| substance.scaled_values[i] } - query_descriptors = query_descriptors.each_with_index.collect do |v,i| - unless v - v = nil - data_frame[i] = nil - pc_ids[i] = nil - end - v - end - query_descriptors.compact! - data_frame.compact! - pc_ids.compact! - prediction = r_model_prediction method, data_frame, pc_ids.collect{|i| "\"#{i}\""}, weights, query_descriptors - if prediction.nil? - prediction = weighted_average(substance: substance, neighbors: neighbors) - prediction[:warning] = "Could not create local PLS model. Using weighted average of similar substances." - end - p prediction - prediction - end - - end -=end - - def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values - R.assign "weights", training_weights - r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" -=begin -rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) - File.open("tmp.R","w+"){|f| - f.puts "suppressPackageStartupMessages({ - library(iterators,lib=\"#{rlib}\") - library(foreach,lib=\"#{rlib}\") - library(ggplot2,lib=\"#{rlib}\") - library(grid,lib=\"#{rlib}\") - library(gridExtra,lib=\"#{rlib}\") - library(pls,lib=\"#{rlib}\") - library(caret,lib=\"#{rlib}\") - library(doMC,lib=\"#{rlib}\") - registerDoMC(#{NR_CORES}) -})" - - f.puts "data <- #{r_data_frame}\n" - f.puts "weights <- c(#{training_weights.join(', ')})" - f.puts "features <- c(#{training_features.join(', ')})" - f.puts "names(data) <- append(c('activities'),features)" # - f.puts "ctrl <- rfeControl(functions = #{method}, method = 'repeatedcv', repeats = 5, verbose = T)" - f.puts "lmProfile <- rfe(activities ~ ., data = data, rfeControl = ctrl)" - - f.puts "model <- train(activities ~ ., data = data, method = '#{method}')" - f.puts "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" - f.puts "names(fingerprint) <- features" - f.puts "prediction <- predict(model,fingerprint)" - } -=end - - R.eval "data <- #{r_data_frame}" - R.assign "features", training_features - begin - R.eval "names(data) <- append(c('activities'),features)" # - R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)" - R.eval "fingerprint <- data.frame(rbind(c(#{query_feature_values.join ','})))" - R.eval "names(fingerprint) <- features" - R.eval "prediction <- predict(model,fingerprint)" - value = R.eval("prediction").to_f - rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f - r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f - prediction_interval = value-1.96*rmse, value+1.96*rmse - { - :value => value, - :rmse => rmse, - :r_squared => r_squared, - :prediction_interval => prediction_interval - } - rescue - return nil - end - end - end end end diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb index 286614a..e3f5905 100644 --- a/lib/train-test-validation.rb +++ b/lib/train-test-validation.rb @@ -9,10 +9,7 @@ module OpenTox def self.create model, training_set, test_set - atts = model.attributes.dup # do not modify attributes of the original model - atts["_id"] = BSON::ObjectId.new - atts[:training_dataset_id] = training_set.id - validation_model = model.class.create model.prediction_feature, training_set, atts + validation_model = model.class.create prediction_feature: model.prediction_feature, training_dataset: training_set, algorithms: model.algorithms validation_model.save predictions = validation_model.predict test_set.substances nr_unpredicted = 0 -- cgit v1.2.3 From 91787edb3682900bc5a2feeca66e5142f387fcc6 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 7 Oct 2016 10:25:58 +0200 Subject: unified interface for prediction algorithms --- lib/caret.rb | 152 +++++++++++++++++++++++++++++++++++++++++++++++++ lib/classification.rb | 2 +- lib/crossvalidation.rb | 4 +- lib/dataset.rb | 2 - lib/feature.rb | 18 +++--- lib/import.rb | 3 +- lib/nanoparticle.rb | 50 ---------------- lib/physchem.rb | 6 +- lib/regression.rb | 2 +- 9 files changed, 169 insertions(+), 70 deletions(-) create mode 100644 lib/caret.rb (limited to 'lib') diff --git a/lib/caret.rb b/lib/caret.rb new file mode 100644 index 0000000..b999b06 --- /dev/null +++ b/lib/caret.rb @@ -0,0 +1,152 @@ +module OpenTox + module Algorithm + + class Caret + # TODO classification + # model list: https://topepo.github.io/caret/modelList.html + + attr_accessor :descriptors, :neighbors, :method, :relevant_features, :data_frame, :feature_names, :weights, :query_features + + def initialize descriptors:, neighbors:, method:, relevant_features: + @descriptors = descriptors + @neighbors = neighbors + @method = method + @relevant_features = relevant_features + end + + def self.regression descriptors:, neighbors:, method:, relevant_features:nil + + caret = new(descriptors:descriptors, neighbors:neighbors, method:method, relevant_features:relevant_features) + # collect training data for R + if descriptors.is_a? Array + caret.fingerprint2R + elsif descriptors.is_a? Hash + caret.properties2R + else + bad_request_error "Descriptors should be a fingerprint (Array) or properties (Hash). Cannot handle '#{descriptors.class}'." + end + if caret.feature_names.empty? or caret.data_frame.flatten.uniq == ["NA"] + prediction = Algorithm::Regression::weighted_average(descriptors: @descriptors, neighbors: neighbors) + prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." + else + prediction = caret.r_model_prediction + if prediction.nil? or prediction[:value].nil? + prediction = Algorithm::Regression::weighted_average(descriptors: @descriptors, neighbors: neighbors) + prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances." + end + end + prediction + + end + + def fingerprint2R + + values = [] + features = {} + @weights = [] + descriptor_ids = neighbors.collect{|n| n["descriptors"]}.flatten.uniq.sort + + neighbors.each do |n| + activities = n["measurements"] + activities.each do |act| + values << act + @weights << n["similarity"] + descriptor_ids.each do |id| + features[id] ||= [] + features[id] << n["descriptors"].include?(id) + end + end if activities + end + + @feature_names = [] + @data_frame = [values] + + features.each do |k,v| + unless v.uniq.size == 1 + @data_frame << v.collect{|m| m ? "T" : "F"} + @feature_names << k + end + end + @query_features = @feature_names.collect{|f| descriptors.include?(f) ? "T" : "F"} + + end + + + def properties2R + + @weights = [] + @feature_names = [] + @query_features = [] + + # keep only descriptors with values + @relevant_features.keys.each_with_index do |f,i| + if @descriptors[f] + @feature_names << f + @query_features << @descriptors[f].median + else + neighbors.each do |n| + n["descriptors"].delete_at i + end + end + end + + measurements = neighbors.collect{|n| n["measurements"]}.flatten + # initialize data frame with 'NA' defaults + @data_frame = Array.new(@feature_names.size+1){Array.new(measurements.size,"NA") } + + i = 0 + # parse neighbor activities and descriptors + neighbors.each do |n| + activities = n["measurements"] + activities.each do |act| # multiple measurements are treated as separate instances + unless n["descriptors"].include?(nil) + data_frame[0][i] = act + @weights << n["similarity"] + n["descriptors"].each_with_index do |d,j| + @data_frame[j+1][i] = d + end + i += 1 + end + end if activities # ignore neighbors without measurements + end + + end + + def r_model_prediction + begin + R.assign "weights", @weights + r_data_frame = "data.frame(#{@data_frame.collect{|r| "c(#{r.join(',')})"}.join(', ')})" + R.eval "data <- #{r_data_frame}" + R.assign "features", @feature_names + R.eval "names(data) <- append(c('activities'),features)" # + R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)" + rescue => e + $logger.debug "R caret model creation error for:" + $logger.debug JSON.pretty_generate(self.inspect) + return nil + end + begin + R.eval "query <- data.frame(rbind(c(#{@query_features.join ','})))" + R.eval "names(query) <- features" + R.eval "prediction <- predict(model,query)" + value = R.eval("prediction").to_f + rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f + r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f + prediction_interval = value-1.96*rmse, value+1.96*rmse + { + :value => value, + :rmse => rmse, + :r_squared => r_squared, + :prediction_interval => prediction_interval + } + rescue => e + $logger.debug "R caret prediction error for:" + $logger.debug self.inspect + return nil + end + end + + end + end +end + diff --git a/lib/classification.rb b/lib/classification.rb index 01ba878..6582e7d 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -3,7 +3,7 @@ module OpenTox class Classification - def self.weighted_majority_vote descriptors:nil, neighbors: + def self.weighted_majority_vote descriptors:nil, neighbors:, method:nil, relevant_features:nil sims = {} neighbors.each do |neighbor| sim = neighbor["similarity"] diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index d7a1f08..15d1031 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -16,10 +16,10 @@ module OpenTox folds: n ) cv.save # set created_at + nr_instances = 0 nr_unpredicted = 0 - #predictions = {} - training_dataset = Dataset.find model.training_dataset_id + training_dataset = model.training_dataset training_dataset.folds(n).each_with_index do |fold,fold_nr| #fork do # parallel execution of validations can lead to Rserve and memory problems $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" diff --git a/lib/dataset.rb b/lib/dataset.rb index 2e21e5b..453fc35 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -165,11 +165,9 @@ module OpenTox feature = nil if values.size == 0 # empty feature elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes - metadata["numeric"] = true numeric[i] = true feature = NumericFeature.find_or_create_by(metadata) else - metadata["nominal"] = true metadata["accept_values"] = values numeric[i] = false feature = NominalFeature.find_or_create_by(metadata) diff --git a/lib/feature.rb b/lib/feature.rb index c6fb68a..0ca4d41 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -2,30 +2,28 @@ module OpenTox # Basic feature class class Feature - field :nominal, type: Boolean - field :numeric, type: Boolean field :measured, type: Boolean field :calculated, type: Boolean field :category, type: String field :unit, type: String field :conditions, type: Hash + + def nominal? + self.class == NominalFeature + end + + def numeric? + self.class == NumericFeature + end end # Feature for categorical variables class NominalFeature < Feature field :accept_values, type: Array - def initialize params - super params - nominal = true - end end # Feature for quantitative variables class NumericFeature < Feature - def initialize params - super params - numeric = true - end end # Feature for SMARTS fragments diff --git a/lib/import.rb b/lib/import.rb index 17894a9..8e57401 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -76,7 +76,7 @@ module OpenTox if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step - proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source) + proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true) nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset end else @@ -98,6 +98,7 @@ module OpenTox :category => category, :conditions => effect["conditions"], :source => study["protocol"]["category"]["term"], + :measured => true, :warnings => warnings ) nanoparticle.parse_ambit_value feature, effect["result"], dataset diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 6905f6f..f74f263 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -8,54 +8,6 @@ module OpenTox attr_accessor :scaled_values -=begin - def physchem_neighbors min_sim: 0.9, dataset_id:, prediction_feature_id:, relevant_features: - dataset = Dataset.find(dataset_id) - #relevant_features = {} - measurements = [] - substances = [] - # TODO: exclude query activities!!! - dataset.substances.each do |s| - if s.core == self.core # exclude nanoparticles with different core - dataset.values(s,prediction_feature_id).each do |act| - measurements << act - substances << s - end - end - end - neighbors = [] - substances.each do |substance| - values = dataset.values(substance,prediction_feature_id) - if values - common_descriptors = relevant_features.keys & substance.descriptors.keys - # scale values - query_descriptors = common_descriptors.collect{|d| (descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} - @scaled_values = common_descriptors.collect{|d| [d,(descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h - neighbor_descriptors = common_descriptors.collect{|d| (substance.descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]} - neighbor_scaled_values = common_descriptors.collect{|d| [d,(substance.descriptors[d].median-relevant_features[d]["mean"])/relevant_features[d]["sd"]]}.to_h - #weights = common_descriptors.collect{|d| 1-relevant_features[d]["p_value"]} - weights = common_descriptors.collect{|d| relevant_features[d]["r"]**2} - sim = Algorithm::Similarity.weighted_cosine(query_descriptors,neighbor_descriptors,weights) - neighbors << { - "_id" => substance.id, - "measurements" => values, - "similarity" => sim, - "common_descriptors" => common_descriptors.collect do |id| - { - :id => id, - :scaled_value => neighbor_scaled_values[id], - :p_value => relevant_features[id]["p_value"], - :r_squared => relevant_features[id]["r"]**2} - end - } if sim >= min_sim - end - end - $logger.debug "#{self.name}: #{neighbors.size} neighbors" - neighbors.sort!{|a,b| b["similarity"] <=> a["similarity"]} - neighbors - end -=end - def add_feature feature, value, dataset unless feature.name == "ATOMIC COMPOSITION" or feature.name == "FUNCTIONAL GROUP" # redundand case feature.category @@ -78,8 +30,6 @@ module OpenTox end def parse_ambit_value feature, v, dataset - #p dataset - #p feature # TODO add study id to warnings v.delete "unit" # TODO: ppm instead of weights diff --git a/lib/physchem.rb b/lib/physchem.rb index 86300ba..c32e382 100644 --- a/lib/physchem.rb +++ b/lib/physchem.rb @@ -42,7 +42,7 @@ module OpenTox def self.descriptors desc=DESCRIPTORS desc.collect do |name,description| lib,desc = name.split('.',2) - self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true) end end @@ -54,11 +54,11 @@ module OpenTox CDK_DESCRIPTIONS.select{|d| desc == d[:java_class].split('.').last.sub('Descriptor','') }.first[:names].each do |n| dname = "#{name}.#{n}" description = DESCRIPTORS[dname] - udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true) end else description = DESCRIPTORS[name] - udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true) end end udesc diff --git a/lib/regression.rb b/lib/regression.rb index cf6d9cb..0e5e06b 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -3,7 +3,7 @@ module OpenTox class Regression - def self.weighted_average descriptors:nil, neighbors:, parameters:nil + def self.weighted_average descriptors:nil, neighbors:, parameters:nil, method:nil, relevant_features:nil # TODO: prediction_interval weighted_sum = 0.0 sim_sum = 0.0 -- cgit v1.2.3 From dc4ab1f4e64d738d6c0b70f0b690a2359685080f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 12 Oct 2016 21:32:27 +0200 Subject: physchem regression, correlation_filter for fingerprints --- lib/caret.rb | 184 ++++++++++++++----------------------------- lib/classification.rb | 23 ++---- lib/compound.rb | 48 ++---------- lib/feature_selection.rb | 60 +++++++-------- lib/model.rb | 197 ++++++++++++++++++++++++++++++++--------------- lib/overwrite.rb | 13 +++- lib/physchem.rb | 14 ++-- lib/regression.rb | 15 ++-- lib/similarity.rb | 25 ++++-- lib/substance.rb | 60 --------------- 10 files changed, 278 insertions(+), 361 deletions(-) (limited to 'lib') diff --git a/lib/caret.rb b/lib/caret.rb index b999b06..59e02da 100644 --- a/lib/caret.rb +++ b/lib/caret.rb @@ -5,33 +5,56 @@ module OpenTox # TODO classification # model list: https://topepo.github.io/caret/modelList.html - attr_accessor :descriptors, :neighbors, :method, :relevant_features, :data_frame, :feature_names, :weights, :query_features - - def initialize descriptors:, neighbors:, method:, relevant_features: - @descriptors = descriptors - @neighbors = neighbors - @method = method - @relevant_features = relevant_features - end - - def self.regression descriptors:, neighbors:, method:, relevant_features:nil - - caret = new(descriptors:descriptors, neighbors:neighbors, method:method, relevant_features:relevant_features) - # collect training data for R - if descriptors.is_a? Array - caret.fingerprint2R - elsif descriptors.is_a? Hash - caret.properties2R - else - bad_request_error "Descriptors should be a fingerprint (Array) or properties (Hash). Cannot handle '#{descriptors.class}'." - end - if caret.feature_names.empty? or caret.data_frame.flatten.uniq == ["NA"] - prediction = Algorithm::Regression::weighted_average(descriptors: @descriptors, neighbors: neighbors) + def self.create_model_and_predict dependent_variables:, independent_variables:, weights:, method:, query_variables: + if independent_variables.flatten.uniq == ["NA"] + prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." else - prediction = caret.r_model_prediction + dependent_variables.each_with_index do |v,i| + dependent_variables[i] = to_r(v) + end + independent_variables.each_with_index do |c,i| + c.each_with_index do |v,j| + independent_variables[i][j] = to_r(v) + end + end + query_variables.each_with_index do |v,i| + query_variables[i] = to_r(v) + end + begin + R.assign "weights", weights + r_data_frame = "data.frame(#{([dependent_variables]+independent_variables).collect{|r| "c(#{r.join(',')})"}.join(', ')})" + R.eval "data <- #{r_data_frame}" + R.assign "features", (0..independent_variables.size-1).to_a + R.eval "names(data) <- append(c('activities'),features)" # + R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)" + rescue => e + $logger.debug "R caret model creation error for:" + $logger.debug JSON.pretty_generate(dependent_variables) + $logger.debug JSON.pretty_generate(independent_variables) + return {:value => nil, :warning => "R caret model cration error."} + end + begin + R.eval "query <- data.frame(rbind(c(#{query_variables.join ','})))" + R.eval "names(query) <- features" + R.eval "prediction <- predict(model,query)" + value = R.eval("prediction").to_f + rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f + r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f + prediction_interval = value-1.96*rmse, value+1.96*rmse + prediction = { + :value => value, + :rmse => rmse, + :r_squared => r_squared, + :prediction_interval => prediction_interval + } + rescue => e + $logger.debug "R caret prediction error for:" + $logger.debug self.inspect + return nil + end if prediction.nil? or prediction[:value].nil? - prediction = Algorithm::Regression::weighted_average(descriptors: @descriptors, neighbors: neighbors) + prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights prediction[:warning] = "Could not create local caret model. Using weighted average of similar substances." end end @@ -39,111 +62,18 @@ module OpenTox end - def fingerprint2R - - values = [] - features = {} - @weights = [] - descriptor_ids = neighbors.collect{|n| n["descriptors"]}.flatten.uniq.sort - - neighbors.each do |n| - activities = n["measurements"] - activities.each do |act| - values << act - @weights << n["similarity"] - descriptor_ids.each do |id| - features[id] ||= [] - features[id] << n["descriptors"].include?(id) - end - end if activities - end - - @feature_names = [] - @data_frame = [values] - - features.each do |k,v| - unless v.uniq.size == 1 - @data_frame << v.collect{|m| m ? "T" : "F"} - @feature_names << k - end - end - @query_features = @feature_names.collect{|f| descriptors.include?(f) ? "T" : "F"} - + # call caret methods dynamically, e.g. Caret.pls + def self.method_missing(sym, *args, &block) + args.first[:method] = sym.to_s + self.create_model_and_predict args.first end - - def properties2R - - @weights = [] - @feature_names = [] - @query_features = [] - - # keep only descriptors with values - @relevant_features.keys.each_with_index do |f,i| - if @descriptors[f] - @feature_names << f - @query_features << @descriptors[f].median - else - neighbors.each do |n| - n["descriptors"].delete_at i - end - end - end - - measurements = neighbors.collect{|n| n["measurements"]}.flatten - # initialize data frame with 'NA' defaults - @data_frame = Array.new(@feature_names.size+1){Array.new(measurements.size,"NA") } - - i = 0 - # parse neighbor activities and descriptors - neighbors.each do |n| - activities = n["measurements"] - activities.each do |act| # multiple measurements are treated as separate instances - unless n["descriptors"].include?(nil) - data_frame[0][i] = act - @weights << n["similarity"] - n["descriptors"].each_with_index do |d,j| - @data_frame[j+1][i] = d - end - i += 1 - end - end if activities # ignore neighbors without measurements - end - - end - - def r_model_prediction - begin - R.assign "weights", @weights - r_data_frame = "data.frame(#{@data_frame.collect{|r| "c(#{r.join(',')})"}.join(', ')})" - R.eval "data <- #{r_data_frame}" - R.assign "features", @feature_names - R.eval "names(data) <- append(c('activities'),features)" # - R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)" - rescue => e - $logger.debug "R caret model creation error for:" - $logger.debug JSON.pretty_generate(self.inspect) - return nil - end - begin - R.eval "query <- data.frame(rbind(c(#{@query_features.join ','})))" - R.eval "names(query) <- features" - R.eval "prediction <- predict(model,query)" - value = R.eval("prediction").to_f - rmse = R.eval("getTrainPerf(model)$TrainRMSE").to_f - r_squared = R.eval("getTrainPerf(model)$TrainRsquared").to_f - prediction_interval = value-1.96*rmse, value+1.96*rmse - { - :value => value, - :rmse => rmse, - :r_squared => r_squared, - :prediction_interval => prediction_interval - } - rescue => e - $logger.debug "R caret prediction error for:" - $logger.debug self.inspect - return nil - end + def self.to_r v + return "F" if v == false + return "T" if v == true + return "NA" if v.nil? + return "NA" if v.is_a? Float and v.nan? + v end end diff --git a/lib/classification.rb b/lib/classification.rb index 6582e7d..e8c179f 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -3,24 +3,17 @@ module OpenTox class Classification - def self.weighted_majority_vote descriptors:nil, neighbors:, method:nil, relevant_features:nil - sims = {} - neighbors.each do |neighbor| - sim = neighbor["similarity"] - activities = neighbor["measurements"] - activities.each do |act| - sims[act] ||= [] - sims[act] << sim - end if activities + def self.weighted_majority_vote dependent_variables:, independent_variables:nil, weights:, query_variables: + class_weights = {} + dependent_variables.each_with_index do |v,i| + class_weights[v] ||= [] + class_weights[v] << weights[i] unless v.nil? end - sim_all = sims.collect{|a,s| s}.flatten - sim_sum = sim_all.sum - sim_max = sim_all.max probabilities = {} - sims.each do |a,s| - probabilities[a] = s.sum/sim_sum + class_weights.each do |a,w| + probabilities[a] = w.sum/weights.sum end - probabilities = probabilities.collect{|a,p| [a,sim_max*p]}.to_h + probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h p_max = probabilities.collect{|a,p| p}.max prediction = probabilities.key(p_max) {:value => prediction,:probabilities => probabilities} diff --git a/lib/compound.rb b/lib/compound.rb index 93cfc03..0f178ce 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -75,7 +75,11 @@ module OpenTox fingerprints[type] end - def calculated_physchem descriptors=PhysChem.openbabel_descriptors + def calculated_properties types=["OPENBABEL"] + descriptors = [] + types.each do |t| + descriptors += PhysChem.descriptors OpenTox.const_get(t) + end # TODO: speedup java descriptors calculated_ids = properties.keys # BSON::ObjectId instances are not allowed as keys in a BSON document. @@ -254,48 +258,6 @@ module OpenTox self["chemblid"] end -=begin - def fingerprint_neighbors(type:, min_sim: 0.1, dataset_id:, prediction_feature_id:) - neighbors = [] - dataset = Dataset.find(dataset_id) - # TODO: fix db_neighbors -# if type == DEFAULT_FINGERPRINT -# neighbors = db_neighbors(min_sim: min_sim, dataset_id: dataset_id) -# neighbors.each do |n| -# n["measurements"] = dataset.values(n["_id"],prediction_feature_id) -# end -# else - query_fingerprint = self.fingerprint type - dataset.compounds.each do |compound| - values = dataset.values(compound,prediction_feature_id) - if values - candidate_fingerprint = compound.fingerprint type - sim = Algorithm::Similarity.tanimoto(query_fingerprint , candidate_fingerprint) - neighbors << {"_id" => compound.id, "measurements" => values, "similarity" => sim} if sim >= min_sim - end -# end - end - neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]} - end -=end - -# def physchem_neighbors params -# # TODO: fix, tests -# feature_dataset = Dataset.find params[:feature_dataset_id] -# query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors] -# neighbors = [] -# feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i| -# # TODO implement pearson and cosine similarity separatly -# R.assign "x", query_fingerprint -# R.assign "y", candidate_fingerprint -# sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first -# if sim >= params[:min_sim] -# neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming -# end -# end -# neighbors -# end - def db_neighbors min_sim: 0.1, dataset_id: p fingerprints[DEFAULT_FINGERPRINT] # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb diff --git a/lib/feature_selection.rb b/lib/feature_selection.rb index 43e3bea..f599539 100644 --- a/lib/feature_selection.rb +++ b/lib/feature_selection.rb @@ -3,41 +3,39 @@ module OpenTox class FeatureSelection - def self.correlation_filter dataset:, prediction_feature:, types:nil - # TODO: speedup, single assignment of all features to R+ parallel computation of significance? + def self.correlation_filter model relevant_features = {} - measurements = [] - substances = [] - dataset.substances.each do |s| - dataset.values(s,prediction_feature).each do |act| - measurements << act - substances << s - end - end - R.assign "tox", measurements - feature_ids = dataset.substances.collect{ |s| s["properties"].keys}.flatten.uniq - feature_ids.select!{|fid| types.include? Feature.find(fid).category} if types - feature_ids.each do |feature_id| - feature_values = substances.collect{|s| s["properties"][feature_id].first if s["properties"][feature_id]} - unless feature_values.uniq.size == 1 - R.assign "feature", feature_values - begin - R.eval "cor <- cor.test(tox,feature,method = 'pearson',use='pairwise')" - pvalue = R.eval("cor$p.value").to_ruby - if pvalue <= 0.05 - r = R.eval("cor$estimate").to_ruby - relevant_features[feature_id] = {} - relevant_features[feature_id]["pvalue"] = pvalue - relevant_features[feature_id]["r"] = r - relevant_features[feature_id]["mean"] = R.eval("mean(feature, na.rm=TRUE)").to_ruby - relevant_features[feature_id]["sd"] = R.eval("sd(feature, na.rm=TRUE)").to_ruby - end - rescue - warn "Correlation of '#{Feature.find(feature_id).name}' (#{feature_values}) with '#{Feature.find(prediction_feature_id).name}' (#{measurements}) failed." + R.assign "dependent", model.dependent_variables.collect{|v| to_r(v)} + model.descriptor_weights = [] + selected_variables = [] + selected_descriptor_ids = [] + model.independent_variables.each_with_index do |v,i| + R.assign "independent", v.collect{|n| to_r(n)} + begin + R.eval "cor <- cor.test(dependent,independent,method = 'pearson',use='pairwise')" + pvalue = R.eval("cor$p.value").to_ruby + if pvalue <= 0.05 + model.descriptor_weights << R.eval("cor$estimate").to_ruby**2 + selected_variables << v + selected_descriptor_ids << model.descriptor_ids[i] end + rescue + #warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with '#{Feature.find(model.descriptor_ids[i]).name}' (#{v}) failed." + warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with (#{v}) failed." end end - relevant_features.sort{|a,b| a[1]["pvalue"] <=> b[1]["pvalue"]}.to_h + + model.independent_variables = selected_variables + model.descriptor_ids = selected_descriptor_ids + model + end + + def self.to_r v + return 0 if v == false + return 1 if v == true + return "NA" if v.nil? + return "NA" if v.is_a? Float and v.nan? + v end end diff --git a/lib/model.rb b/lib/model.rb index 290309a..f3f0603 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -11,10 +11,18 @@ module OpenTox field :name, type: String field :creator, type: String, default: __FILE__ + field :algorithms, type: Hash, default:{} field :training_dataset_id, type: BSON::ObjectId + field :substance_ids, type: Array, default:[] field :prediction_feature_id, type: BSON::ObjectId - field :algorithms, type: Hash - field :relevant_features, type: Hash + field :dependent_variables, type: Array, default:[] + field :descriptor_ids, type:Array, default:[] + field :independent_variables, type: Array, default:[] + field :fingerprints, type: Array, default:[] + field :descriptor_weights, type: Array, default:[] + field :descriptor_means, type: Array, default:[] + field :descriptor_sds, type: Array, default:[] + field :scaled_variables, type: Array, default:[] def self.create prediction_feature:nil, training_dataset:nil, algorithms:{} bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset @@ -40,7 +48,7 @@ module OpenTox model.prediction_feature_id = prediction_feature.id model.training_dataset_id = training_dataset.id - model.name = "#{training_dataset.name} #{prediction_feature.name}" + model.name = "#{prediction_feature.name} (#{training_dataset.name})" # set defaults substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq @@ -49,10 +57,7 @@ module OpenTox if substance_classes.first == "OpenTox::Compound" model.algorithms = { - :descriptors => { - :method => "fingerprint", - :type => 'MP2D', - }, + :descriptors => ['MP2D'], :similarity => { :method => "Algorithm::Similarity.tanimoto", :min => 0.1 @@ -66,25 +71,20 @@ module OpenTox } elsif model.class == LazarRegression model.algorithms[:prediction] = { - :method => "Algorithm::Caret.regression", - :parameters => "pls", + :method => "Algorithm::Caret.pls", } end elsif substance_classes.first == "OpenTox::Nanoparticle" model.algorithms = { - :descriptors => { - :method => "properties", - #:types => ["P-CHEM","Proteomics"], - :types => ["P-CHEM"], - }, + :descriptors => ["P-CHEM"], + #:descriptors => ["P-CHEM","Proteomics"], :similarity => { :method => "Algorithm::Similarity.weighted_cosine", :min => 0.5 }, :prediction => { - :method => "Algorithm::Caret.regression", - :parameters => "rf", + :method => "Algorithm::Caret.rf", }, :feature_selection => { :method => "Algorithm::FeatureSelection.correlation_filter", @@ -106,63 +106,128 @@ module OpenTox end end + # parse dependent_variables from training dataset + training_dataset.substances.each do |substance| + values = training_dataset.values(substance,model.prediction_feature_id) + values.each do |v| + model.substance_ids << substance.id.to_s + model.dependent_variables << v + end if values + end + + # parse fingerprints + if model.fingerprints? + model.algorithms[:descriptors].each do |type| + model.substances.each_with_index do |s,i| + model.fingerprints[i] ||= [] + model.fingerprints[i] += s.fingerprint(type) + model.fingerprints[i].uniq! + end + end + model.descriptor_ids = model.fingerprints.flatten.uniq + model.descriptor_ids.each do |d| + model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} + end + else + # parse independent_variables + if (model.algorithms[:descriptors] & ["PhysChem::OPENBABEL","PhysChem::CDK","PhysChem::JOELIB"]).empty? + properties = model.substances.collect { |s| s.properties } + all_property_ids = properties.collect{|p| p.keys}.flatten.uniq + model.descriptor_ids = all_property_ids.select{|id| model.algorithms[:descriptors].include? Feature.find(id).category } + model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} + + # calculate physchem properties + else + properties = model.substances.collect { |s| s.calculated_properties(model.algorithms[:descriptors]) } + model.descriptor_ids = properties.collect{|p| p.keys}.flatten.uniq + model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i]}} + end + end + if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method] - model.relevant_features = Algorithm.run model.algorithms[:feature_selection][:method], dataset: training_dataset, prediction_feature: prediction_feature, types: model.algorithms[:descriptors][:types] + model = Algorithm.run model.algorithms[:feature_selection][:method], model + end + + # scale independent_variables + unless model.fingerprints? + model.independent_variables.each_with_index do |var,i| + model.descriptor_means[i] = var.mean + model.descriptor_sds[i] = var.standard_deviation + model.scaled_variables << var.collect{|v| v ? (v-model.descriptor_means[i])/model.descriptor_sds[i] : nil} + end end model.save model end def predict_substance substance - neighbors = substance.neighbors dataset_id: training_dataset_id, prediction_feature_id: prediction_feature_id, descriptors: algorithms[:descriptors], similarity: algorithms[:similarity], relevant_features: relevant_features - measurements = nil - prediction = {} - # handle query substance - if neighbors.collect{|n| n["_id"]}.include? substance.id - - query = neighbors.select{|n| n["_id"] == substance.id}.first - measurements = training_dataset.values(query["_id"],prediction_feature_id) - prediction[:measurements] = measurements - prediction[:warning] = "#{measurements.size} substances have been removed from neighbors, because they are identical with the query substance." - neighbors.delete_if{|n| n["_id"] == substance.id} # remove query substance for an unbiased prediction (also useful for loo validation) + + case algorithms[:similarity][:method] + when /tanimoto/ # binary features + similarity_descriptors = algorithms[:descriptors].collect{|type| substance.fingerprint(type)}.flatten.uniq + # TODO this excludes descriptors only present in the query substance + query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id} + when /euclid|cosine/ # quantitative features + similarity_descriptors = descriptor_ids.collect_with_index{|id,i| + prop = substance.properties[id] + prop = prop.median if prop.is_a? Array # measured + (prop-descriptor_means[i])/descriptor_sds[i] + } + query_descriptors = descriptor_ids.collect_with_index{|id,i| + prop = substance.properties[id] + prop = prop.median if prop.is_a? Array # measured + substance.properties[id] + } + else + bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'." end - if neighbors.empty? - prediction.merge!({:value => nil,:probabilities => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) - elsif neighbors.size == 1 - value = nil - m = neighbors.first["measurements"] - if m.size == 1 # single measurement - value = m.first - else # multiple measurement - if m.collect{|t| t.numeric?}.uniq == [true] # numeric - value = m.median - elsif m.uniq.size == 1 # single value - value = m.first - else # contradictory results - # TODO add majority vote?? + + prediction = {} + neighbor_ids = [] + neighbor_similarities = [] + neighbor_dependent_variables = [] + neighbor_independent_variables = [] + + prediction = {} + # find neighbors + substance_ids.each_with_index do |s,i| + # handle query substance + if substance.id.to_s == s + prediction[:measurements] ||= [] + prediction[:measurements] << dependent_variables[i] + prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance." + else + next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core + if fingerprints? + neighbor_descriptors = fingerprints[i] + else + neighbor_descriptors = scaled_variables.collect{|v| v[i]} + end + sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights] + if sim > algorithms[:similarity][:min] + neighbor_ids << s + neighbor_similarities << sim + neighbor_dependent_variables << dependent_variables[i] + independent_variables.each_with_index do |c,j| + neighbor_independent_variables[j] ||= [] + neighbor_independent_variables[j] << independent_variables[j][i] + end end end - prediction.merge!({:value => value, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting median of its experimental values.", :neighbors => neighbors}) if value + end + + measurements = nil + + if neighbor_similarities.empty? + prediction.merge!({:value => nil,:warning => "Could not find similar substances with experimental data in the training dataset.",:neighbors => []}) + elsif neighbor_similarities.size == 1 + prediction.merge!({:value => dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]}) else # call prediction algorithm - case algorithms[:descriptors][:method] - when "fingerprint" - descriptors = substance.fingerprints[algorithms[:descriptors][:type]] - when "properties" - descriptors = substance.properties - else - bad_request_error "Descriptor method '#{algorithms[:descriptors][:method]}' not available." - end - params = { - :method => algorithms[:prediction][:parameters], - :descriptors => descriptors, - :neighbors => neighbors, - :relevant_features => relevant_features - } - result = Algorithm.run algorithms[:prediction][:method], params + result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors + p result prediction.merge! result - prediction[:neighbors] = neighbors - prediction[:neighbors] ||= [] + prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}} end prediction end @@ -221,6 +286,18 @@ module OpenTox Feature.find(prediction_feature_id) end + def descriptors + descriptor_ids.collect{|id| Feature.find(id)} + end + + def substances + substance_ids.collect{|id| Substance.find(id)} + end + + def fingerprints? + algorithms[:similarity][:method].match("tanimoto") ? true : false + end + end class LazarClassification < Lazar diff --git a/lib/overwrite.rb b/lib/overwrite.rb index 4a79051..d0422ee 100644 --- a/lib/overwrite.rb +++ b/lib/overwrite.rb @@ -101,13 +101,13 @@ class Array end def mean - self.inject{ |sum, el| sum + el }.to_f / self.size + self.compact.inject{ |sum, el| sum + el }.to_f / self.compact.size end def sample_variance m = self.mean - sum = self.inject(0){|accum, i| accum +(i-m)**2 } - sum/(self.length - 1).to_f + sum = self.compact.inject(0){|accum, i| accum +(i-m)**2 } + sum/(self.compact.length - 1).to_f end def standard_deviation @@ -123,6 +123,13 @@ class Array end end + def collect_with_index + result = [] + self.each_with_index do |elt, idx| + result << yield(elt, idx) + end + result + end end module URI diff --git a/lib/physchem.rb b/lib/physchem.rb index c32e382..327acd8 100644 --- a/lib/physchem.rb +++ b/lib/physchem.rb @@ -14,7 +14,7 @@ module OpenTox JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar") obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"] - OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| + OPENBABEL = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| name,description = d.split(/\s+/,2) ["Openbabel."+name,description] unless obexclude.include? name end.compact.sort{|a,b| a[0] <=> b[0]}] @@ -25,17 +25,17 @@ module OpenTox prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,'') d[:names].each { |name| cdkdescriptors[prefix+"."+name] = d[:description] } end - CDKDESCRIPTORS = cdkdescriptors + CDK = cdkdescriptors # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug) joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"] # strip Joelib messages from stdout - JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d| + JOELIB = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d| name = d[:java_class].sub(/^joelib2.feature.types./,'') ["Joelib."+name, "JOELIb does not provide meaningful descriptions, see java/JoelibDescriptors.java for details."] unless joelibexclude.include? name end.compact.sort{|a,b| a[0] <=> b[0]}] - DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS)) + DESCRIPTORS = OPENBABEL.merge(CDK.merge(JOELIB)) require_relative "unique_descriptors.rb" @@ -65,15 +65,15 @@ module OpenTox end def self.openbabel_descriptors - descriptors OBDESCRIPTORS + descriptors OPENBABEL end def self.cdk_descriptors - descriptors CDKDESCRIPTORS + descriptors CDK end def self.joelib_descriptors - descriptors JOELIBDESCRIPTORS + descriptors JOELIB end def calculate compound diff --git a/lib/regression.rb b/lib/regression.rb index 0e5e06b..bed6df8 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -3,18 +3,15 @@ module OpenTox class Regression - def self.weighted_average descriptors:nil, neighbors:, parameters:nil, method:nil, relevant_features:nil + def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables: + #def self.weighted_average descriptors:nil, neighbors:, parameters:nil, method:nil, relevant_features:nil # TODO: prediction_interval weighted_sum = 0.0 sim_sum = 0.0 - neighbors.each do |neighbor| - sim = neighbor["similarity"] - activities = neighbor["measurements"] - activities.each do |act| - weighted_sum += sim*act - sim_sum += sim - end if activities - end + dependent_variables.each_with_index do |v,i| + weighted_sum += weights[i]*dependent_variables[i] + sim_sum += weights[i] + end if dependent_variables sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum {:value => prediction} end diff --git a/lib/similarity.rb b/lib/similarity.rb index b9b4571..328d42a 100644 --- a/lib/similarity.rb +++ b/lib/similarity.rb @@ -19,18 +19,19 @@ module OpenTox ( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f end - def self.euclid fingerprints - sq = fingerprints[0].zip(fingerprints[1]).map{|a,b| (a - b) ** 2} + def self.euclid scaled_properties + sq = scaled_properties[0].zip(scaled_properties[1]).map{|a,b| (a - b) ** 2} Math.sqrt(sq.inject(0) {|s,c| s + c}) end # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity - def self.cosine fingerprints - Algorithm::Vector.dot_product(fingerprints[0], fingerprints[1]) / (Algorithm::Vector.magnitude(fingerprints[0]) * Algorithm::Vector.magnitude(fingerprints[1])) + def self.cosine scaled_properties + scaled_properties = remove_nils scaled_properties + Algorithm::Vector.dot_product(scaled_properties[0], scaled_properties[1]) / (Algorithm::Vector.magnitude(scaled_properties[0]) * Algorithm::Vector.magnitude(scaled_properties[1])) end - def self.weighted_cosine fingerprints # [a,b,weights] - a, b, w = fingerprints + def self.weighted_cosine scaled_properties # [a,b,weights] + a,b,w = remove_nils scaled_properties dot_product = 0 magnitude_a = 0 magnitude_b = 0 @@ -42,6 +43,18 @@ module OpenTox dot_product/(Math.sqrt(magnitude_a)*Math.sqrt(magnitude_b)) end + def self.remove_nils scaled_properties + a =[]; b = []; w = [] + (0..scaled_properties.first.size-1).each do |i| + if scaled_properties[0][i] and scaled_properties[1][i] and !scaled_properties[0][i].nan? and !scaled_properties[1][i].nan? + a << scaled_properties[0][i] + b << scaled_properties[1][i] + w << scaled_properties[2][i] + end + end + [a,b,w] + end + end end end diff --git a/lib/substance.rb b/lib/substance.rb index d271327..31c465e 100644 --- a/lib/substance.rb +++ b/lib/substance.rb @@ -5,64 +5,4 @@ module OpenTox field :dataset_ids, type: Array, default: [] end - def neighbors dataset_id:,prediction_feature_id:,descriptors:,similarity:,relevant_features:nil - # TODO enable empty dataset_id -> use complete db - case descriptors[:method] - when "fingerprint" - fingerprint_neighbors dataset_id:dataset_id, prediction_feature_id:prediction_feature_id, descriptors:descriptors, similarity:similarity - when "properties" - properties_neighbors dataset_id:dataset_id, prediction_feature_id:prediction_feature_id, descriptors:descriptors, similarity:similarity, relevant_features: relevant_features - else - bad_request_error "Descriptor method '#{descriptors[:method]}' not implemented." - end - end - - def fingerprint_neighbors dataset_id:,prediction_feature_id:,descriptors:,similarity: - neighbors = [] - dataset = Dataset.find(dataset_id) - dataset.substances.each do |substance| - values = dataset.values(substance,prediction_feature_id) - if values - query_descriptors = self.send(descriptors[:method].to_sym, descriptors[:type]) - candidate_descriptors = substance.send(descriptors[:method].to_sym, descriptors[:type]) - sim = Algorithm.run similarity[:method], [query_descriptors, candidate_descriptors] - neighbors << {"_id" => substance.id, "measurements" => values, "descriptors" => candidate_descriptors, "similarity" => sim} if sim >= similarity[:min] - end - end - neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]} - end - - def properties_neighbors dataset_id:,prediction_feature_id:,descriptors:,similarity:,relevant_features: - neighbors = [] - dataset = Dataset.find(dataset_id) - weights = relevant_features.collect{|k,v| v["r"]**2} - means = relevant_features.collect{|k,v| v["mean"]} - standard_deviations = relevant_features.collect{|k,v| v["sd"]} - query_descriptors = relevant_features.keys.collect{|i| properties[i].is_a?(Array) ? properties[i].median : nil } - dataset.substances.each do |substance| - values = dataset.values(substance,prediction_feature_id) - # exclude nanoparticles with different core - # TODO validate exclusion - next if substance.is_a? Nanoparticle and substance.core != self.core - if values - candidate_descriptors = relevant_features.keys.collect{|i| substance.properties[i].is_a?(Array) ? substance.properties[i].median : nil } - q = [] - c = [] - w = [] - (0..relevant_features.size-1).each do |i| - # add only complete pairs - if query_descriptors[i] and candidate_descriptors[i] - w << weights[i] - # scale values - q << (query_descriptors[i] - means[i])/standard_deviations[i] - c << (candidate_descriptors[i] - means[i])/standard_deviations[i] - end - end - sim = Algorithm.run similarity[:method], [q, c, w] - neighbors << {"_id" => substance.id, "measurements" => values, "descriptors" => candidate_descriptors, "similarity" => sim} if sim >= similarity[:min] - end - end - neighbors.sort{|a,b| b["similarity"] <=> a["similarity"]} - end - end -- cgit v1.2.3 From 1810b12e7faf2f0677482a3c7a8c23e0e11b8d29 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 12 Oct 2016 21:44:29 +0200 Subject: R NAs fixed --- lib/caret.rb | 3 +-- lib/feature_selection.rb | 6 ++---- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/caret.rb b/lib/caret.rb index 59e02da..886e2f9 100644 --- a/lib/caret.rb +++ b/lib/caret.rb @@ -71,8 +71,7 @@ module OpenTox def self.to_r v return "F" if v == false return "T" if v == true - return "NA" if v.nil? - return "NA" if v.is_a? Float and v.nan? + return nil if v.is_a? Float and v.nan? v end diff --git a/lib/feature_selection.rb b/lib/feature_selection.rb index f599539..65f9752 100644 --- a/lib/feature_selection.rb +++ b/lib/feature_selection.rb @@ -10,7 +10,8 @@ module OpenTox selected_variables = [] selected_descriptor_ids = [] model.independent_variables.each_with_index do |v,i| - R.assign "independent", v.collect{|n| to_r(n)} + v.collect!{|n| to_r(n)} + R.assign "independent", v begin R.eval "cor <- cor.test(dependent,independent,method = 'pearson',use='pairwise')" pvalue = R.eval("cor$p.value").to_ruby @@ -20,7 +21,6 @@ module OpenTox selected_descriptor_ids << model.descriptor_ids[i] end rescue - #warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with '#{Feature.find(model.descriptor_ids[i]).name}' (#{v}) failed." warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with (#{v}) failed." end end @@ -33,8 +33,6 @@ module OpenTox def self.to_r v return 0 if v == false return 1 if v == true - return "NA" if v.nil? - return "NA" if v.is_a? Float and v.nan? v end -- cgit v1.2.3 From 8d325866dd7cacdd04bd2306a9144a5e7300c7c8 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 13 Oct 2016 10:11:09 +0200 Subject: molecular_weight fixed --- lib/compound.rb | 5 +++-- lib/model.rb | 4 ++-- lib/regression.rb | 1 - 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index 0f178ce..ca9d5e3 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -75,9 +75,10 @@ module OpenTox fingerprints[type] end - def calculated_properties types=["OPENBABEL"] + def calculated_properties types=["PhysChem::OPENBABEL"] descriptors = [] types.each do |t| + p t descriptors += PhysChem.descriptors OpenTox.const_get(t) end # TODO: speedup java descriptors @@ -304,7 +305,7 @@ module OpenTox # @return [Float] molecular weight def molecular_weight mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW") - calculated_physchem([mw_feature])[mw_feature.id.to_s] + calculated_properties[mw_feature.id.to_s] end private diff --git a/lib/model.rb b/lib/model.rb index f3f0603..859df8b 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -126,7 +126,8 @@ module OpenTox end model.descriptor_ids = model.fingerprints.flatten.uniq model.descriptor_ids.each do |d| - model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} + # resulting model may break BSON size limit (e.g. f Kazius dataset + model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/ end else # parse independent_variables @@ -225,7 +226,6 @@ module OpenTox else # call prediction algorithm result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors - p result prediction.merge! result prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}} end diff --git a/lib/regression.rb b/lib/regression.rb index bed6df8..d1724fd 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -4,7 +4,6 @@ module OpenTox class Regression def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables: - #def self.weighted_average descriptors:nil, neighbors:, parameters:nil, method:nil, relevant_features:nil # TODO: prediction_interval weighted_sum = 0.0 sim_sum = 0.0 -- cgit v1.2.3 From c3a7e75cb36908da36d155cad5478800e32aaf5f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 13 Oct 2016 10:47:19 +0200 Subject: test_physchem fixed --- lib/compound.rb | 1 - 1 file changed, 1 deletion(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index ca9d5e3..72882d0 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -78,7 +78,6 @@ module OpenTox def calculated_properties types=["PhysChem::OPENBABEL"] descriptors = [] types.each do |t| - p t descriptors += PhysChem.descriptors OpenTox.const_get(t) end # TODO: speedup java descriptors -- cgit v1.2.3 From 9e99495ecbff147218023c136bade9e56a502fed Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 13 Oct 2016 14:39:04 +0200 Subject: descriptor tests fixed --- lib/compound.rb | 14 ++++++-------- lib/model.rb | 4 ++-- lib/nanoparticle.rb | 2 -- 3 files changed, 8 insertions(+), 12 deletions(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index 72882d0..b47364c 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -75,11 +75,7 @@ module OpenTox fingerprints[type] end - def calculated_properties types=["PhysChem::OPENBABEL"] - descriptors = [] - types.each do |t| - descriptors += PhysChem.descriptors OpenTox.const_get(t) - end + def calculate_properties descriptors=PhysChem::OPENBABEL # TODO: speedup java descriptors calculated_ids = properties.keys # BSON::ObjectId instances are not allowed as keys in a BSON document. @@ -98,7 +94,8 @@ module OpenTox end end save - properties.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} + descriptors.collect{|d| properties[d.id.to_s]} + #properties.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} end def smarts_match smarts, count=false @@ -303,8 +300,9 @@ module OpenTox # Calculate molecular weight of Compound with OB and store it in object # @return [Float] molecular weight def molecular_weight - mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW") - calculated_properties[mw_feature.id.to_s] + mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW").id.to_s + calculate_properties unless properties[mw_feature] + properties[mw_feature] end private diff --git a/lib/model.rb b/lib/model.rb index 859df8b..7029c31 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -131,7 +131,7 @@ module OpenTox end else # parse independent_variables - if (model.algorithms[:descriptors] & ["PhysChem::OPENBABEL","PhysChem::CDK","PhysChem::JOELIB"]).empty? + if (model.algorithms[:descriptors] & [PhysChem::OPENBABEL,PhysChem::CDK,PhysChem::JOELIB]).empty? properties = model.substances.collect { |s| s.properties } all_property_ids = properties.collect{|p| p.keys}.flatten.uniq model.descriptor_ids = all_property_ids.select{|id| model.algorithms[:descriptors].include? Feature.find(id).category } @@ -139,7 +139,7 @@ module OpenTox # calculate physchem properties else - properties = model.substances.collect { |s| s.calculated_properties(model.algorithms[:descriptors]) } + properties = model.substances.collect { |s| s.calculate_properties(model.algorithms[:descriptors]) } model.descriptor_ids = properties.collect{|p| p.keys}.flatten.uniq model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i]}} end diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index f74f263..23e155c 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -6,8 +6,6 @@ module OpenTox field :core, type: Hash, default: {} field :coating, type: Array, default: [] - attr_accessor :scaled_values - def add_feature feature, value, dataset unless feature.name == "ATOMIC COMPOSITION" or feature.name == "FUNCTIONAL GROUP" # redundand case feature.category -- cgit v1.2.3 From ad7ec6a1e33f69557fe64371581d5f42a65ecaa8 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 13 Oct 2016 17:34:31 +0200 Subject: classification fixed --- lib/model.rb | 63 ++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 25 deletions(-) (limited to 'lib') diff --git a/lib/model.rb b/lib/model.rb index 7029c31..b949042 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -57,7 +57,10 @@ module OpenTox if substance_classes.first == "OpenTox::Compound" model.algorithms = { - :descriptors => ['MP2D'], + :descriptors => { + :method => "fingerprint", + :type => "MP2D", + }, :similarity => { :method => "Algorithm::Similarity.tanimoto", :min => 0.1 @@ -77,7 +80,10 @@ module OpenTox elsif substance_classes.first == "OpenTox::Nanoparticle" model.algorithms = { - :descriptors => ["P-CHEM"], + :descriptors => { + :method => "properties", + :category => "P-CHEM", + }, #:descriptors => ["P-CHEM","Proteomics"], :similarity => { :method => "Algorithm::Similarity.weighted_cosine", @@ -115,34 +121,41 @@ module OpenTox end if values end + descriptor_method = model.algorithms[:descriptors][:method] + case descriptor_method # parse fingerprints - if model.fingerprints? - model.algorithms[:descriptors].each do |type| - model.substances.each_with_index do |s,i| - model.fingerprints[i] ||= [] - model.fingerprints[i] += s.fingerprint(type) - model.fingerprints[i].uniq! - end + when "fingerprint" + type = model.algorithms[:descriptors][:type] + model.substances.each_with_index do |s,i| + model.fingerprints[i] ||= [] + model.fingerprints[i] += s.fingerprint(type) + model.fingerprints[i].uniq! end model.descriptor_ids = model.fingerprints.flatten.uniq model.descriptor_ids.each do |d| - # resulting model may break BSON size limit (e.g. f Kazius dataset + # resulting model may break BSON size limit (e.g. f Kazius dataset) model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? d} if model.algorithms[:prediction][:method].match /Caret/ end - else - # parse independent_variables - if (model.algorithms[:descriptors] & [PhysChem::OPENBABEL,PhysChem::CDK,PhysChem::JOELIB]).empty? - properties = model.substances.collect { |s| s.properties } - all_property_ids = properties.collect{|p| p.keys}.flatten.uniq - model.descriptor_ids = all_property_ids.select{|id| model.algorithms[:descriptors].include? Feature.find(id).category } - model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} - - # calculate physchem properties - else - properties = model.substances.collect { |s| s.calculate_properties(model.algorithms[:descriptors]) } - model.descriptor_ids = properties.collect{|p| p.keys}.flatten.uniq - model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i]}} + # calculate physchem properties + when "calculate_properties" + features = model.algorithms[:descriptors][:features] + model.descriptor_ids = features.collect{|f| f.id.to_s} + model.algorithms[:descriptors].delete(:features) + model.algorithms[:descriptors].delete(:type) + model.substances.each_with_index do |s,i| + s.calculate_properties(features).each_with_index do |v,j| + model.independent_variables[j] ||= [] + model.independent_variables[j][i] = v + end end + # parse independent_variables + when "properties" + properties = model.substances.collect { |s| s.properties } + all_property_ids = properties.collect{|p| p.keys}.flatten.uniq + model.descriptor_ids = all_property_ids.select{|id| model.algorithms[:descriptors].include? Feature.find(id).category } + model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} + else + bad_request_error "Descriptor method '#{descriptor_method}' not implemented." end if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method] @@ -165,7 +178,7 @@ module OpenTox case algorithms[:similarity][:method] when /tanimoto/ # binary features - similarity_descriptors = algorithms[:descriptors].collect{|type| substance.fingerprint(type)}.flatten.uniq + similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type] # TODO this excludes descriptors only present in the query substance query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id} when /euclid|cosine/ # quantitative features @@ -295,7 +308,7 @@ module OpenTox end def fingerprints? - algorithms[:similarity][:method].match("tanimoto") ? true : false + algorithms[:descriptors][:method] == "fingerprint" ? true : false end end -- cgit v1.2.3 From 160e75e696452ac61e651664ac56d16ce1c9c4b6 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 13 Oct 2016 19:17:03 +0200 Subject: model tests separated and cleaned --- lib/model.rb | 40 ++++++++++++++++++++++++++-------------- lib/similarity.rb | 1 + 2 files changed, 27 insertions(+), 14 deletions(-) (limited to 'lib') diff --git a/lib/model.rb b/lib/model.rb index b949042..4bbb7da 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -82,7 +82,7 @@ module OpenTox model.algorithms = { :descriptors => { :method => "properties", - :category => "P-CHEM", + :categories => ["P-CHEM"], }, #:descriptors => ["P-CHEM","Proteomics"], :similarity => { @@ -150,9 +150,14 @@ module OpenTox end # parse independent_variables when "properties" + categories = model.algorithms[:descriptors][:categories] + feature_ids = [] + categories.each do |category| + Feature.where(category:category).each{|f| feature_ids << f.id.to_s} + end properties = model.substances.collect { |s| s.properties } - all_property_ids = properties.collect{|p| p.keys}.flatten.uniq - model.descriptor_ids = all_property_ids.select{|id| model.algorithms[:descriptors].include? Feature.find(id).category } + property_ids = properties.collect{|p| p.keys}.flatten.uniq + model.descriptor_ids = feature_ids & property_ids model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} else bad_request_error "Descriptor method '#{descriptor_method}' not implemented." @@ -180,18 +185,25 @@ module OpenTox when /tanimoto/ # binary features similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type] # TODO this excludes descriptors only present in the query substance + # use for applicability domain? query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id} when /euclid|cosine/ # quantitative features - similarity_descriptors = descriptor_ids.collect_with_index{|id,i| - prop = substance.properties[id] - prop = prop.median if prop.is_a? Array # measured - (prop-descriptor_means[i])/descriptor_sds[i] - } - query_descriptors = descriptor_ids.collect_with_index{|id,i| - prop = substance.properties[id] - prop = prop.median if prop.is_a? Array # measured - substance.properties[id] - } + if algorithms[:descriptors][:method] == "calculate_properties" # calculate descriptors + features = descriptor_ids.collect{|id| Feature.find(id)} + query_descriptors = substance.calculate_properties(features) + similarity_descriptors = query_descriptors.collect_with_index{|v,i| (v-descriptor_means[i])/descriptor_sds[i]} + else + similarity_descriptors = descriptor_ids.collect_with_index{|id,i| + prop = substance.properties[id] + prop = prop.median if prop.is_a? Array # measured + (prop-descriptor_means[i])/descriptor_sds[i] + } + query_descriptors = descriptor_ids.collect_with_index{|id,i| + prop = substance.properties[id] + prop = prop.median if prop.is_a? Array # measured + substance.properties[id] + } + end else bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'." end @@ -218,7 +230,7 @@ module OpenTox neighbor_descriptors = scaled_variables.collect{|v| v[i]} end sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights] - if sim > algorithms[:similarity][:min] + if sim >= algorithms[:similarity][:min] neighbor_ids << s neighbor_similarities << sim neighbor_dependent_variables << dependent_variables[i] diff --git a/lib/similarity.rb b/lib/similarity.rb index 328d42a..772e812 100644 --- a/lib/similarity.rb +++ b/lib/similarity.rb @@ -32,6 +32,7 @@ module OpenTox def self.weighted_cosine scaled_properties # [a,b,weights] a,b,w = remove_nils scaled_properties + return cosine(scaled_properties) if w.uniq.size == 1 dot_product = 0 magnitude_a = 0 magnitude_b = 0 -- cgit v1.2.3 From 2dc66aef3b7932105868ee8c7d32ad975e142d1b Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 13 Oct 2016 19:48:21 +0200 Subject: compound tests fixed --- lib/caret.rb | 4 ++-- lib/compound.rb | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/caret.rb b/lib/caret.rb index 886e2f9..df86093 100644 --- a/lib/caret.rb +++ b/lib/caret.rb @@ -30,8 +30,8 @@ module OpenTox R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)" rescue => e $logger.debug "R caret model creation error for:" - $logger.debug JSON.pretty_generate(dependent_variables) - $logger.debug JSON.pretty_generate(independent_variables) + $logger.debug dependent_variables + $logger.debug independent_variables return {:value => nil, :warning => "R caret model cration error."} end begin diff --git a/lib/compound.rb b/lib/compound.rb index b47364c..6c53cde 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -300,9 +300,8 @@ module OpenTox # Calculate molecular weight of Compound with OB and store it in object # @return [Float] molecular weight def molecular_weight - mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW").id.to_s - calculate_properties unless properties[mw_feature] - properties[mw_feature] + mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW") + calculate_properties([mw_feature]).first end private -- cgit v1.2.3 From 09452bba5c407c27721223d126e3f45c12b20a0c Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 13 Oct 2016 22:59:45 +0200 Subject: tests pass --- lib/caret.rb | 5 +++++ lib/model.rb | 32 +++++++++----------------------- lib/regression.rb | 2 +- 3 files changed, 15 insertions(+), 24 deletions(-) (limited to 'lib') diff --git a/lib/caret.rb b/lib/caret.rb index df86093..2c4cd0c 100644 --- a/lib/caret.rb +++ b/lib/caret.rb @@ -9,6 +9,11 @@ module OpenTox if independent_variables.flatten.uniq == ["NA"] prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." + elsif + dependent_variables.size < 3 + prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights + prediction[:warning] = "Insufficient number of neighbors (#{dependent_variables.size}) for regression model. Using weighted average of similar substances." + else dependent_variables.each_with_index do |v,i| dependent_variables[i] = to_r(v) diff --git a/lib/model.rb b/lib/model.rb index 4bbb7da..d7b072f 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -28,23 +28,9 @@ module OpenTox bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset prediction_feature = training_dataset.features.first unless prediction_feature # TODO: prediction_feature without training_dataset: use all available data - # explicit prediction algorithm - if algorithms[:prediction] and algorithms[:prediction][:method] - case algorithms[:prediction][:method] - when /Classification/i - model = LazarClassification.new - when /Regression/i - model = LazarRegression.new - else - bad_request_error "Prediction method '#{algorithms[:prediction][:method]}' not implemented." - end # guess model type - elsif prediction_feature.numeric? - model = LazarRegression.new - else - model = LazarClassification.new - end + prediction_feature.numeric? ? model = LazarRegression.new : model = LazarClassification.new model.prediction_feature_id = prediction_feature.id model.training_dataset_id = training_dataset.id @@ -193,17 +179,17 @@ module OpenTox query_descriptors = substance.calculate_properties(features) similarity_descriptors = query_descriptors.collect_with_index{|v,i| (v-descriptor_means[i])/descriptor_sds[i]} else - similarity_descriptors = descriptor_ids.collect_with_index{|id,i| - prop = substance.properties[id] - prop = prop.median if prop.is_a? Array # measured - (prop-descriptor_means[i])/descriptor_sds[i] - } - query_descriptors = descriptor_ids.collect_with_index{|id,i| + similarity_descriptors = [] + query_descriptors = [] + descriptor_ids.each_with_index do |id,i| prop = substance.properties[id] prop = prop.median if prop.is_a? Array # measured - substance.properties[id] - } + if prop + similarity_descriptors[i] = (prop-descriptor_means[i])/descriptor_sds[i] + query_descriptors[i] = prop + end end + end else bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'." end diff --git a/lib/regression.rb b/lib/regression.rb index d1724fd..3890987 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -3,7 +3,7 @@ module OpenTox class Regression - def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables: + def self.weighted_average dependent_variables:, independent_variables:nil, weights:, query_variables:nil # TODO: prediction_interval weighted_sum = 0.0 sim_sum = 0.0 -- cgit v1.2.3 From fbded88db8b51f41ffbd5a02f601e4538ec87258 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 14 Oct 2016 09:55:51 +0200 Subject: git commit added to model metadata --- lib/caret.rb | 9 ++++++++- lib/compound.rb | 2 -- lib/dataset.rb | 1 - lib/model.rb | 11 +++++++++++ lib/rest-client-wrapper.rb | 6 ------ 5 files changed, 19 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/lib/caret.rb b/lib/caret.rb index 2c4cd0c..e24c943 100644 --- a/lib/caret.rb +++ b/lib/caret.rb @@ -2,10 +2,17 @@ module OpenTox module Algorithm class Caret - # TODO classification # model list: https://topepo.github.io/caret/modelList.html def self.create_model_and_predict dependent_variables:, independent_variables:, weights:, method:, query_variables: + remove = [] + # remove independent_variables with single values + independent_variables.each_with_index { |values,i| remove << i if values.uniq.size == 1} + remove.sort.reverse.each do |i| + independent_variables.delete_at i + weights.delete_at i + query_variables.delete_at i + end if independent_variables.flatten.uniq == ["NA"] prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." diff --git a/lib/compound.rb b/lib/compound.rb index 6c53cde..e2a55ea 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -76,7 +76,6 @@ module OpenTox end def calculate_properties descriptors=PhysChem::OPENBABEL - # TODO: speedup java descriptors calculated_ids = properties.keys # BSON::ObjectId instances are not allowed as keys in a BSON document. new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids @@ -95,7 +94,6 @@ module OpenTox end save descriptors.collect{|d| properties[d.id.to_s]} - #properties.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} end def smarts_match smarts, count=false diff --git a/lib/dataset.rb b/lib/dataset.rb index 453fc35..ab55294 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -130,7 +130,6 @@ module OpenTox #end # Create a dataset from CSV file - # TODO: document structure def self.from_csv_file file, accept_empty_values=false source = file name = File.basename(file,".*") diff --git a/lib/model.rb b/lib/model.rb index d7b072f..7503215 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -23,6 +23,7 @@ module OpenTox field :descriptor_means, type: Array, default:[] field :descriptor_sds, type: Array, default:[] field :scaled_variables, type: Array, default:[] + field :version, type: Hash, default:{} def self.create prediction_feature:nil, training_dataset:nil, algorithms:{} bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset @@ -35,6 +36,16 @@ module OpenTox model.prediction_feature_id = prediction_feature.id model.training_dataset_id = training_dataset.id model.name = "#{prediction_feature.name} (#{training_dataset.name})" + # TODO: check if this works for gem version, add gem versioning? + dir = File.dirname(__FILE__) + commit = `cd #{dir}; git rev-parse HEAD`.chomp + branch = `cd #{dir}; git rev-parse --abbrev-ref HEAD`.chomp + url = `cd #{dir}; git config --get remote.origin.url`.chomp + if branch + model.version = {:url => url, :branch => branch, :commit => commit} + else + model.version = {:warning => "git is not installed"} + end # set defaults substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq diff --git a/lib/rest-client-wrapper.rb b/lib/rest-client-wrapper.rb index 9321a75..2073be2 100644 --- a/lib/rest-client-wrapper.rb +++ b/lib/rest-client-wrapper.rb @@ -55,14 +55,8 @@ module OpenTox if [301, 302, 307].include? response.code and request.method == :get response.follow_redirection(request, result) elsif response.code >= 400 and !URI.task?(uri) - #TODO add parameters to error-report - #parameters = request.args - #parameters[:headers][:subjectid] = "REMOVED" if parameters[:headers] and parameters[:headers][:subjectid] - #parameters[:url] = parameters[:url].gsub(/(http|https|)\:\/\/[a-zA-Z0-9\-]+\:[a-zA-Z0-9]+\@/, "REMOVED@") if parameters[:url] - #message += "\nREST parameters:\n#{parameters.inspect}" error = known_errors.collect{|e| e if e[:code] == response.code}.compact.first begin # errors are returned as error reports in json, try to parse - # TODO: may be the reason for failure of task.rb -n test_11_wait_for_error_task content = JSON.parse(response) msg = content["message"].to_s cause = content["errorCause"].to_s -- cgit v1.2.3 From aada2ff67eaba251d1eeedb7f3eb29282706f997 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 14 Oct 2016 11:09:50 +0200 Subject: weighted average for failed caret predictions fixed --- lib/caret.rb | 9 ++++++--- lib/compound.rb | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/caret.rb b/lib/caret.rb index e24c943..18bfc41 100644 --- a/lib/caret.rb +++ b/lib/caret.rb @@ -10,7 +10,6 @@ module OpenTox independent_variables.each_with_index { |values,i| remove << i if values.uniq.size == 1} remove.sort.reverse.each do |i| independent_variables.delete_at i - weights.delete_at i query_variables.delete_at i end if independent_variables.flatten.uniq == ["NA"] @@ -44,7 +43,9 @@ module OpenTox $logger.debug "R caret model creation error for:" $logger.debug dependent_variables $logger.debug independent_variables - return {:value => nil, :warning => "R caret model cration error."} + prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights + prediction[:warning] = "R caret model creation error. Using weighted average of similar substances." + return prediction end begin R.eval "query <- data.frame(rbind(c(#{query_variables.join ','})))" @@ -63,7 +64,9 @@ module OpenTox rescue => e $logger.debug "R caret prediction error for:" $logger.debug self.inspect - return nil + prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights + prediction[:warning] = "R caret prediction error. Using weighted average of similar substances" + return prediction end if prediction.nil? or prediction[:value].nil? prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights diff --git a/lib/compound.rb b/lib/compound.rb index e2a55ea..a399169 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -254,7 +254,7 @@ module OpenTox end def db_neighbors min_sim: 0.1, dataset_id: - p fingerprints[DEFAULT_FINGERPRINT] + #p fingerprints[DEFAULT_FINGERPRINT] # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb #qn = default_fingerprint_size -- cgit v1.2.3 From 8519274487166d75b3b9ae28e61f7a7be9f7e83c Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 27 Oct 2016 11:58:07 +0200 Subject: probability plot for classification validations --- lib/crossvalidation.rb | 18 ++++++++---- lib/leave-one-out-validation.rb | 3 ++ lib/train-test-validation.rb | 14 +++++++++ lib/validation-statistics.rb | 64 ++++++++++++++++++++++++----------------- 4 files changed, 67 insertions(+), 32 deletions(-) (limited to 'lib') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 15d1031..4f779a2 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -64,14 +64,16 @@ module OpenTox field :weighted_accuracy, type: Float field :true_rate, type: Hash field :predictivity, type: Hash - field :confidence_plot_id, type: BSON::ObjectId + field :probability_plot_id, type: BSON::ObjectId end class RegressionCrossValidation < CrossValidation include RegressionStatistics - field :rmse, type: Float - field :mae, type: Float + field :rmse, type: Float, default:0 + field :mae, type: Float, default:0 field :r_squared, type: Float + field :within_prediction_interval, type: Integer, default:0 + field :out_of_prediction_interval, type: Integer, default:0 field :correlation_plot_id, type: BSON::ObjectId end @@ -93,6 +95,7 @@ module OpenTox crossvalidation_ids.collect{|id| CrossValidation.find(id)} end +=begin def correlation_plot format: "png" #unless correlation_plot_id feature = Feature.find(crossvalidations.first.model.prediction_feature) @@ -104,16 +107,18 @@ module OpenTox x = [] y = [] cv.predictions.each do |sid,p| - x << p["value"] - y << p["measurements"].median + x << p["measurements"].median + y << p["value"] end R.assign "measurement", x R.assign "prediction", y R.eval "all = c(measurement,prediction)" R.eval "range = c(min(all), max(all))" - R.eval "image#{i} = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" + R.eval "image#{i} = qplot(prediction,measurement,main='#{title} #{i}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" R.eval "image#{i} = image#{i} + geom_abline(intercept=0, slope=1)" images << "image#{i}" + + R.eval "ggsave(file='/home/ist/lazar/test/tmp#{i}.pdf', plot=image#{i})" end R.eval "pdf('#{tmpfile}')" R.eval "grid.arrange(#{images.join ","},ncol=#{images.size})" @@ -124,6 +129,7 @@ module OpenTox #end $gridfs.find_one(_id: correlation_plot_id).data end +=end end end diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index 59f43c5..538b7b3 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -5,6 +5,7 @@ module OpenTox class LeaveOneOut < Validation def self.create model + bad_request_error "Cannot create leave one out validation for models with supervised feature selection. Please use crossvalidation instead." if model.algorithms[:feature_selection] $logger.debug "#{model.name}: LOO validation started" t = Time.now model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOut : klass = RegressionLeaveOneOut @@ -48,6 +49,8 @@ module OpenTox field :rmse, type: Float, default: 0 field :mae, type: Float, default: 0 field :r_squared, type: Float + field :within_prediction_interval, type: Integer, default:0 + field :out_of_prediction_interval, type: Integer, default:0 field :correlation_plot_id, type: BSON::ObjectId end diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb index e3f5905..71abad2 100644 --- a/lib/train-test-validation.rb +++ b/lib/train-test-validation.rb @@ -44,10 +44,24 @@ module OpenTox class ClassificationTrainTest < TrainTest include ClassificationStatistics + field :accept_values, type: Array + field :confusion_matrix, type: Array + field :weighted_confusion_matrix, type: Array + field :accuracy, type: Float + field :weighted_accuracy, type: Float + field :true_rate, type: Hash + field :predictivity, type: Hash + field :probability_plot_id, type: BSON::ObjectId end class RegressionTrainTest < TrainTest include RegressionStatistics + field :rmse, type: Float, default:0 + field :mae, type: Float, default:0 + field :r_squared, type: Float + field :within_prediction_interval, type: Integer, default:0 + field :out_of_prediction_interval, type: Integer, default:0 + field :correlation_plot_id, type: BSON::ObjectId end end diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 3582c71..4ab4b13 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -65,43 +65,44 @@ module OpenTox } end - def confidence_plot - unless confidence_plot_id - tmpfile = "/tmp/#{id.to_s}_confidence.svg" + def probability_plot format: "pdf" + #unless probability_plot_id + tmpfile = "/tmp/#{id.to_s}_probability.#{format}" accuracies = [] - confidences = [] + probabilities = [] correct_predictions = 0 incorrect_predictions = 0 - predictions.each do |p| - p[:measurements].each do |db_act| - if p[:value] - p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1 - accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f - confidences << p[:confidence] - - end + pp = [] + predictions.values.select{|p| p["probabilities"]}.compact.each do |p| + p["measurements"].each do |m| + pp << [ p["probabilities"][p["value"]], p["value"] == m ] end end + pp.sort_by!{|p| 1-p.first} + pp.each do |p| + p[1] ? correct_predictions += 1 : incorrect_predictions += 1 + accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f + probabilities << p[0] + end R.assign "accuracy", accuracies - R.assign "confidence", confidences - R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()" + R.assign "probability", probabilities + R.eval "image = qplot(probability,accuracy)+ylab('Accumulated accuracy')+xlab('Prediction probability')+ylim(c(0,1))+scale_x_reverse()+geom_line()" R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.svg") plot_id = $gridfs.insert_one(file) - update(:confidence_plot_id => plot_id) - end - $gridfs.find_one(_id: confidence_plot_id).data + update(:probability_plot_id => plot_id) + #end + $gridfs.find_one(_id: probability_plot_id).data end end module RegressionStatistics def statistics - # TODO: predictions within prediction_interval self.rmse = 0 self.mae = 0 - #self.within_prediction_interval = 0 - #self.outside_prediction_interval = 0 + self.within_prediction_interval = 0 + self.out_of_prediction_interval = 0 x = [] y = [] predictions.each do |cid,pred| @@ -111,9 +112,13 @@ module OpenTox error = pred[:value]-pred[:measurements].median self.rmse += error**2 self.mae += error.abs - #if pred[:prediction_interval] - #if pred[:measurements] - #end + if pred[:prediction_interval] + if pred[:measurements].median >= pred[:prediction_interval][0] and pred[:measurements].median <= pred[:prediction_interval][1] + self.within_prediction_interval += 1 + else + self.out_of_prediction_interval += 1 + end + end else warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." @@ -128,16 +133,23 @@ module OpenTox $logger.debug "R^2 #{r_squared}" $logger.debug "RMSE #{rmse}" $logger.debug "MAE #{mae}" + $logger.debug "#{percent_within_prediction_interval.round(2)}% measurements within prediction interval" save { :mae => mae, :rmse => rmse, :r_squared => r_squared, + :within_prediction_interval => within_prediction_interval, + :out_of_prediction_interval => out_of_prediction_interval, } end + def percent_within_prediction_interval + 100*within_prediction_interval.to_f/(within_prediction_interval+out_of_prediction_interval) + end + def correlation_plot format: "png" - unless correlation_plot_id + #unless correlation_plot_id tmpfile = "/tmp/#{id.to_s}_correlation.#{format}" x = [] y = [] @@ -158,7 +170,7 @@ module OpenTox file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}") plot_id = $gridfs.insert_one(file) update(:correlation_plot_id => plot_id) - end + #end $gridfs.find_one(_id: correlation_plot_id).data end -- cgit v1.2.3 From 5418c2477a1a48b06f97d693f6c117336aec5b4c Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 27 Oct 2016 12:09:06 +0200 Subject: GridFS storage for plots. --- lib/validation-statistics.rb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 4ab4b13..b251bdb 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -66,7 +66,7 @@ module OpenTox end def probability_plot format: "pdf" - #unless probability_plot_id + unless probability_plot_id tmpfile = "/tmp/#{id.to_s}_probability.#{format}" accuracies = [] probabilities = [] @@ -91,7 +91,7 @@ module OpenTox file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.svg") plot_id = $gridfs.insert_one(file) update(:probability_plot_id => plot_id) - #end + end $gridfs.find_one(_id: probability_plot_id).data end end @@ -133,7 +133,7 @@ module OpenTox $logger.debug "R^2 #{r_squared}" $logger.debug "RMSE #{rmse}" $logger.debug "MAE #{mae}" - $logger.debug "#{percent_within_prediction_interval.round(2)}% measurements within prediction interval" + $logger.debug "#{percent_within_prediction_interval.round(2)}% of measurements within prediction interval" save { :mae => mae, @@ -149,7 +149,7 @@ module OpenTox end def correlation_plot format: "png" - #unless correlation_plot_id + unless correlation_plot_id tmpfile = "/tmp/#{id.to_s}_correlation.#{format}" x = [] y = [] @@ -170,7 +170,7 @@ module OpenTox file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}") plot_id = $gridfs.insert_one(file) update(:correlation_plot_id => plot_id) - #end + end $gridfs.find_one(_id: correlation_plot_id).data end -- cgit v1.2.3 From 280f81dcffb3b8b929ff9cbe92ba17403f5a9dd3 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 28 Oct 2016 12:31:53 +0200 Subject: adjusted r^2 removed (does not apply well to local models) --- lib/crossvalidation.rb | 2 -- lib/validation-statistics.rb | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 4f779a2..be680ae 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -95,7 +95,6 @@ module OpenTox crossvalidation_ids.collect{|id| CrossValidation.find(id)} end -=begin def correlation_plot format: "png" #unless correlation_plot_id feature = Feature.find(crossvalidations.first.model.prediction_feature) @@ -129,7 +128,6 @@ module OpenTox #end $gridfs.find_one(_id: correlation_plot_id).data end -=end end end diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index b251bdb..799bb34 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -139,6 +139,7 @@ module OpenTox :mae => mae, :rmse => rmse, :r_squared => r_squared, + :r_squared_adjusted => r_squared_adjusted, :within_prediction_interval => within_prediction_interval, :out_of_prediction_interval => out_of_prediction_interval, } -- cgit v1.2.3 From c6e86fc1bfee7cb91782dd7067408d78a8e48ed9 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 8 Nov 2016 16:04:49 +0100 Subject: probability plot for classification --- lib/validation-statistics.rb | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 799bb34..b6f8a60 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -66,8 +66,13 @@ module OpenTox end def probability_plot format: "pdf" - unless probability_plot_id - tmpfile = "/tmp/#{id.to_s}_probability.#{format}" + #unless probability_plot_id + + #tmpdir = File.join(ENV["HOME"], "tmp") + tmpdir = "/tmp" + #p tmpdir + FileUtils.mkdir_p tmpdir + tmpfile = File.join(tmpdir,"#{id.to_s}_probability.#{format}") accuracies = [] probabilities = [] correct_predictions = 0 @@ -91,7 +96,7 @@ module OpenTox file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.svg") plot_id = $gridfs.insert_one(file) update(:probability_plot_id => plot_id) - end + #end $gridfs.find_one(_id: probability_plot_id).data end end @@ -139,7 +144,6 @@ module OpenTox :mae => mae, :rmse => rmse, :r_squared => r_squared, - :r_squared_adjusted => r_squared_adjusted, :within_prediction_interval => within_prediction_interval, :out_of_prediction_interval => out_of_prediction_interval, } -- cgit v1.2.3 From 295dcfc74e1375e495ec3d9c1e74a402eb4decd4 Mon Sep 17 00:00:00 2001 From: gebele Date: Thu, 10 Nov 2016 11:06:27 +0000 Subject: added nanomodel create --- lib/model.rb | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'lib') diff --git a/lib/model.rb b/lib/model.rb index 7503215..adcbcc6 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -418,6 +418,28 @@ module OpenTox prediction_model end + def self.create dir: dir, algorithms: algorithms + training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + unless training_dataset + Import::Enanomapper.import dir + training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + end + prediction_model = self.new( + :endpoint => "log2(Net cell association)", + :source => "https://data.enanomapper.net/", + :species => "A549 human lung epithelial carcinoma cells", + :unit => "log2(ug/Mg)" + ) + prediction_feature = Feature.where(name: "log2(Net cell association)", category: "TOX").first + model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms) + prediction_model[:model_id] = model.id + repeated_cv = Validation::RepeatedCrossValidation.create model + prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id + #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id + prediction_model.save + prediction_model + end + end end -- cgit v1.2.3 From 9e7b36613e98601de7b2ceb2d4442e11f1ae868a Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 10 Nov 2016 12:23:46 +0100 Subject: intermediate commit, may be defunct --- lib/compound.rb | 3 --- lib/import.rb | 44 +++++++++++++++++++++++++++++++------------- lib/model.rb | 11 +++++++---- lib/nanoparticle.rb | 46 ++++++++++++++++++++++++++++++++++------------ 4 files changed, 72 insertions(+), 32 deletions(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index a399169..8a1143b 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -136,9 +136,6 @@ module OpenTox # @param inchi [String] smiles InChI string # @return [OpenTox::Compound] Compound def self.from_inchi inchi - # Temporary workaround for OpenBabels Inchi bug - # http://sourceforge.net/p/openbabel/bugs/957/ - # bug has not been fixed in latest git/development version #smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip smiles = obconversion(inchi,"inchi","can") if smiles.empty? diff --git a/lib/import.rb b/lib/import.rb index 8e57401..541c9b5 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -9,6 +9,12 @@ module OpenTox #get list of bundle URIs bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] File.open(File.join(dir,"bundles.json"),"w+"){|f| f.puts JSON.pretty_generate(bundles)} + # bundles + # id/summary + # id/compound + # id/substance + # id/property + bundles.each do |bundle| $logger.debug bundle["title"] nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"] @@ -32,32 +38,43 @@ module OpenTox t2 = 0 datasets = {} JSON.parse(File.read(File.join(dir,"bundles.json"))).each do |bundle| + if bundle["id"] == 3 datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"]) + end end - Dir[File.join(dir,"study*.json")].each do |s| + # TODO this is only for protein corona + Dir[File.join(dir,"study-F*.json")].each do |s| t = Time.now study = JSON.parse(File.read(s)) np = JSON.parse(File.read(File.join(dir,"nanoparticle-#{study['owner']['substance']['uuid']}.json"))) - core = {} - coating = [] + core_id = nil + coating_ids = [] np["composition"].each do |c| + uri = c["component"]["compound"]["URI"] + uri = CGI.escape File.join(uri,"&media=application/json") + data = JSON.parse(RestClientWrapper.get "https://data.enanomapper.net/query/compound/url/all?media=application/json&search=#{uri}") + smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"] + names = [] + names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] + names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"] + if smiles + compound = Compound.find_or_create_by(:smiles => smiles) + compound.names = names.compact + else + compound = Compound.find_or_create_by(:names => names) + end + compound.save if c["relation"] == "HAS_CORE" - core = { - :uri => c["component"]["compound"]["URI"], - :name => c["component"]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] - } + core_id = compound.id.to_s elsif c["relation"] == "HAS_COATING" - coating << { - :uri => c["component"]["compound"]["URI"], - :name => c["component"]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] - } + coating_ids << compound.id.to_s end end if np["composition"] nanoparticle = Nanoparticle.find_or_create_by( :name => np["values"]["https://data.enanomapper.net/identifier/name"], :source => np["compound"]["URI"], - :core => core, - :coating => coating + :core_id => core_id, + :coating_ids => coating_ids ) np["bundles"].keys.each do |bundle_uri| nanoparticle.dataset_ids << datasets[bundle_uri].id @@ -104,6 +121,7 @@ module OpenTox nanoparticle.parse_ambit_value feature, effect["result"], dataset end end + p nanoparticle nanoparticle.save end datasets.each { |u,d| d.save } diff --git a/lib/model.rb b/lib/model.rb index 7503215..6a5e614 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -81,7 +81,6 @@ module OpenTox :method => "properties", :categories => ["P-CHEM"], }, - #:descriptors => ["P-CHEM","Proteomics"], :similarity => { :method => "Algorithm::Similarity.weighted_cosine", :min => 0.5 @@ -140,10 +139,11 @@ module OpenTox model.algorithms[:descriptors].delete(:features) model.algorithms[:descriptors].delete(:type) model.substances.each_with_index do |s,i| - s.calculate_properties(features).each_with_index do |v,j| + props = s.calculate_properties(features) + props.each_with_index do |v,j| model.independent_variables[j] ||= [] model.independent_variables[j][i] = v - end + end if props and !props.empty? end # parse independent_variables when "properties" @@ -152,7 +152,10 @@ module OpenTox categories.each do |category| Feature.where(category:category).each{|f| feature_ids << f.id.to_s} end - properties = model.substances.collect { |s| s.properties } + #p feature_ids + #properties = Nanoparticle.all.collect { |s| p s.name; p s.id; p s.properties } + properties = model.substances.collect { |s| s.properties } + #p properties property_ids = properties.collect{|p| p.keys}.flatten.uniq model.descriptor_ids = feature_ids & property_ids model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index 23e155c..02d9a89 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -3,8 +3,30 @@ module OpenTox class Nanoparticle < Substance include OpenTox - field :core, type: Hash, default: {} - field :coating, type: Array, default: [] + field :core_id, type: String, default: nil + field :coating_ids, type: Array, default: [] + + def core + Compound.find core_id + end + + def coating + coating_ids.collect{|i| Compound.find i } + end + + def fingerprint type=DEFAULT_FINGERPRINT + core_fp = core.fingerprint type + coating_fp = coating.collect{|c| c.fingerprint type}.flatten.uniq.compact + (core_fp.empty? or coating_fp.empty?) ? [] : (core_fp+coating_fp).uniq.compact + end + + def calculate_properties descriptors=PhysChem::OPENBABEL + if core.smiles and !coating.collect{|c| c.smiles}.compact.empty? + core_prop = core.calculate_properties descriptors + coating_prop = coating.collect{|c| c.calculate_properties descriptors if c.smiles} + descriptors.collect_with_index{|d,i| [core_prop[i],coating_prop.collect{|c| c[i] if c}]} + end + end def add_feature feature, value, dataset unless feature.name == "ATOMIC COMPOSITION" or feature.name == "FUNCTIONAL GROUP" # redundand @@ -37,28 +59,28 @@ module OpenTox add_feature feature, v["loValue"], dataset elsif v.keys.size == 2 and v["errorValue"] add_feature feature, v["loValue"], dataset - warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." + #warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." elsif v.keys.size == 2 and v["loQualifier"] == "mean" add_feature feature, v["loValue"], dataset - warn "'#{feature.name}' is a mean value. Original data is not available." + #warn "'#{feature.name}' is a mean value. Original data is not available." elsif v.keys.size == 2 and v["loQualifier"] #== ">=" - warn "Only min value available for '#{feature.name}', entry ignored" + #warn "Only min value available for '#{feature.name}', entry ignored" elsif v.keys.size == 2 and v["upQualifier"] #== ">=" - warn "Only max value available for '#{feature.name}', entry ignored" + #warn "Only max value available for '#{feature.name}', entry ignored" elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil? add_feature feature, v["loValue"], dataset - warn "loQualifier and upQualifier are empty." + #warn "loQualifier and upQualifier are empty." elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"] == "" and v["upQualifier"] == "" add_feature feature, v["loValue"], dataset - warn "loQualifier and upQualifier are empty." + #warn "loQualifier and upQualifier are empty." elsif v.keys.size == 4 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil? add_feature feature, v["loValue"], dataset - warn "loQualifier and upQualifier are empty." + #warn "loQualifier and upQualifier are empty." elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] and v["loValue"] and v["upValue"] - add_feature feature, [v["loValue"],v["upValue"]].mean, dataset - warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available." + #add_feature feature, [v["loValue"],v["upValue"]].mean, dataset + #warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available." elsif v.size == 4 and v["loQualifier"] == "mean" and v["errorValue"] - warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." + #warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'." add_feature feature, v["loValue"], dataset elsif v == {} # do nothing else -- cgit v1.2.3 From 9a06f2ff5ae6bdbe7dc90555599e186f1585e0d2 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 10 Nov 2016 15:27:26 +0100 Subject: Model::NanoPrediction parameters --- lib/caret.rb | 2 +- lib/import.rb | 7 ++++++- lib/model.rb | 51 +++++++++++++++++++-------------------------------- lib/similarity.rb | 4 ++++ 4 files changed, 30 insertions(+), 34 deletions(-) (limited to 'lib') diff --git a/lib/caret.rb b/lib/caret.rb index 18bfc41..7e4f771 100644 --- a/lib/caret.rb +++ b/lib/caret.rb @@ -12,7 +12,7 @@ module OpenTox independent_variables.delete_at i query_variables.delete_at i end - if independent_variables.flatten.uniq == ["NA"] + if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == [] prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights prediction[:warning] = "No variables for regression model. Using weighted average of similar substances." elsif diff --git a/lib/import.rb b/lib/import.rb index 541c9b5..8f640b1 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -5,7 +5,12 @@ module OpenTox class Enanomapper include OpenTox - def self.mirror dir="." + def self.mirror dir=nil + # clean download dir + dir ||= File.join(File.dirname(__FILE__),"..","data","enm") + FileUtils.rm_rf dir + FileUtils.mkdir_p dir + #get list of bundle URIs bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] File.open(File.join(dir,"bundles.json"),"w+"){|f| f.puts JSON.pretty_generate(bundles)} diff --git a/lib/model.rb b/lib/model.rb index 549cbd2..809dc48 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -106,7 +106,7 @@ module OpenTox else model.algorithms[type] = parameters end - end + end if algorithms # parse dependent_variables from training dataset training_dataset.substances.each do |substance| @@ -249,6 +249,7 @@ module OpenTox elsif neighbor_similarities.size == 1 prediction.merge!({:value => dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]}) else + query_descriptors.collect!{|d| d ? 1 : 0} if independent_variables[0][0].numeric? # call prediction algorithm result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors prediction.merge! result @@ -343,7 +344,7 @@ module OpenTox field :unit, type: String field :model_id, type: BSON::ObjectId field :repeated_crossvalidation_id, type: BSON::ObjectId - field :leave_one_out_validation_id, type: BSON::ObjectId + #field :leave_one_out_validation_id, type: BSON::ObjectId def predict object model.predict object @@ -398,42 +399,28 @@ module OpenTox class NanoPrediction < Prediction - def self.from_json_dump dir, category - Import::Enanomapper.import dir - training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - unless training_dataset - Import::Enanomapper.import File.join(File.dirname(__FILE__),"data","enm") + def self.create training_dataset: nil, prediction_feature:nil, algorithms: nil + + # find/import training_dataset + training_dataset ||= Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + unless training_dataset # try to import from json dump + Import::Enanomapper.import training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + unless training_dataset + Import::Enanomapper.mirror + Import::Enanomapper.import + training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first + bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset + end end - prediction_model = self.new( - :endpoint => "log2(Net cell association)", - :source => "https://data.enanomapper.net/", - :species => "A549 human lung epithelial carcinoma cells", - :unit => "log2(ug/Mg)" - ) - prediction_feature = Feature.where(name: "log2(Net cell association)", category: "TOX").first - model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset) - prediction_model[:model_id] = model.id - repeated_cv = Validation::RepeatedCrossValidation.create model - prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id - #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id - prediction_model.save - prediction_model - end + prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first - def self.create dir: dir, algorithms: algorithms - training_dataset = Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - unless training_dataset - Import::Enanomapper.import dir - training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - end prediction_model = self.new( - :endpoint => "log2(Net cell association)", - :source => "https://data.enanomapper.net/", + :endpoint => prediction_feature.name, + :source => prediction_feature.source, :species => "A549 human lung epithelial carcinoma cells", - :unit => "log2(ug/Mg)" + :unit => prediction_feature.unit ) - prediction_feature = Feature.where(name: "log2(Net cell association)", category: "TOX").first model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms) prediction_model[:model_id] = model.id repeated_cv = Validation::RepeatedCrossValidation.create model diff --git a/lib/similarity.rb b/lib/similarity.rb index 772e812..0901936 100644 --- a/lib/similarity.rb +++ b/lib/similarity.rb @@ -19,6 +19,10 @@ module OpenTox ( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f end + #def self.weighted_tanimoto fingerprints + #( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f + #end + def self.euclid scaled_properties sq = scaled_properties[0].zip(scaled_properties[1]).map{|a,b| (a - b) ** 2} Math.sqrt(sq.inject(0) {|s,c| s + c}) -- cgit v1.2.3 From b6116bc4705066da30668ff3370f3b1c307e44e7 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 11 Nov 2016 13:07:53 +0100 Subject: enm import fixed --- lib/import.rb | 194 ++++++++++++++++++++++++---------------------------------- lib/model.rb | 21 +------ 2 files changed, 83 insertions(+), 132 deletions(-) (limited to 'lib') diff --git a/lib/import.rb b/lib/import.rb index 8f640b1..aa2ee75 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -5,129 +5,95 @@ module OpenTox class Enanomapper include OpenTox - def self.mirror dir=nil - # clean download dir - dir ||= File.join(File.dirname(__FILE__),"..","data","enm") - FileUtils.rm_rf dir - FileUtils.mkdir_p dir - - #get list of bundle URIs + # time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%) + def self.import dir="." + datasets = {} bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] - File.open(File.join(dir,"bundles.json"),"w+"){|f| f.puts JSON.pretty_generate(bundles)} - # bundles - # id/summary - # id/compound - # id/substance - # id/property - bundles.each do |bundle| + datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"]) $logger.debug bundle["title"] nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"] - $logger.debug nanoparticles.size - nanoparticles.each do |nanoparticle| - uuid = nanoparticle["values"]["https://data.enanomapper.net/identifier/uuid"] - $logger.debug uuid - File.open(File.join(dir,"nanoparticle-#{uuid}.json"),"w+"){|f| f.puts JSON.pretty_generate(nanoparticle)} - studies = JSON.parse(RestClientWrapper.get(File.join(nanoparticle["compound"]["URI"],"study")))["study"] - $logger.debug uuid if studies.size < 1 - studies.each do |study| - File.open(File.join(dir,"study-#{study["uuid"]}.json"),"w+"){|f| f.puts JSON.pretty_generate(study)} - end - end - end - end - - def self.import dir="." - start_time = Time.now - t1 = 0 - t2 = 0 - datasets = {} - JSON.parse(File.read(File.join(dir,"bundles.json"))).each do |bundle| - if bundle["id"] == 3 - datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"]) - end - end - # TODO this is only for protein corona - Dir[File.join(dir,"study-F*.json")].each do |s| - t = Time.now - study = JSON.parse(File.read(s)) - np = JSON.parse(File.read(File.join(dir,"nanoparticle-#{study['owner']['substance']['uuid']}.json"))) - core_id = nil - coating_ids = [] - np["composition"].each do |c| - uri = c["component"]["compound"]["URI"] - uri = CGI.escape File.join(uri,"&media=application/json") - data = JSON.parse(RestClientWrapper.get "https://data.enanomapper.net/query/compound/url/all?media=application/json&search=#{uri}") - smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"] - names = [] - names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] - names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"] - if smiles - compound = Compound.find_or_create_by(:smiles => smiles) - compound.names = names.compact - else - compound = Compound.find_or_create_by(:names => names) - end - compound.save - if c["relation"] == "HAS_CORE" - core_id = compound.id.to_s - elsif c["relation"] == "HAS_COATING" - coating_ids << compound.id.to_s + nanoparticles.each_with_index do |np,n| + core_id = nil + coating_ids = [] + np["composition"].each do |c| + uri = c["component"]["compound"]["URI"] + uri = CGI.escape File.join(uri,"&media=application/json") + data = JSON.parse(RestClientWrapper.get "https://data.enanomapper.net/query/compound/url/all?media=application/json&search=#{uri}") + smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"] + names = [] + names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] + names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"] + if smiles + compound = Compound.find_or_create_by(:smiles => smiles) + compound.name = names.first + compound.names = names.compact + else + compound = Compound.find_or_create_by(:name => names.first,:names => names) + end + compound.save + if c["relation"] == "HAS_CORE" + core_id = compound.id.to_s + elsif c["relation"] == "HAS_COATING" + coating_ids << compound.id.to_s + end + end if np["composition"] + nanoparticle = Nanoparticle.find_or_create_by( + :name => np["values"]["https://data.enanomapper.net/identifier/name"], + :source => np["compound"]["URI"], + :core_id => core_id, + :coating_ids => coating_ids + ) + np["bundles"].keys.each do |bundle_uri| + nanoparticle.dataset_ids << datasets[bundle_uri].id end - end if np["composition"] - nanoparticle = Nanoparticle.find_or_create_by( - :name => np["values"]["https://data.enanomapper.net/identifier/name"], - :source => np["compound"]["URI"], - :core_id => core_id, - :coating_ids => coating_ids - ) - np["bundles"].keys.each do |bundle_uri| - nanoparticle.dataset_ids << datasets[bundle_uri].id - end - dataset = datasets[np["bundles"].keys.first] - proteomics_features = {} - category = study["protocol"]["topcategory"] - source = study["protocol"]["category"]["term"] - - study["effects"].each do |effect| - - effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature - effect["conditions"].delete_if { |k, v| v.nil? } - - if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data - - JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step - proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true) - nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset - end - else - name = effect["endpoint"] - unit = effect["result"]["unit"] - warnings = [] - case name - when "Log2 transformed" # use a sensible name - name = "log2(Net cell association)" - warnings = ["Original name was 'Log2 transformed'"] - unit = "log2(mL/ug(Mg))" - when "Total protein (BCA assay)" - category = "P-CHEM" - warnings = ["Category changed from TOX to P-CHEM"] + studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study")))["study"] + studies.each do |study| + dataset = datasets[np["bundles"].keys.first] + proteomics_features = {} + category = study["protocol"]["topcategory"] + source = study["protocol"]["category"]["term"] + study["effects"].each do |effect| + + effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature + effect["conditions"].delete_if { |k, v| v.nil? } + + if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data + + JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step + proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true) + nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset + end + else + name = effect["endpoint"] + unit = effect["result"]["unit"] + warnings = [] + case name + when "Log2 transformed" # use a sensible name + name = "log2(Net cell association)" + warnings = ["Original name was 'Log2 transformed'"] + unit = "log2(mL/ug(Mg))" + when "Total protein (BCA assay)" + category = "P-CHEM" + warnings = ["Category changed from TOX to P-CHEM"] + end + feature = klass.find_or_create_by( + :name => name, + :unit => unit, + :category => category, + :conditions => effect["conditions"], + :source => study["protocol"]["category"]["term"], + :measured => true, + :warnings => warnings + ) + nanoparticle.parse_ambit_value feature, effect["result"], dataset + end end - feature = klass.find_or_create_by( - :name => name, - :unit => unit, - :category => category, - :conditions => effect["conditions"], - :source => study["protocol"]["category"]["term"], - :measured => true, - :warnings => warnings - ) - nanoparticle.parse_ambit_value feature, effect["result"], dataset end + nanoparticle.save + print "#{n}, " end - p nanoparticle - nanoparticle.save end datasets.each { |u,d| d.save } end diff --git a/lib/model.rb b/lib/model.rb index 809dc48..9be0fa0 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -152,10 +152,7 @@ module OpenTox categories.each do |category| Feature.where(category:category).each{|f| feature_ids << f.id.to_s} end - #p feature_ids - #properties = Nanoparticle.all.collect { |s| p s.name; p s.id; p s.properties } properties = model.substances.collect { |s| s.properties } - #p properties property_ids = properties.collect{|p| p.keys}.flatten.uniq model.descriptor_ids = feature_ids & property_ids model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} @@ -223,10 +220,10 @@ module OpenTox prediction[:measurements] << dependent_variables[i] prediction[:warning] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance." else - next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core if fingerprints? neighbor_descriptors = fingerprints[i] else + next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core # necessary for nanoparticle properties predictions neighbor_descriptors = scaled_variables.collect{|v| v[i]} end sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights] @@ -344,7 +341,6 @@ module OpenTox field :unit, type: String field :model_id, type: BSON::ObjectId field :repeated_crossvalidation_id, type: BSON::ObjectId - #field :leave_one_out_validation_id, type: BSON::ObjectId def predict object model.predict object @@ -370,10 +366,6 @@ module OpenTox repeated_crossvalidation.crossvalidations end - def leave_one_out_validation - Validation::LeaveOneOut.find leave_one_out_validation_id - end - def regression? model.is_a? LazarRegression end @@ -390,7 +382,6 @@ module OpenTox model = Lazar.create training_dataset: training_dataset prediction_model[:model_id] = model.id prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id - #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id prediction_model.save prediction_model end @@ -406,12 +397,7 @@ module OpenTox unless training_dataset # try to import from json dump Import::Enanomapper.import training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - unless training_dataset - Import::Enanomapper.mirror - Import::Enanomapper.import - training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset - end + bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset end prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first @@ -424,8 +410,7 @@ module OpenTox model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms) prediction_model[:model_id] = model.id repeated_cv = Validation::RepeatedCrossValidation.create model - prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id - #prediction_model[:leave_one_out_validation_id] = Validation::LeaveOneOut.create(model).id + prediction_model[:repeated_crossvalidation_id] = repeated_cv.id prediction_model.save prediction_model end -- cgit v1.2.3 From 99c42f76b02f9084d0757eb0c52b4a55fa295a95 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 11 Nov 2016 17:19:13 +0100 Subject: p-chem regression and enm import fixed --- lib/crossvalidation.rb | 1 + lib/model.rb | 31 ++++++++++++++----------------- 2 files changed, 15 insertions(+), 17 deletions(-) (limited to 'lib') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index be680ae..5a05955 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -6,6 +6,7 @@ module OpenTox field :folds, type: Integer, default: 10 def self.create model, n=10 + $logger.debug model.algorithms klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification klass = RegressionCrossValidation if model.is_a? Model::LazarRegression bad_request_error "Unknown model class #{model.class}." unless klass diff --git a/lib/model.rb b/lib/model.rb index 9be0fa0..9ed3210 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -102,6 +102,7 @@ module OpenTox parameters.each do |p,v| model.algorithms[type] ||= {} model.algorithms[type][p] = v + model.algorithms[:descriptors].delete :categories if type == :descriptors and p == :type end else model.algorithms[type] = parameters @@ -246,7 +247,7 @@ module OpenTox elsif neighbor_similarities.size == 1 prediction.merge!({:value => dependent_variables.first, :probabilities => nil, :warning => "Only one similar compound in the training set. Predicting its experimental value.", :neighbors => [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]}) else - query_descriptors.collect!{|d| d ? 1 : 0} if independent_variables[0][0].numeric? + query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint" # call prediction algorithm result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors prediction.merge! result @@ -329,7 +330,7 @@ module OpenTox class LazarRegression < Lazar end - class Prediction + class Validation include OpenTox include Mongoid::Document @@ -377,20 +378,16 @@ module OpenTox def self.from_csv_file file metadata_file = file.sub(/csv$/,"json") bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file - prediction_model = self.new JSON.parse(File.read(metadata_file)) + model_validation = self.new JSON.parse(File.read(metadata_file)) training_dataset = Dataset.from_csv_file file model = Lazar.create training_dataset: training_dataset - prediction_model[:model_id] = model.id - prediction_model[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id - prediction_model.save - prediction_model + model_validation[:model_id] = model.id + model_validation[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id + model_validation.save + model_validation end - end - - class NanoPrediction < Prediction - - def self.create training_dataset: nil, prediction_feature:nil, algorithms: nil + def self.from_enanomapper training_dataset: nil, prediction_feature:nil, algorithms: nil # find/import training_dataset training_dataset ||= Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first @@ -401,18 +398,18 @@ module OpenTox end prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first - prediction_model = self.new( + model_validation = self.new( :endpoint => prediction_feature.name, :source => prediction_feature.source, :species => "A549 human lung epithelial carcinoma cells", :unit => prediction_feature.unit ) model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms) - prediction_model[:model_id] = model.id + model_validation[:model_id] = model.id repeated_cv = Validation::RepeatedCrossValidation.create model - prediction_model[:repeated_crossvalidation_id] = repeated_cv.id - prediction_model.save - prediction_model + model_validation[:repeated_crossvalidation_id] = repeated_cv.id + model_validation.save + model_validation end end -- cgit v1.2.3 From 0ddd04c32280e6fd166a52fa6da653df24aabf99 Mon Sep 17 00:00:00 2001 From: gebele Date: Wed, 23 Nov 2016 15:10:58 +0000 Subject: added delog10;generalized mmol2-log10 --- lib/overwrite.rb | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'lib') diff --git a/lib/overwrite.rb b/lib/overwrite.rb index d0422ee..31d30c9 100644 --- a/lib/overwrite.rb +++ b/lib/overwrite.rb @@ -28,6 +28,11 @@ class Float def signif(n) Float("%.#{n}g" % self) end + + # converts -10 logarithmized values back + def delog10 + 10**(-1*self) + end end module Enumerable -- cgit v1.2.3 From 2baffb4a3ebfa2b4a32c0c148bf61a5da89ec210 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 25 Nov 2016 10:36:02 +0100 Subject: algorithms accessor for Model::Validation --- lib/model.rb | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/model.rb b/lib/model.rb index 9ed3210..e8b30ca 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -355,6 +355,10 @@ module OpenTox Lazar.find model_id end + def algorithms + model.algorithms + end + def prediction_feature model.prediction_feature end @@ -404,7 +408,7 @@ module OpenTox :species => "A549 human lung epithelial carcinoma cells", :unit => prediction_feature.unit ) - model = Model::LazarRegression.create(prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms) + model = LazarRegression.create prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms model_validation[:model_id] = model.id repeated_cv = Validation::RepeatedCrossValidation.create model model_validation[:repeated_crossvalidation_id] = repeated_cv.id -- cgit v1.2.3 From 4570f11444bc10da88d849e9a2812e95a8933c8a Mon Sep 17 00:00:00 2001 From: gebele Date: Tue, 6 Dec 2016 09:59:24 +0000 Subject: full class name required --- lib/model.rb | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/model.rb b/lib/model.rb index e8b30ca..38c1915 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -364,7 +364,8 @@ module OpenTox end def repeated_crossvalidation - Validation::RepeatedCrossValidation.find repeated_crossvalidation_id + # full class name required + OpenTox::Validation::RepeatedCrossValidation.find repeated_crossvalidation_id end def crossvalidations @@ -386,7 +387,8 @@ module OpenTox training_dataset = Dataset.from_csv_file file model = Lazar.create training_dataset: training_dataset model_validation[:model_id] = model.id - model_validation[:repeated_crossvalidation_id] = Validation::RepeatedCrossValidation.create(model).id + # full class name required + model_validation[:repeated_crossvalidation_id] = OpenTox::Validation::RepeatedCrossValidation.create(model).id model_validation.save model_validation end -- cgit v1.2.3