From 51f57e2858b60bed74ebcc97189b2188c900c283 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 6 May 2016 12:49:28 +0200 Subject: dataset tests cleanup --- lib/compound.rb | 7 ++++--- lib/dataset.rb | 39 +++++++++++++++++++++++---------------- lib/lazar.rb | 1 - lib/model.rb | 4 ++-- lib/nanoparticle.rb | 6 +++--- lib/regression.rb | 6 +++--- lib/substance.rb | 2 +- 7 files changed, 36 insertions(+), 29 deletions(-) (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index 143c4f2..6cb7f78 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -22,6 +22,7 @@ module OpenTox # Overwrites standard Mongoid method to create fingerprints before database insertion def self.find_or_create_by params + #PhysChem.descriptors # load descriptor features compound = self.find_or_initialize_by params compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT).size compound.save @@ -77,7 +78,7 @@ module OpenTox def physchem descriptors=PhysChem.openbabel_descriptors # TODO: speedup java descriptors - calculated_ids = physchem.keys + calculated_ids = physchem_descriptors.keys # BSON::ObjectId instances are not allowed as keys in a BSON document. new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids descs = {} @@ -90,11 +91,11 @@ module OpenTox # avoid recalculating Cdk features with multiple values descs.keys.uniq.each do |k| descs[k].send(k[0].downcase,k[1],self).each do |n,v| - physchem[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. + physchem_descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. end end save - physchem.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} + physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} end def smarts_match smarts, count=false diff --git a/lib/dataset.rb b/lib/dataset.rb index b51d74b..9b24440 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -30,19 +30,11 @@ module OpenTox @features end - # Find data entry values for a given compound and feature - # @param compound [OpenTox::Compound] OpenTox Compound object - # @param feature [OpenTox::Feature] OpenTox Feature object - # @return [Array] Data entry values - #def values(compound, feature) - #data_entries[compound.id.to_s][feature.id.to_s] - #end - # Writers # Set compounds def compounds=(compounds) - self.substance_ids = compounds.collect{|c| c.id} + self.substance_ids = compounds.collect{|c| c.id}.uniq end # Set features @@ -95,14 +87,27 @@ module OpenTox csv << ["Name"] + features.collect{|f| f.name} end substances.each do |substance| - features.each do |f| - substance.toxicities[f.id.to_s].each do |v| - if compound - csv << [inchi ? substance.inchi : substance.smiles , v] - else - csv << [substance.name , v] + if compound + name = (inchi ? substance.inchi : substance.smiles) + else + name = substance.name + end + nr_measurements = features.collect{|f| substance.toxicities[f.id.to_s].size if substance.toxicities[f.id.to_s]}.compact.uniq + + if nr_measurements.size > 1 + warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries." + else + (0..nr_measurements.first-1).each do |i| + row = [name] + features.each do |f| + if substance.toxicities[f.id.to_s] + row << substance.toxicities[f.id.to_s][i] + else + row << "" + end end - end if substance.toxicities[f.id.to_s] + csv << row + end end end end @@ -224,6 +229,8 @@ module OpenTox compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi} warn "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." end + substance_ids.uniq! + feature_ids.uniq! $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})" time = Time.now diff --git a/lib/lazar.rb b/lib/lazar.rb index 8eb46e0..8daaaa1 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -86,4 +86,3 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","Cross "experiment.rb", "import.rb", ].each{ |f| require_relative f } -OpenTox::PhysChem.descriptors # load descriptor features diff --git a/lib/model.rb b/lib/model.rb index 12abc6e..841ab20 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -47,9 +47,9 @@ module OpenTox end end R.assign "tox", toxicities - feature_ids = training_dataset.substances.collect{ |s| s["physchem"].keys}.flatten.uniq + feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq feature_ids.each do |feature_id| - feature_values = substances.collect{|s| s["physchem"][feature_id]} + feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id]} R.assign "feature", feature_values begin #R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')" diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb index c9fbb77..9bf419d 100644 --- a/lib/nanoparticle.rb +++ b/lib/nanoparticle.rb @@ -15,9 +15,9 @@ module OpenTox def add_feature feature, value case feature.category when "P-CHEM" - physchem[feature.id.to_s] ||= [] - physchem[feature.id.to_s] << value - physchem[feature.id.to_s].uniq! + physchem_descriptors[feature.id.to_s] ||= [] + physchem_descriptors[feature.id.to_s] << value + physchem_descriptors[feature.id.to_s].uniq! when "Proteomics" proteomics[feature.id.to_s] ||= [] proteomics[feature.id.to_s] << value diff --git a/lib/regression.rb b/lib/regression.rb index fe45f99..d2c4e91 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -84,7 +84,7 @@ module OpenTox activities = [] weights = [] - pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem.keys}.flatten.uniq + pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq data_frame = [] data_frame[0] = [] @@ -93,7 +93,7 @@ module OpenTox n["toxicities"][params[:prediction_feature_id].to_s].each do |act| data_frame[0][i] = act n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? - neighbor.physchem.each do |pid,values| + neighbor.physchem_descriptors.each do |pid,values| values.uniq! warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1 j = pc_ids.index(pid)+1 @@ -121,7 +121,7 @@ module OpenTox return result else query_descriptors = pc_ids.collect do |i| - compound.physchem[i] ? compound.physchem_descriptors[i].for_R : "NA" + compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA" end remove_idx = [] query_descriptors.each_with_index do |v,i| diff --git a/lib/substance.rb b/lib/substance.rb index 34bc94a..82ca65d 100644 --- a/lib/substance.rb +++ b/lib/substance.rb @@ -1,7 +1,7 @@ module OpenTox class Substance - field :physchem, type: Hash, default: {} + field :physchem_descriptors, type: Hash, default: {} field :toxicities, type: Hash, default: {} field :dataset_ids, type: Array, default: [] end -- cgit v1.2.3