summary | refs | log | tree | commit | diff
path: root/lib
diff options
context:
space:
mode:
author: Christoph Helma <helma@in-silico.ch> 2016-05-06 12:49:28 +0200
committer: Christoph Helma <helma@in-silico.ch> 2016-05-06 12:49:28 +0200
commit: 51f57e2858b60bed74ebcc97189b2188c900c283 (patch)
tree: a3be50b410e45fad3f33e956bb302c66e0370226 /lib
parent: ab7b37541b4f8a762be737009631d3eefd898b4a (diff)
dataset tests cleanup
Diffstat (limited to 'lib')
-rw-r--r-- lib/compound.rb 7
-rw-r--r-- lib/dataset.rb 39
-rw-r--r-- lib/lazar.rb 1
-rw-r--r-- lib/model.rb 4
-rw-r--r-- lib/nanoparticle.rb 6
-rw-r--r-- lib/regression.rb 6
-rw-r--r-- lib/substance.rb 2
7 files changed, 36 insertions, 29 deletions
diff --git a/lib/compound.rb b/lib/compound.rb
index 143c4f2..6cb7f78 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -22,6 +22,7 @@ module OpenTox
# Overwrites standard Mongoid method to create fingerprints before database insertion
def self.find_or_create_by params
+ #PhysChem.descriptors # load descriptor features
compound = self.find_or_initialize_by params
compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT).size
compound.save
@@ -77,7 +78,7 @@ module OpenTox
def physchem descriptors=PhysChem.openbabel_descriptors
# TODO: speedup java descriptors
- calculated_ids = physchem.keys
+ calculated_ids = physchem_descriptors.keys
# BSON::ObjectId instances are not allowed as keys in a BSON document.
new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids
descs = {}
@@ -90,11 +91,11 @@ module OpenTox
# avoid recalculating Cdk features with multiple values
descs.keys.uniq.each do |k|
descs[k].send(k[0].downcase,k[1],self).each do |n,v|
- physchem[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document.
+ physchem_descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document.
end
end
save
- physchem.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id}
+ physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id}
end
def smarts_match smarts, count=false
diff --git a/lib/dataset.rb b/lib/dataset.rb
index b51d74b..9b24440 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -30,19 +30,11 @@ module OpenTox
@features
end
- # Find data entry values for a given compound and feature
- # @param compound [OpenTox::Compound] OpenTox Compound object
- # @param feature [OpenTox::Feature] OpenTox Feature object
- # @return [Array] Data entry values
- #def values(compound, feature)
- #data_entries[compound.id.to_s][feature.id.to_s]
- #end
-
# Writers
# Set compounds
def compounds=(compounds)
- self.substance_ids = compounds.collect{|c| c.id}
+ self.substance_ids = compounds.collect{|c| c.id}.uniq
end
# Set features
@@ -95,14 +87,27 @@ module OpenTox
csv << ["Name"] + features.collect{|f| f.name}
end
substances.each do |substance|
- features.each do |f|
- substance.toxicities[f.id.to_s].each do |v|
- if compound
- csv << [inchi ? substance.inchi : substance.smiles , v]
- else
- csv << [substance.name , v]
+ if compound
+ name = (inchi ? substance.inchi : substance.smiles)
+ else
+ name = substance.name
+ end
+ nr_measurements = features.collect{|f| substance.toxicities[f.id.to_s].size if substance.toxicities[f.id.to_s]}.compact.uniq
+
+ if nr_measurements.size > 1
+ warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries."
+ else
+ (0..nr_measurements.first-1).each do |i|
+ row = [name]
+ features.each do |f|
+ if substance.toxicities[f.id.to_s]
+ row << substance.toxicities[f.id.to_s][i]
+ else
+ row << ""
+ end
end
- end if substance.toxicities[f.id.to_s]
+ csv << row
+ end
end
end
end
@@ -224,6 +229,8 @@ module OpenTox
compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi}
warn "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
end
+ substance_ids.uniq!
+ feature_ids.uniq!
$logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
time = Time.now
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 8eb46e0..8daaaa1 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -86,4 +86,3 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","Cross
"experiment.rb",
"import.rb",
].each{ |f| require_relative f }
-OpenTox::PhysChem.descriptors # load descriptor features
diff --git a/lib/model.rb b/lib/model.rb
index 12abc6e..841ab20 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -47,9 +47,9 @@ module OpenTox
end
end
R.assign "tox", toxicities
- feature_ids = training_dataset.substances.collect{ |s| s["physchem"].keys}.flatten.uniq
+ feature_ids = training_dataset.substances.collect{ |s| s["physchem_descriptors"].keys}.flatten.uniq
feature_ids.each do |feature_id|
- feature_values = substances.collect{|s| s["physchem"][feature_id]}
+ feature_values = substances.collect{|s| s["physchem_descriptors"][feature_id]}
R.assign "feature", feature_values
begin
#R.eval "cor <- cor.test(-log(tox),-log(feature),use='complete')"
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
index c9fbb77..9bf419d 100644
--- a/lib/nanoparticle.rb
+++ b/lib/nanoparticle.rb
@@ -15,9 +15,9 @@ module OpenTox
def add_feature feature, value
case feature.category
when "P-CHEM"
- physchem[feature.id.to_s] ||= []
- physchem[feature.id.to_s] << value
- physchem[feature.id.to_s].uniq!
+ physchem_descriptors[feature.id.to_s] ||= []
+ physchem_descriptors[feature.id.to_s] << value
+ physchem_descriptors[feature.id.to_s].uniq!
when "Proteomics"
proteomics[feature.id.to_s] ||= []
proteomics[feature.id.to_s] << value
diff --git a/lib/regression.rb b/lib/regression.rb
index fe45f99..d2c4e91 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -84,7 +84,7 @@ module OpenTox
activities = []
weights = []
- pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem.keys}.flatten.uniq
+ pc_ids = neighbors.collect{|n| Substance.find(n["_id"]).physchem_descriptors.keys}.flatten.uniq
data_frame = []
data_frame[0] = []
@@ -93,7 +93,7 @@ module OpenTox
n["toxicities"][params[:prediction_feature_id].to_s].each do |act|
data_frame[0][i] = act
n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ?
- neighbor.physchem.each do |pid,values|
+ neighbor.physchem_descriptors.each do |pid,values|
values.uniq!
warn "More than one value for '#{Feature.find(pid).name}': #{values.join(', ')}. Using the median." unless values.size == 1
j = pc_ids.index(pid)+1
@@ -121,7 +121,7 @@ module OpenTox
return result
else
query_descriptors = pc_ids.collect do |i|
- compound.physchem[i] ? compound.physchem_descriptors[i].for_R : "NA"
+ compound.physchem_descriptors[i] ? compound.physchem_descriptors[i].for_R : "NA"
end
remove_idx = []
query_descriptors.each_with_index do |v,i|
diff --git a/lib/substance.rb b/lib/substance.rb
index 34bc94a..82ca65d 100644
--- a/lib/substance.rb
+++ b/lib/substance.rb
@@ -1,7 +1,7 @@
module OpenTox
class Substance
- field :physchem, type: Hash, default: {}
+ field :physchem_descriptors, type: Hash, default: {}
field :toxicities, type: Hash, default: {}
field :dataset_ids, type: Array, default: []
end