summary refs log tree commit diff
path: root/lib
diff options
context:
space:
mode:
author helma@in-silico.ch <helma@in-silico.ch> 2018-10-24 18:21:34 +0200
committer helma@in-silico.ch <helma@in-silico.ch> 2018-10-24 18:21:34 +0200
commit 1652fd5df948da7ace622c73d158010add656b9f (patch)
tree 49e7eef3c6cdaaaadc38742e56996aaa145ac3de /lib
parent 9d17895ab9e8cd31e0f32e8e622e13612ea5ff77 (diff)
dataset mapfeature_classes
Diffstat (limited to 'lib')
-rw-r--r-- lib/compound.rb 28
-rw-r--r-- lib/dataset.rb 178
-rw-r--r-- lib/feature.rb 34
-rw-r--r-- lib/lazar.rb 2
-rw-r--r-- lib/model.rb 8
-rw-r--r-- lib/opentox.rb 10
6 files changed, 164 insertions, 96 deletions
diff --git a/lib/compound.rb b/lib/compound.rb
index 22c8575..0714574 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -33,13 +33,11 @@ module OpenTox
def fingerprint type=DEFAULT_FINGERPRINT
unless fingerprints[type]
return [] unless self.smiles
- #http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format
- if type == "MP2D"
+ if type == "MP2D" # http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format
fp = obconversion(smiles,"smi","mpd").strip.split("\t")
name = fp.shift # remove Title
fingerprints[type] = fp.uniq # no fingerprint counts
- #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html
- elsif type== "MNA"
+ elsif type== "MNA" # http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html
level = 2 # TODO: level as parameter, evaluate level 1, see paper
fp = obconversion(smiles,"smi","mna","xL\"#{level}\"").split("\n")
fp.shift # remove Title
@@ -128,17 +126,9 @@ module OpenTox
# @param [String] smiles
# @return [OpenTox::Compound]
def self.from_smiles smiles
- if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles
- warn "SMILES parsing failed for '#{smiles}'', SMILES string contains whitespaces."
- return nil
- end
+ return nil if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles
smiles = obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons)
- if smiles.empty?
- warn "SMILES parsing failed for '#{smiles}'', this may be caused by an incorrect SMILES string."
- return nil
- else
- Compound.find_or_create_by :smiles => smiles
- end
+ smiles.empty? ? nil : Compound.find_or_create_by(:smiles => smiles)
end
# Create a compound from InChI string
@@ -146,11 +136,7 @@ module OpenTox
# @return [OpenTox::Compound]
def self.from_inchi inchi
smiles = obconversion(inchi,"inchi","can")
- if smiles.empty?
- Compound.find_or_create_by(:warnings => ["InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries."])
- else
- Compound.find_or_create_by(:smiles => smiles, :inchi => inchi)
- end
+ smiles.empty? ? nil : Compound.find_or_create_by(:smiles => smiles, :inchi => inchi)
end
# Create a compound from SDF
@@ -328,11 +314,11 @@ module OpenTox
print sdf
if sdf.match(/.nan/)
- warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure"
+ #warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure"
obconversion.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS)
sdf = obconversion.write_string(obmol)
if sdf.match(/.nan/)
- warn "2D generation failed for compound #{identifier}, rendering without coordinates."
+ #warn "2D generation failed for compound #{identifier}, rendering without coordinates."
obconversion.remove_option("gen2D", OpenBabel::OBConversion::GENOPTIONS)
sdf = obconversion.write_string(obmol)
end
diff --git a/lib/dataset.rb b/lib/dataset.rb
index bbb20be..aa66c9f 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -8,6 +8,7 @@ module OpenTox
class Dataset
field :data_entries, type: Hash, default: {}
+ field :source, type: String
field :md5, type: String
# Readers
@@ -52,6 +53,44 @@ module OpenTox
end
end
+ # Get OriginalId feature
+ # @return [OpenTox::OriginalId]
+ def original_id_feature
+ features.select{|f| f.is_a?(OriginalId)}.first
+ end
+
+ # Get original id
+ # @param [OpenTox::Substance] substance
+ # @return [String] original id
+ def original_id substance
+ values(substance,original_id_feature).first
+ end
+
+ # Get OriginalSmiles feature
+ # @return [OpenTox::OriginalSmiles]
+ def original_smiles_feature
+ features.select{|f| f.is_a?(OriginalSmiles)}.first
+ end
+
+ # Get original SMILES
+ # @param [OpenTox::Substance] substance
+ # @return [String] original SMILES
+ def original_smiles substance
+ values(substance,original_smiles_feature).first
+ end
+
+ # Get nominal and numeric bioactivity features
+ # @return [Array<OpenTox::NominalBioActivity,OpenTox::NumericBioActivity>]
+ def bioactivity_features
+ features.select{|f| f.class.to_s.match("BioActivity")}
+ end
+
+ # Get transformed nominal and numeric bioactivity features
+ # @return [Array<OpenTox::TransformedNominalBioActivity,OpenTox::TransformedNumericBioActivity>]
+ def transformed_bioactivity_features
+ features.select{|f| f.class.to_s.match(/Transformed.*BioActivity/)}
+ end
+
# Writers
# Add a value for a given substance and feature
@@ -188,41 +227,38 @@ module OpenTox
# features
feature_names = table.shift.collect{|f| f.strip}
- warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
+ bad_request_error "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
- original_id = nil
if feature_names[0] =~ /ID/i # check ID column
- feature_names.shift
- original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => self.name+".ID")
+ original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => feature_names.shift)
+ else
+ original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => "LineID")
end
+ warnings = Warnings.find_or_create_by(:dataset_id => self.id)
+
compound_format = feature_names.shift
bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
+ original_smiles = OriginalSmiles.create if compound_format.match(/SMILES/i)
+
numeric = []
features = []
# guess feature types
bioactivity = true if feature_names.size == 1
+
feature_names.each_with_index do |f,i|
- original_id ? j = i+2 : j = i+1
+ original_id.name.match(/LineID$/) ? j = i+1 : j = i+2
values = table.collect{|row| val=row[j].to_s.strip; val.blank? ? nil : val }.uniq.compact
types = values.collect{|v| v.numeric? ? true : false}.uniq
feature = nil
if values.size == 0 # empty feature
elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
numeric[i] = true
- if bioactivity
- feature = NumericBioActivity.find_or_create_by(:name => f)
- else
- feature = NumericSubstanceProperty.find_or_create_by(:name => f)
- end
+ bioactivity ? feature = NumericBioActivity.find_or_create_by(:name => f) : feature = NumericSubstanceProperty.find_or_create_by(:name => f)
else
numeric[i] = false
- if bioactivity
- feature = NominalBioActivity.find_or_create_by(:name => f, :accept_values => values.sort)
- else
- feature = NominalSubstanceProperty.find_or_create_by(:name => f, :accept_values => values.sort)
- end
+ bioactivity ? feature = NominalBioActivity.find_or_create_by(:name => f, :accept_values => values.sort) : feature = NominalSubstanceProperty.find_or_create_by(:name => f, :accept_values => values.sort)
end
features << feature if feature
end
@@ -231,32 +267,37 @@ module OpenTox
all_substances = []
table.each_with_index do |vals,i|
- original_id_value = vals.shift.strip if original_id
+ original_id.name.match(/LineID$/) ? original_id_value = i+1 : original_id_value = vals.shift.strip
identifier = vals.shift.strip
begin
case compound_format
when /SMILES/i
substance = Compound.from_smiles(identifier)
+ add substance, original_smiles, identifier
when /InChI/i
substance = Compound.from_inchi(identifier)
end
rescue
substance = nil
end
+
if substance.nil? # compound parsers may return nil
- warn "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored."
+ add substance, original_id, original_id_value
+ add substance, original_smiles, identifier
+ add substance, warnings, "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored."
next
end
+
all_substances << substance
substance.dataset_ids << self.id
substance.dataset_ids.uniq!
substance.save
- add substance, original_id, original_id_value if original_id
+ add substance, original_id, original_id_value
vals.each_with_index do |v,j|
if v.blank?
- warn "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'."
+ add substance, warnings, "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'."
v = nil
elsif numeric[j]
v = v.to_f
@@ -265,13 +306,15 @@ module OpenTox
end
add substance, features[j], v
end
- data_entries[substance.id.to_s] ||= nil #if vals.empty? # no features, eg batch predictions
+ #data_entries[substance.id.to_s] ||= nil #if vals.empty? # no features, eg batch predictions
end
all_substances.duplicates.each do |substance|
positions = []
- all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi}
- warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
+ all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.smiles and c.smiles == substance.smiles}
+ all_substances.select{|s| s.smiles == substance.smiles}.each do |s|
+ add s, warnings, "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
+ end
end
save
end
@@ -280,13 +323,20 @@ module OpenTox
# Convert dataset to csv format including compound smiles as first column, other column headers are feature names
# @return [String]
- def to_csv(inchi=false)
+ def to_csv inchi=false
CSV.generate() do |csv|
compound = substances.first.is_a? Compound
+ id = features.select{|f| f.is_a? OriginalId}.first
+ features.delete(id)
+ original_smiles = features.select{|f| f.is_a? OriginalSmiles}.first
+ features.delete(original_smiles)
+ warning = features.select{|f| f.is_a? Warnings}.first
+ features.delete(warning)
+
if compound
- csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
+ csv << [id.name, inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} + ["OriginalSmiles", "Warnings"]
else
- csv << ["Name"] + features.collect{|f| f.name}
+ csv << [id.name, "Name"] + features.collect{|f| f.name}
end
substances.each do |substance|
if compound
@@ -294,19 +344,10 @@ module OpenTox
else
name = substance.name
end
- nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq
-
- if nr_measurements.size > 1
- warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries."
- else
- (0..nr_measurements.first-1).each do |i|
- row = [name]
- features.each do |f|
- values(substance,f) ? row << values(substance,f)[i] : row << ""
- end
- csv << row
- end
- end
+ row = [values(substance,id).first,name] + features.collect{|f| values(substance,f).join(" ")}
+ row << values(substance,original_smiles).join(" ")
+ row << values(substance,warning).join(" ")
+ csv << row
end
end
end
@@ -332,18 +373,19 @@ module OpenTox
# Merge an array of datasets
# @param [Array] OpenTox::Dataset Array to be merged
- # @param [Hash] feature modifications
- # @param [Hash] value modifications
+ # @param [Array] OpenTox::Feature Array to be merged
# @return [OpenTox::Dataset] merged dataset
- def self.merge datasets, feature_map=nil, value_map=nil
- dataset = self.new(:source => datasets.collect{|d| d.source}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", "))
+ def self.merge datasets, features
+ # TODO warnings
+ features.uniq!
+ dataset = self.create(:source => datasets.collect{|d| d.id.to_s}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", "))
datasets.each do |d|
d.substances.each do |s|
- d.features.each do |f|
+ dataset.add s,d.original_id_feature,d.original_id(s)
+ dataset.add s,d.original_smiles_feature,d.original_smiles(s)
+ features.each do |f|
d.values(s,f).each do |v|
- f = feature_map[f] if feature_map and feature_map[f]
- v = value_map[v] if value_map and value_map[v]
- dataset.add s,f,v #unless dataset.values(s,f).include? v
+ dataset.add s,features.first,v #unless dataset.values(s,f).include? v
end
end
end
@@ -352,6 +394,17 @@ module OpenTox
dataset
end
+ # Copy a dataset
+ # @return [OpenTox::Dataset] dataset copy
+ def copy
+ dataset = Dataset.new
+ dataset.data_entries = data_entries
+ dataset.name = name
+ dataset.source = id.to_s
+ dataset.save
+ dataset
+ end
+
# Split a dataset into n folds
# @param [Integer] number of folds
# @return [Array] Array with folds [training_dataset,test_dataset]
@@ -384,6 +437,19 @@ module OpenTox
end
chunks
end
+
+ # Change nominal feature values
+ # @param [NominalFeature] feature Original feature
+ # @param [Hash] map how to change feature values (original value => new value)
+ def map feature, map
+ dataset = self.copy
+ new_feature = TransformedNominalBioActivity.find_or_create_by(:name => feature.name + " (transformed)", :original_feature_id => feature.id, :transformation => map, :accept_values => map.values.sort)
+ compounds.each do |c|
+ values(c,feature).each { |v| dataset.add c, new_feature, map[v] }
+ end
+ dataset.save
+ dataset
+ end
def transform # TODO
end
@@ -397,9 +463,9 @@ module OpenTox
end
# Dataset for lazar predictions
- class LazarPrediction #< Dataset
+ class LazarPrediction < Dataset
field :creator, type: String
- field :prediction_feature_id, type: BSON::ObjectId
+ #field :prediction_feature_id, type: BSON::ObjectId
field :predictions, type: Hash, default: {}
# Get prediction feature
@@ -408,16 +474,16 @@ module OpenTox
Feature.find prediction_feature_id
end
- # Get all compounds
- # @return [Array<OpenTox::Compound>]
- def compounds
- substances.select{|s| s.is_a? Compound}
+ def prediction compound
end
- # Get all substances
- # @return [Array<OpenTox::Substance>]
- def substances
- predictions.keys.collect{|id| Substance.find id}
+ def probability klass
+ end
+
+ def prediction_interval
+ end
+
+ def predictions
end
end
diff --git a/lib/feature.rb b/lib/feature.rb
index 2c10c26..056957b 100644
--- a/lib/feature.rb
+++ b/lib/feature.rb
@@ -1,32 +1,46 @@
module OpenTox
- # Basic feature class
- class Feature
- end
-
# Original ID (e.g. from CSV input)
class OriginalId < Feature
field :dataset_id, type: BSON::ObjectId
end
- # Feature for categorical variables
+ # Original SMILES (e.g. from CSV input)
+ class OriginalSmiles < Feature
+ field :dataset_id, type: BSON::ObjectId
+ end
+
+ # Warnings
+ class Warnings < Feature
+ field :dataset_id, type: BSON::ObjectId
+ end
+
+ # Categorical variables
class NominalFeature < Feature
field :accept_values, type: Array
end
- # Feature for quantitative variables
+ # Quantitative variables
class NumericFeature < Feature
field :unit, type: String
end
# Nominal biological activity
class NominalBioActivity < NominalFeature
- field :original_feature_id, type: BSON::ObjectId
- field :transformation, type: Hash
end
# Numeric biological activity
class NumericBioActivity < NumericFeature
+ end
+
+ # Transformed nominal biological activity
+ class TransformedNominalBioActivity < NominalFeature
+ field :original_feature_id, type: BSON::ObjectId
+ field :transformation, type: Hash
+ end
+
+ # Transformed numeric biological activity
+ class TransformedNumericBioActivity < NumericFeature
field :original_feature_id, type: BSON::ObjectId
field :transformation, type: String
end
@@ -38,7 +52,6 @@ module OpenTox
end
class LazarPredictionProbability < NominalLazarPrediction
- field :value, type: Float
end
# Numeric lazar prediction
@@ -47,6 +60,9 @@ module OpenTox
field :training_feature_id, type: BSON::ObjectId
end
+ class LazarConfidenceInterval < NumericLazarPrediction
+ end
+
class NominalSubstanceProperty < NominalFeature
end
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 13ad1f8..7e813e4 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -72,7 +72,7 @@ PUBCHEM_URI = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/"
CHEMBL_URI = "https://www.ebi.ac.uk/chembl/api/data/molecule/"
# OpenTox classes and includes
-CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
+CLASSES = ["Feature","Substance","Dataset","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation"]# Algorithm and Models are modules
[ # be aware of the require sequence as it affects class/method overwrites
"overwrite.rb",
diff --git a/lib/model.rb b/lib/model.rb
index 7ee50fe..9858949 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -199,7 +199,6 @@ module OpenTox
# @return [Hash]
def predict_substance substance, threshold = self.algorithms[:similarity][:min]
- #p substance.smiles
t = Time.now
@independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
case algorithms[:similarity][:method]
@@ -286,7 +285,6 @@ module OpenTox
else # try again with a lower threshold
predict_substance substance, 0.2
end
- #p prediction
#p Time.now - t
prediction
end
@@ -330,11 +328,12 @@ module OpenTox
elsif object.is_a? Array
return predictions
elsif object.is_a? Dataset
+ warning_feature = InfoFeature.find_or_create_by(:name => "Warnings")
if prediction_feature.is_a? NominalBioActivity
f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id)
probability_features = {}
prediction_feature.accept_values.each do |v|
- probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => "probability(#{v})", :accept_values => prediction_feature.accept_values, :value => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
+ probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
end
elsif prediction_feature.is_a? NumericBioActivity
f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id)
@@ -344,10 +343,11 @@ module OpenTox
d = Dataset.new(:name => object.name)
# add predictions to dataset
predictions.each do |substance_id,p|
- d.warnings += p[:warnings]
+ d.add substance_id,warning_feature,p[:warnings].join(" ") if p[:warnings]
unless p[:value].nil?
d.add substance_id,f,p[:value]
p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p}
+ # TODO prediction interval
end
end
d.save
diff --git a/lib/opentox.rb b/lib/opentox.rb
index 03d65b0..9cc8260 100644
--- a/lib/opentox.rb
+++ b/lib/opentox.rb
@@ -11,13 +11,13 @@ module OpenTox
include Mongoid::Timestamps
store_in collection: klass.downcase.pluralize
field :name, type: String
- field :source, type: String
- field :warnings, type: Array, default: []
+ #field :source, type: String
+ #field :warnings, type: Array, default: []
- def warn warning
+# def warn warning
#$logger.warn warning
- warnings << warning
- end
+# warnings << warning
+# end
end
OpenTox.const_set klass,c
end