summaryrefslogtreecommitdiff
path: root/lib/compound.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/compound.rb')
-rw-r--r--lib/compound.rb148
1 files changed, 83 insertions, 65 deletions
diff --git a/lib/compound.rb b/lib/compound.rb
index 4d32e24..e002305 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -1,12 +1,9 @@
-# TODO: check
-# *** Open Babel Error in ParseFile
-# Could not find contribution data file.
-
CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/"
module OpenTox
class Compound
+ require_relative "unique_descriptors.rb"
include OpenTox
DEFAULT_FINGERPRINT = "MP2D"
@@ -15,34 +12,35 @@ module OpenTox
field :smiles, type: String
field :inchikey, type: String
field :names, type: Array
- field :warning, type: String
field :cid, type: String
field :chemblid, type: String
field :png_id, type: BSON::ObjectId
field :svg_id, type: BSON::ObjectId
field :sdf_id, type: BSON::ObjectId
- field :molecular_weight, type: Float
field :fingerprints, type: Hash, default: {}
field :default_fingerprint_size, type: Integer
+ field :physchem_descriptors, type: Hash, default: {}
+ field :dataset_ids, type: Array, default: []
+ field :features, type: Hash, default: {}
index({smiles: 1}, {unique: true})
# Overwrites standard Mongoid method to create fingerprints before database insertion
+ # @param [Hash] params attributes passed on to find_or_initialize_by
+ # @return [Compound] the found or newly initialized compound, after save has been called on it
def self.find_or_create_by params
compound = self.find_or_initialize_by params
- compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT)
+ # default_fingerprint_size is an Integer field, so store the number of fingerprint entries, not the fingerprint itself
+ compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT).size
compound.save
compound
end
- def fingerprint type="MP2D"
+ def fingerprint type=DEFAULT_FINGERPRINT
unless fingerprints[type]
return [] unless self.smiles
#http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format
if type == "MP2D"
fp = obconversion(smiles,"smi","mpd").strip.split("\t")
name = fp.shift # remove Title
- fingerprints[type] = fp
+ fingerprints[type] = fp.uniq # no fingerprint counts
#http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html
elsif type== "MNA"
level = 2 # TODO: level as parameter, evaluate level 1, see paper
@@ -82,19 +80,60 @@ module OpenTox
fingerprints[type]
end
+ # Calculate physchem descriptor values for this compound and cache them in physchem_descriptors.
+ # Descriptors whose id (as String) is already a key of physchem_descriptors are not recalculated.
+ # The document is saved after the calculation.
+ # @param [Array] descriptors PhysChem descriptor objects, defaults to PhysChem.openbabel_descriptors
+ # @return [Hash] entries of physchem_descriptors (descriptor id as String => value) for the requested descriptors
+ def physchem descriptors=PhysChem.openbabel_descriptors
+   # TODO: speedup java descriptors
+   # BSON::ObjectId instances are not allowed as keys in a BSON document, ids are therefore used as Strings.
+   requested_ids = descriptors.collect{|d| d.id.to_s} # built once, reused for the final selection
+   new_ids = requested_ids - physchem_descriptors.keys
+   descs = {}
+   algos = {}
+   new_ids.each do |id|
+     descriptor = PhysChem.find id
+     descs[[descriptor.library, descriptor.descriptor]] = descriptor
+     algos[descriptor.name] = descriptor
+   end
+   # one calculation per [library, descriptor] pair avoids recalculating Cdk features with multiple values
+   descs.each do |(library, descriptor_name), descriptor|
+     descriptor.send(library.downcase,descriptor_name,self).each do |n,v|
+       algo = algos[n]
+       # NOTE(review): presumably a calculation can return value names that were not requested; skip them instead of failing on nil
+       next unless algo
+       physchem_descriptors[algo.id.to_s] = v
+     end
+   end
+   save
+   physchem_descriptors.select{|id,v| requested_ids.include? id}
+ end
+
+ # Match SMARTS patterns against the SMILES of this compound.
+ # @param [Array] smarts objects that respond to smarts (the SMARTS string)
+ # @param [Boolean] count if true return the size of the match list instead of 1 for matching patterns
+ # @return [Array<Integer>] one value per pattern, 0 if the pattern does not match
+ def smarts_match smarts, count=false
+   converter = OpenBabel::OBConversion.new
+   molecule = OpenBabel::OBMol.new
+   converter.set_in_format('smi')
+   converter.read_string(molecule,self.smiles)
+   pattern = OpenBabel::OBSmartsPattern.new
+   smarts.collect do |sma|
+     pattern.init(sma.smarts)
+     next 0 unless pattern.match(molecule)
+     count ? pattern.get_map_list.to_a.size : 1
+   end
+ end
+
# Create a compound from smiles string
# @example
# compound = OpenTox::Compound.from_smiles("c1ccccc1")
# @param [String] smiles Smiles string
# @return [OpenTox::Compound] Compound
+ # Returns nil and logs a warning if the SMILES string contains whitespace or cannot be converted.
def self.from_smiles smiles
- return nil if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles
+ if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles
+ $logger.warn "SMILES parsing failed for '#{smiles}', SMILES string contains whitespaces."
+ return nil
+ end
+ input_smiles = smiles # keep the caller's string for the log message, smiles is overwritten with the canonical form below
smiles = obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons)
if smiles.empty?
+ $logger.warn "SMILES parsing failed for '#{input_smiles}', this may be caused by an incorrect SMILES string."
return nil
- #Compound.find_or_create_by(:warning => "SMILES parsing failed for '#{smiles}', this may be caused by an incorrect SMILES string.")
else
- #Compound.find_or_create_by :smiles => obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons)
Compound.find_or_create_by :smiles => smiles
end
end
@@ -109,7 +148,7 @@ module OpenTox
#smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip
smiles = obconversion(inchi,"inchi","can")
if smiles.empty?
- Compound.find_or_create_by(:warning => "InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries.")
+ Compound.find_or_create_by(:warnings => ["InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries."])
else
Compound.find_or_create_by(:smiles => smiles, :inchi => inchi)
end
@@ -245,34 +284,19 @@ module OpenTox
+ # Find compounds of a training dataset that are similar to this compound.
+ # Uses db_neighbors for the DEFAULT_FINGERPRINT type, otherwise compares fingerprints in Ruby.
+ # @param [Hash] params requires :type, :training_dataset_id and :min_sim
+ # @return [Array] neighbors; in the Ruby branch Hashes with "_id", "features" and "tanimoto", sorted by descending "tanimoto"
def fingerprint_neighbors params
bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim]
neighbors = []
- #if params[:type] == DEFAULT_FINGERPRINT
- #neighbors = db_neighbors params
- #p neighbors
- #else
+ if params[:type] == DEFAULT_FINGERPRINT
+ neighbors = db_neighbors params
+ else
query_fingerprint = self.fingerprint params[:type]
- training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound|
- unless self == compound
- candidate_fingerprint = compound.fingerprint params[:type]
- sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
- neighbors << [compound.id, sim] if sim >= params[:min_sim]
- end
- end
- #end
- neighbors.sort{|a,b| b.last <=> a.last}
- end
-
- def fminer_neighbors params
- bad_request_error "Incorrect parameters for Compound#fminer_neighbors. Please provide :feature_dataset_id, :min_sim." unless params[:feature_dataset_id] and params[:min_sim]
- feature_dataset = Dataset.find params[:feature_dataset_id]
- query_fingerprint = Algorithm::Descriptor.smarts_match(self, feature_dataset.features)
- neighbors = []
-
- # find neighbors
- feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i|
- sim = Algorithm::Similarity.tanimoto candidate_fingerprint, query_fingerprint
- if sim >= params[:min_sim]
- neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
+ training_dataset = Dataset.find(params[:training_dataset_id])
+ # the first feature of the training dataset is used as prediction feature
+ prediction_feature = training_dataset.features.first
+ training_dataset.compounds.each do |compound|
+ candidate_fingerprint = compound.fingerprint params[:type]
+ # size of the intersection divided by size of the union of both fingerprints
+ sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
+ feature_values = training_dataset.values(compound,prediction_feature)
+ neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim]
end
+ neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]}
end
neighbors
end
@@ -285,13 +309,7 @@ module OpenTox
# TODO implement pearson and cosine similarity separatly
R.assign "x", query_fingerprint
R.assign "y", candidate_fingerprint
- # pearson r
- #sim = R.eval("cor(x,y,use='complete.obs',method='pearson')").to_ruby
- #p "pearson"
- #p sim
- #p "cosine"
sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first
- #p sim
if sim >= params[:min_sim]
neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
end
@@ -300,53 +318,53 @@ module OpenTox
end
+ # Similarity search with a MongoDB aggregation on the "compounds" collection, based on the DEFAULT_FINGERPRINT.
+ # @param [Hash] params reads :min_sim (minimum "tanimoto" value) and :training_dataset_id
+ # @return [Array] aggregation results ("_id", "features", "dataset_ids", "tanimoto") whose dataset_ids include the training dataset id
def db_neighbors params
- p "DB NEIGHBORS"
- p params
- # TODO restrict to dataset
# from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
- qn = fingerprint(params[:type]).size
+
+ #qn = default_fingerprint_size
#qmin = qn * threshold
#qmax = qn / threshold
#not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...)
#reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)]
aggregate = [
#{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}},
- {'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
+ #{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
{'$project' => {
'tanimoto' => {'$let' => {
- 'vars' => {'common' => {'$size' => {'$setIntersection' => ["'$#{DEFAULT_FINGERPRINT}'", DEFAULT_FINGERPRINT]}}},
- 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [qn, '$default_fingerprint_size']}, '$$common']}]}
+ # common = number of fingerprint entries shared by the stored compound and this compound
+ 'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}},
+ #'vars' => {'common' => {'$size' => {'$setIntersection' => ["$default_fingerprint", default_fingerprint]}}},
+ # tanimoto = common / (own fingerprint size + stored fingerprint size - common)
+ 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]}
}},
- '_id' => 1
+ '_id' => 1,
+ 'features' => 1,
+ 'dataset_ids' => 1
}},
{'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}},
{'$sort' => {'tanimoto' => -1}}
]
- $mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] }
+ # the restriction to the training dataset is applied in Ruby, after the aggregation has run
+ $mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]}
end
- # Get mg from mmol
- # @return [Float] value in mg
- def mmol_to_mg mmol
- mmol.to_f*molecular_weight
+ # Convert mmol to mg
+ # @param [#to_f] mmol value in mmol
+ # @return [Float] value in mg
+ def mmol_to_mg mmol
+ mmol.to_f*molecular_weight
end
- def mg_to_mmol mg
- mg.to_f/molecular_weight
+ # Convert mg to mmol
+ # @param [#to_f] mg value in mg
+ # @return [Float] value in mmol
+ def mg_to_mmol mg
+ mg.to_f/molecular_weight
end
# Calculate molecular weight of Compound with OB and store it in object
# @return [Float] molecular weight
+ # The value is obtained through physchem for the PhysChem feature named "Openbabel.MW" and looked up by the feature id as String.
def molecular_weight
- if self["molecular_weight"]==0.0 || self["molecular_weight"].nil?
- update(:molecular_weight => OpenTox::Algorithm::Descriptor.physchem(self, ["Openbabel.MW"]).first)
- end
- self["molecular_weight"]
+ mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW")
+ physchem([mw_feature])[mw_feature.id.to_s]
end
-
private
def self.obconversion(identifier,input_format,output_format,option=nil)