summaryrefslogtreecommitdiff
path: root/lib/compound.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/compound.rb')
-rw-r--r--lib/compound.rb370
1 files changed, 370 insertions, 0 deletions
diff --git a/lib/compound.rb b/lib/compound.rb
new file mode 100644
index 0000000..7a3dc5c
--- /dev/null
+++ b/lib/compound.rb
@@ -0,0 +1,370 @@
+# TODO: check
+# *** Open Babel Error in ParseFile
+# Could not find contribution data file.
+
+CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/"
+
+module OpenTox
+
+ class Compound
+ include OpenTox
+
+ DEFAULT_FINGERPRINT = "MP2D"
+
+ field :inchi, type: String
+ field :smiles, type: String
+ field :inchikey, type: String
+ field :names, type: Array
+ field :warning, type: String
+ field :cid, type: String
+ field :chemblid, type: String
+ field :png_id, type: BSON::ObjectId
+ field :svg_id, type: BSON::ObjectId
+ field :sdf_id, type: BSON::ObjectId
+ field :fingerprints, type: Hash, default: {}
+ field :default_fingerprint_size, type: Integer
+
+ index({smiles: 1}, {unique: true})
+
+ # Overwrites standard Mongoid method to create fingerprints before database insertion
+ def self.find_or_create_by params
+ compound = self.find_or_initialize_by params
+ compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT)
+ compound.save
+ compound
+ end
+
+ def fingerprint type="MP2D"
+ unless fingerprints[type]
+ return [] unless self.smiles
+ #http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format
+ if type == "MP2D"
+ fp = obconversion(smiles,"smi","mpd").strip.split("\t")
+ name = fp.shift # remove Title
+ fingerprints[type] = fp
+ #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html
+ elsif type== "MNA"
+ level = 2 # TODO: level as parameter, evaluate level 1, see paper
+ fp = obconversion(smiles,"smi","mna","xL\"#{level}\"").split("\n")
+ fp.shift # remove Title
+ fingerprints[type] = fp
+ else # standard fingerprints
+ fp = OpenBabel::OBFingerprint.find_fingerprint(type)
+ obmol = OpenBabel::OBMol.new
+ obconversion = OpenBabel::OBConversion.new
+ obconversion.set_in_format "smi"
+ obconversion.read_string obmol, self.smiles
+ result = OpenBabel::VectorUnsignedInt.new
+ fp.get_fingerprint(obmol,result)
+ # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i
+ #p OpenBabel::OBFingerprint.describe_bits(result)
+ # convert result to a list of the bits that are set
+ # from openbabel/scripts/python/pybel.py line 830
+ # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints
+ result = result.to_a
+ bitsperint = OpenBabel::OBFingerprint.getbitsperint()
+ bits_set = []
+ start = 1
+ result.each do |x|
+ i = start
+ while x > 0 do
+ bits_set << i if (x % 2) == 1
+ x >>= 1
+ i += 1
+ end
+ start += bitsperint
+ end
+ fingerprints[type] = bits_set
+ end
+ save
+ end
+ fingerprints[type]
+ end
+
+ # Create a compound from smiles string
+ # @example
+ # compound = OpenTox::Compound.from_smiles("c1ccccc1")
+ # @param [String] smiles Smiles string
+ # @return [OpenTox::Compound] Compound
+ def self.from_smiles smiles
+ smiles = obconversion(smiles,"smi","can")
+ if smiles.empty?
+ return nil
+ #Compound.find_or_create_by(:warning => "SMILES parsing failed for '#{smiles}', this may be caused by an incorrect SMILES string.")
+ else
+ Compound.find_or_create_by :smiles => obconversion(smiles,"smi","can")
+ end
+ end
+
+ # Create a compound from inchi string
+ # @param inchi [String] smiles InChI string
+ # @return [OpenTox::Compound] Compound
+ def self.from_inchi inchi
+ # Temporary workaround for OpenBabels Inchi bug
+ # http://sourceforge.net/p/openbabel/bugs/957/
+ # bug has not been fixed in latest git/development version
+ #smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip
+ smiles = obconversion(inchi,"inchi","can")
+ if smiles.empty?
+ Compound.find_or_create_by(:warning => "InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries.")
+ else
+ Compound.find_or_create_by(:smiles => smiles, :inchi => inchi)
+ end
+ end
+
+ # Create a compound from sdf string
+ # @param sdf [String] smiles SDF string
+ # @return [OpenTox::Compound] Compound
+ def self.from_sdf sdf
+ # do not store sdf because it might be 2D
+ Compound.from_smiles obconversion(sdf,"sdf","can")
+ end
+
+ # Create a compound from name. Relies on an external service for name lookups.
+ # @example
+ # compound = OpenTox::Compound.from_name("Benzene")
+ # @param name [String] can be also an InChI/InChiKey, CAS number, etc
+ # @return [OpenTox::Compound] Compound
+ def self.from_name name
+ Compound.from_smiles RestClientWrapper.get(File.join(CACTUS_URI,URI.escape(name),"smiles"))
+ end
+
+ # Get InChI
+ # @return [String] InChI string
+ def inchi
+ unless self["inchi"]
+
+ result = obconversion(smiles,"smi","inchi")
+ #result = `echo "#{self.smiles}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -ismi - -oinchi`.chomp
+ update(:inchi => result.chomp) if result and !result.empty?
+ end
+ self["inchi"]
+ end
+
+ # Get InChIKey
+ # @return [String] InChIKey string
+ def inchikey
+ update(:inchikey => obconversion(smiles,"smi","inchikey")) unless self["inchikey"]
+ self["inchikey"]
+ end
+
+ # Get (canonical) smiles
+ # @return [String] Smiles string
+ def smiles
+ update(:smiles => obconversion(self["smiles"],"smi","can")) unless self["smiles"]
+ self["smiles"]
+ end
+
+ # Get sdf
+ # @return [String] SDF string
+ def sdf
+ if self.sdf_id.nil?
+ sdf = obconversion(smiles,"smi","sdf")
+ file = Mongo::Grid::File.new(sdf, :filename => "#{id}.sdf",:content_type => "chemical/x-mdl-sdfile")
+ sdf_id = $gridfs.insert_one file
+ update :sdf_id => sdf_id
+ end
+ $gridfs.find_one(_id: self.sdf_id).data
+ end
+
+ # Get SVG image
+ # @return [image/svg] Image data
+ def svg
+ if self.svg_id.nil?
+ svg = obconversion(smiles,"smi","svg")
+ file = Mongo::Grid::File.new(svg, :filename => "#{id}.svg", :content_type => "image/svg")
+ update(:svg_id => $gridfs.insert_one(file))
+ end
+ $gridfs.find_one(_id: self.svg_id).data
+
+ end
+
+ # Get png image
+ # @example
+ # image = compound.png
+ # @return [image/png] Image data
+ def png
+ if self.png_id.nil?
+ png = obconversion(smiles,"smi","_png2")
+ file = Mongo::Grid::File.new(Base64.encode64(png), :filename => "#{id}.png", :content_type => "image/png")
+ update(:png_id => $gridfs.insert_one(file))
+ end
+ Base64.decode64($gridfs.find_one(_id: self.png_id).data)
+
+ end
+
+ # Get all known compound names. Relies on an external service for name lookups.
+ # @example
+ # names = compound.names
+ # @return [String] Compound names
+ def names
+ update(:names => RestClientWrapper.get("#{CACTUS_URI}#{inchi}/names").split("\n")) unless self["names"]
+ self["names"]
+ end
+
+ # @return [String] PubChem Compound Identifier (CID), derieved via restcall to pubchem
+ def cid
+ pug_uri = "http://pubchem.ncbi.nlm.nih.gov/rest/pug/"
+ update(:cid => RestClientWrapper.post(File.join(pug_uri, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"]
+ self["cid"]
+ end
+
+ # @return [String] ChEMBL database compound id, derieved via restcall to chembl
+ def chemblid
+ # https://www.ebi.ac.uk/chembldb/ws#individualCompoundByInChiKey
+ uri = "http://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json"
+ update(:chemblid => JSON.parse(RestClientWrapper.get(uri))["compounds"].first["chemblId"]) unless self["chemblid"]
+ self["chemblid"]
+ end
+
+ def fingerprint_count_neighbors params
+ # TODO fix
+ neighbors = []
+ query_fingerprint = self.fingerprint params[:type]
+ training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound|
+ unless self == compound
+ candidate_fingerprint = compound.fingerprint params[:type]
+ features = (query_fingerprint + candidate_fingerprint).uniq
+ min_sum = 0
+ max_sum = 0
+ features.each do |f|
+ min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax
+ min_sum += min
+ max_sum += max
+ end
+ max_sum == 0 ? sim = 0 : sim = min_sum/max_sum.to_f
+ neighbors << [compound.id, sim] if sim and sim >= params[:min_sim]
+ end
+ end
+ neighbors.sort{|a,b| b.last <=> a.last}
+ end
+
+ def fingerprint_neighbors params
+ bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim]
+ neighbors = []
+ #if params[:type] == DEFAULT_FINGERPRINT
+ #neighbors = db_neighbors params
+ #p neighbors
+ #else
+ query_fingerprint = self.fingerprint params[:type]
+ training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound|
+ unless self == compound
+ candidate_fingerprint = compound.fingerprint params[:type]
+ sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
+ neighbors << [compound.id, sim] if sim >= params[:min_sim]
+ end
+ end
+ #end
+ neighbors.sort{|a,b| b.last <=> a.last}
+ end
+
+ def fminer_neighbors params
+ bad_request_error "Incorrect parameters for Compound#fminer_neighbors. Please provide :feature_dataset_id, :min_sim." unless params[:feature_dataset_id] and params[:min_sim]
+ feature_dataset = Dataset.find params[:feature_dataset_id]
+ query_fingerprint = Algorithm::Descriptor.smarts_match(self, feature_dataset.features)
+ neighbors = []
+
+ # find neighbors
+ feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i|
+ sim = Algorithm::Similarity.tanimoto candidate_fingerprint, query_fingerprint
+ if sim >= params[:min_sim]
+ neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
+ end
+ end
+ neighbors
+ end
+
+ def physchem_neighbors params
+ feature_dataset = Dataset.find params[:feature_dataset_id]
+ query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors]
+ neighbors = []
+ feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i|
+ # TODO implement pearson and cosine similarity separatly
+ R.assign "x", query_fingerprint
+ R.assign "y", candidate_fingerprint
+ # pearson r
+ #sim = R.eval("cor(x,y,use='complete.obs',method='pearson')").to_ruby
+ #p "pearson"
+ #p sim
+ #p "cosine"
+ sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first
+ #p sim
+ if sim >= params[:min_sim]
+ neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
+ end
+ end
+ neighbors
+ end
+
+ def db_neighbors params
+ p "DB NEIGHBORS"
+ p params
+ # TODO restrict to dataset
+ # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
+ qn = fingerprint(params[:type]).size
+ #qmin = qn * threshold
+ #qmax = qn / threshold
+ #not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...)
+ #reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)]
+ aggregate = [
+ #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}},
+ {'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
+ {'$project' => {
+ 'tanimoto' => {'$let' => {
+ 'vars' => {'common' => {'$size' => {'$setIntersection' => ["'$#{DEFAULT_FINGERPRINT}'", DEFAULT_FINGERPRINT]}}},
+ 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [qn, '$default_fingerprint_size']}, '$$common']}]}
+ }},
+ '_id' => 1
+ }},
+ {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}},
+ {'$sort' => {'tanimoto' => -1}}
+ ]
+
+ $mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] }
+
+ end
+
+ private
+
+ def self.obconversion(identifier,input_format,output_format,option=nil)
+ obconversion = OpenBabel::OBConversion.new
+ obconversion.set_options(option, OpenBabel::OBConversion::OUTOPTIONS) if option
+ obmol = OpenBabel::OBMol.new
+ obconversion.set_in_and_out_formats input_format, output_format
+ return nil if identifier.nil?
+ obconversion.read_string obmol, identifier
+ case output_format
+ when /smi|can|inchi/
+ obconversion.write_string(obmol).gsub(/\s/,'').chomp
+ when /sdf/
+ # TODO: find disconnected structures
+ # strip_salts
+ # separate
+ obmol.add_hydrogens
+ builder = OpenBabel::OBBuilder.new
+ builder.build(obmol)
+
+ sdf = obconversion.write_string(obmol)
+print sdf
+ if sdf.match(/.nan/)
+
+ $logger.warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure"
+ obconversion.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS)
+ sdf = obconversion.write_string(obmol)
+ if sdf.match(/.nan/)
+ $logger.warn "2D generation failed for compound #{identifier}, rendering without coordinates."
+ obconversion.remove_option("gen2D", OpenBabel::OBConversion::GENOPTIONS)
+ sdf = obconversion.write_string(obmol)
+ end
+ end
+ sdf
+ else
+ obconversion.write_string(obmol)
+ end
+ end
+
+ def obconversion(identifier,input_format,output_format,option=nil)
+ self.class.obconversion(identifier,input_format,output_format,option)
+ end
+ end
+end