From 1f789133d961c29d3babfaf69cdde3d675288537 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sat, 24 Aug 2019 14:44:52 +0200 Subject: initial refactored version for mutagenicity paper --- lib/compound.rb | 475 +++++++++++++++++++++++++------------------------------- 1 file changed, 213 insertions(+), 262 deletions(-) (limited to 'lib/compound.rb') diff --git a/lib/compound.rb b/lib/compound.rb index 6d0e075..615ea6e 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -1,296 +1,247 @@ -module OpenTox +require 'openbabel' - # Small molecules with defined chemical structures - class Compound < Substance - require_relative "unique_descriptors.rb" - DEFAULT_FINGERPRINT = "MP2D" +# Small molecules with defined chemical structures +class Compound + DEFAULT_FINGERPRINT = "MP2D" - field :inchi, type: String - field :smiles, type: String - field :inchikey, type: String - field :names, type: Array - field :cid, type: String - field :png_id, type: BSON::ObjectId - field :svg_id, type: BSON::ObjectId - field :sdf_id, type: BSON::ObjectId - field :fingerprints, type: Hash, default: {} - field :default_fingerprint_size, type: Integer - - index({smiles: 1}, {unique: true}) - - # Overwrites standard Mongoid method to create fingerprints before database insertion - def self.find_or_create_by params - compound = self.find_or_initialize_by params - compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT).size - compound.save - compound - end + def initialize smiles + @smiles = smiles + @fingerprints = {} + end - # Create chemical fingerprint - # @param [String] fingerprint type - # @return [Array] - def fingerprint type=DEFAULT_FINGERPRINT - unless fingerprints[type] - return [] unless self.smiles - if type == "MP2D" # http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format - fp = obconversion(smiles,"smi","mpd").strip.split("\t") - name = fp.shift # remove Title - fingerprints[type] = fp.uniq # no fingerprint counts - elsif type== "MNA" # http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html - level = 2 # TODO: level as parameter, evaluate level 1, see paper - fp = obconversion(smiles,"smi","mna","xL\"#{level}\"").split("\n") - fp.shift # remove Title - fingerprints[type] = fp - else # standard fingerprints - fp = OpenBabel::OBFingerprint.find_fingerprint(type) - obmol = OpenBabel::OBMol.new - obconversion = OpenBabel::OBConversion.new - obconversion.set_in_format "smi" - obconversion.read_string obmol, self.smiles - result = OpenBabel::VectorUnsignedInt.new - fp.get_fingerprint(obmol,result) - # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i - #p OpenBabel::OBFingerprint.describe_bits(result) - # convert result to a list of the bits that are set - # from openbabel/scripts/python/pybel.py line 830 - # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints - result = result.to_a - bitsperint = OpenBabel::OBFingerprint.getbitsperint() - bits_set = [] - start = 1 - result.each do |x| - i = start - while x > 0 do - bits_set << i if (x % 2) == 1 - x >>= 1 - i += 1 - end - start += bitsperint + # Create chemical fingerprint + # @param [String] fingerprint type + # @return [Array] + def fingerprint type=DEFAULT_FINGERPRINT + unless @fingerprints[type] + if type == "MP2D" # http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format + fp = obconversion(@smiles,"smi","mpd").strip.split("\t") + fp.shift # remove Title + @fingerprints[type] = fp.uniq # no fingerprint counts + elsif type== "MNA" # http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html + level = 2 # TODO: level as parameter, evaluate level 1, see paper + fp = obconversion(@smiles,"smi","mna","xL\"#{level}\"").split("\n") + fp.shift # remove Title + @fingerprints[type] = fp + else # standard fingerprints + fp = OpenBabel::OBFingerprint.find_fingerprint(type) + obmol = OpenBabel::OBMol.new + obconversion = OpenBabel::OBConversion.new + obconversion.set_in_format "smi" + obconversion.read_string obmol, @smiles + result = OpenBabel::VectorUnsignedInt.new + fp.get_fingerprint(obmol,result) + # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i + #p OpenBabel::OBFingerprint.describe_bits(result) + # convert result to a list of the bits that are set + # from openbabel/scripts/python/pybel.py line 830 + # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints + result = result.to_a + bitsperint = OpenBabel::OBFingerprint.getbitsperint() + bits_set = [] + start = 1 + result.each do |x| + i = start + while x > 0 do + bits_set << i if (x % 2) == 1 + x >>= 1 + i += 1 end - fingerprints[type] = bits_set + start += bitsperint end - save + @fingerprints[type] = bits_set end - fingerprints[type] end + @fingerprints[type] + end - # Calculate physchem properties - # @param [Array] list of descriptors - # @return [Array] - def calculate_properties descriptors=PhysChem::OPENBABEL - calculated_ids = properties.keys - # BSON::ObjectId instances are not allowed as keys in a BSON document. - new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids - descs = {} - algos = {} - new_ids.each do |id| - descriptor = PhysChem.find id - descs[[descriptor.library, descriptor.descriptor]] = descriptor - algos[descriptor.name] = descriptor - end - # avoid recalculating Cdk features with multiple values - descs.keys.uniq.each do |k| - descs[k].send(k[0].downcase,k[1],self).each do |n,v| - properties[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. - end +=begin + # Calculate physchem properties + # @param [Array] list of descriptors + # @return [Array] + def calculate_properties descriptors=PhysChem::OPENBABEL + calculated_ids = properties.keys + # BSON::ObjectId instances are not allowed as keys in a BSON document. + new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids + descs = {} + algos = {} + new_ids.each do |id| + descriptor = PhysChem.find id + descs[[descriptor.library, descriptor.descriptor]] = descriptor + algos[descriptor.name] = descriptor + end + # avoid recalculating Cdk features with multiple values + descs.keys.uniq.each do |k| + descs[k].send(k[0].downcase,k[1],self).each do |n,v| + properties[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. end - save - descriptors.collect{|d| properties[d.id.to_s]} end - - # Match a SMARTS substructure - # @param [String] smarts - # @param [TrueClass,FalseClass] count matches or return true/false - # @return [TrueClass,FalseClass,Fixnum] - def smarts_match smarts, count=false - obconversion = OpenBabel::OBConversion.new - obmol = OpenBabel::OBMol.new - obconversion.set_in_format('smi') - obconversion.read_string(obmol,self.smiles) - smarts_pattern = OpenBabel::OBSmartsPattern.new - smarts.collect do |sma| - smarts_pattern.init(sma.smarts) - if smarts_pattern.match(obmol) - count ? value = smarts_pattern.get_map_list.to_a.size : value = 1 - else - value = 0 - end - value + save + descriptors.collect{|d| properties[d.id.to_s]} + end +=end + + # Match a SMARTS substructure + # @param [String] smarts + # @param [TrueClass,FalseClass] count matches or return true/false + # @return [TrueClass,FalseClass,Fixnum] + def smarts_match smarts, count=false + obconversion = OpenBabel::OBConversion.new + obmol = OpenBabel::OBMol.new + obconversion.set_in_format('smi') + obconversion.read_string(obmol,@smiles) + smarts_pattern = OpenBabel::OBSmartsPattern.new + smarts.collect do |sma| + smarts_pattern.init(sma.smarts) + if smarts_pattern.match(obmol) + count ? value = smarts_pattern.get_map_list.to_a.size : value = 1 + else + value = 0 end + value end + end - # Create a compound from smiles string - # @example - # compound = OpenTox::Compound.from_smiles("c1ccccc1") - # @param [String] smiles - # @return [OpenTox::Compound] - def self.from_smiles smiles - return nil if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles - smiles = obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons) - smiles.empty? ? nil : Compound.find_or_create_by(:smiles => smiles) - end - - # Create a compound from InChI string - # @param [String] InChI - # @return [OpenTox::Compound] - def self.from_inchi inchi - smiles = obconversion(inchi,"inchi","can") - smiles.empty? ? nil : Compound.find_or_create_by(:smiles => smiles) - end - - # Create a compound from SDF - # @param [String] SDF - # @return [OpenTox::Compound] - def self.from_sdf sdf - # do not store sdf because it might be 2D - Compound.from_smiles obconversion(sdf,"sdf","can") - end - - # Create a compound from name. Relies on an external service for name lookups. - # @example - # compound = OpenTox::Compound.from_name("Benzene") - # @param [String] name, can be also an InChI/InChiKey, CAS number, etc - # @return [OpenTox::Compound] - def self.from_name name - Compound.from_smiles RestClientWrapper.get(File.join(PUBCHEM_URI,"compound","name",URI.escape(name),"property","CanonicalSMILES","TXT")).chomp - end + # Create a compound from smiles string + # @example + # compound = Lazar::Compound.from_smiles("c1ccccc1") + # @param [String] smiles + # @return [Lazar::Compound] + def self.from_smiles smiles + return nil if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles + @smiles = obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons) + @smiles.empty? ? nil : @smiles + end - # Get InChI - # @return [String] - def inchi - unless self["inchi"] - result = obconversion(smiles,"smi","inchi") - update(:inchi => result.chomp) if result and !result.empty? - end - self["inchi"] - end + # Create a compound from InChI string + # @param [String] InChI + # @return [OpenTox::Compound] + def self.from_inchi inchi + @smiles = obconversion(inchi,"inchi","can") + @smiles.empty? ? nil : @smiles + end - # Get InChIKey - # @return [String] - def inchikey - update(:inchikey => obconversion(smiles,"smi","inchikey")) unless self["inchikey"] - self["inchikey"] - end + # Create a compound from SDF + # @param [String] SDF + # @return [OpenTox::Compound] + def self.from_sdf sdf + # do not store sdf because it might be 2D + Compound.from_smiles obconversion(sdf,"sdf","can") + end - # Get (canonical) smiles - # @return [String] - def smiles - update(:smiles => obconversion(self["smiles"],"smi","can")) unless self["smiles"] - self["smiles"] - end + # Create a compound from name. Relies on an external service for name lookups. + # @example + # compound = OpenTox::Compound.from_name("Benzene") + # @param [String] name, can be also an InChI/InChiKey, CAS number, etc + # @return [OpenTox::Compound] + def self.from_name name + Compound.from_smiles RestClientWrapper.get(File.join(PUBCHEM_URI,"compound","name",URI.escape(name),"property","CanonicalSMILES","TXT")).chomp + end - # Get SDF - # @return [String] - def sdf - if self.sdf_id.nil? - sdf = obconversion(smiles,"smi","sdf") - file = Mongo::Grid::File.new(sdf, :filename => "#{id}.sdf",:content_type => "chemical/x-mdl-sdfile") - sdf_id = $gridfs.insert_one file - update :sdf_id => sdf_id - end - $gridfs.find_one(_id: self.sdf_id).data - end + # Get InChI + # @return [String] + def inchi + obconversion(@smiles,"smi","inchi") + end - # Get SVG image - # @return [image/svg] Image data - def svg - if self.svg_id.nil? - svg = obconversion(smiles,"smi","svg") - file = Mongo::Grid::File.new(svg, :filename => "#{id}.svg", :content_type => "image/svg") - update(:svg_id => $gridfs.insert_one(file)) - end - $gridfs.find_one(_id: self.svg_id).data - end + # Get InChIKey + # @return [String] + def inchikey + obconversion(@smiles,"smi","inchikey") + end - # Get png image - # @example - # image = compound.png - # @return [image/png] Image data - def png - if self.png_id.nil? - png = obconversion(smiles,"smi","_png2") - file = Mongo::Grid::File.new(Base64.encode64(png), :filename => "#{id}.png", :content_type => "image/png") - update(:png_id => $gridfs.insert_one(file)) - end - Base64.decode64($gridfs.find_one(_id: self.png_id).data) - end + # Get SDF + # @return [String] + def sdf + obconversion(smiles,"smi","sdf") + end - # Get all known compound names. Relies on an external service for name lookups. - # @example - # names = compound.names - # @return [Array] - def names - update(:names => RestClientWrapper.get(File.join(PUBCHEM_URI,"compound","smiles",URI.escape(smiles),"synonyms","TXT")).split("\n")) #unless self["names"] - self["names"] - end + # Get SVG image + # @return [image/svg] Image data + def svg + obconversion(smiles,"smi","svg") + end - # Get PubChem Compound Identifier (CID), obtained via REST call to PubChem - # @return [String] - def cid - update(:cid => RestClientWrapper.post(File.join(PUBCHEM_URI, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"] - self["cid"] - end - - # Convert mmol to mg - # @return [Float] value in mg - def mmol_to_mg mmol - mmol.to_f*molecular_weight - end + # Get png image + # @example + # image = compound.png + # @return [image/png] Image data + def png + obconversion(smiles,"smi","_png2") + end - # Convert mg to mmol - # @return [Float] value in mmol - def mg_to_mmol mg - mg.to_f/molecular_weight - end - - # Calculate molecular weight of Compound with OB and store it in compound object - # @return [Float] molecular weight - def molecular_weight - mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW") - calculate_properties([mw_feature]).first - end + # Get all known compound names. Relies on an external service for name lookups. + # @example + # names = compound.names + # @return [Array] + def names + RestClientWrapper.get(File.join(PUBCHEM_URI,"compound","smiles",URI.escape(smiles),"synonyms","TXT")).split("\n") + end - private + # Get PubChem Compound Identifier (CID), obtained via REST call to PubChem + # @return [String] + def cid + RestClientWrapper.post(File.join(PUBCHEM_URI, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip + end + + # Convert mmol to mg + # @return [Float] value in mg + def mmol_to_mg mmol + mmol.to_f*molecular_weight + end - def self.obconversion(identifier,input_format,output_format,option=nil) - obconversion = OpenBabel::OBConversion.new - obconversion.set_options(option, OpenBabel::OBConversion::OUTOPTIONS) if option - obmol = OpenBabel::OBMol.new - obconversion.set_in_and_out_formats input_format, output_format - return nil if identifier.nil? - obconversion.read_string obmol, identifier - case output_format - when /smi|can|inchi/ - obconversion.write_string(obmol).split(/\s/).first - when /sdf/ - # TODO: find disconnected structures - # strip_salts - # separate - obmol.add_hydrogens - builder = OpenBabel::OBBuilder.new - builder.build(obmol) + # Convert mg to mmol + # @return [Float] value in mmol + def mg_to_mmol mg + mg.to_f/molecular_weight + end + + # Calculate molecular weight of Compound with OB and store it in compound object + # @return [Float] molecular weight + def molecular_weight + mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW") + calculate_properties([mw_feature]).first + end - sdf = obconversion.write_string(obmol) + def self.obconversion(identifier,input_format,output_format,option=nil) + obconversion = OpenBabel::OBConversion.new + obconversion.set_options(option, OpenBabel::OBConversion::OUTOPTIONS) if option + obmol = OpenBabel::OBMol.new + obconversion.set_in_and_out_formats input_format, output_format + return nil if identifier.nil? + obconversion.read_string obmol, identifier + case output_format + when /smi|can|inchi/ + obconversion.write_string(obmol).split(/\s/).first + when /sdf/ + # TODO: find disconnected structures + # strip_salts + # separate + obmol.add_hydrogens + builder = OpenBabel::OBBuilder.new + builder.build(obmol) + + sdf = obconversion.write_string(obmol) print sdf + if sdf.match(/.nan/) + + #warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure" + obconversion.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS) + sdf = obconversion.write_string(obmol) if sdf.match(/.nan/) - - #warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure" - obconversion.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS) + #warn "2D generation failed for compound #{identifier}, rendering without coordinates." + obconversion.remove_option("gen2D", OpenBabel::OBConversion::GENOPTIONS) sdf = obconversion.write_string(obmol) - if sdf.match(/.nan/) - #warn "2D generation failed for compound #{identifier}, rendering without coordinates." - obconversion.remove_option("gen2D", OpenBabel::OBConversion::GENOPTIONS) - sdf = obconversion.write_string(obmol) - end end - sdf - else - obconversion.write_string(obmol) end + sdf + else + obconversion.write_string(obmol) end + end - def obconversion(identifier,input_format,output_format,option=nil) - self.class.obconversion(identifier,input_format,output_format,option) - end + def obconversion(identifier,input_format,output_format,option=nil) + self.class.obconversion(identifier,input_format,output_format,option) end + end -- cgit v1.2.3