diff options
Diffstat (limited to 'lib/compound.rb')
-rw-r--r-- | lib/compound.rb | 80 |
1 files changed, 11 insertions, 69 deletions
diff --git a/lib/compound.rb b/lib/compound.rb index bfe69e3..6d0e075 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -1,5 +1,3 @@ -CACTUS_URI="https://cactus.nci.nih.gov/chemical/structure/" - module OpenTox # Small molecules with defined chemical structures @@ -12,7 +10,6 @@ module OpenTox field :inchikey, type: String field :names, type: Array field :cid, type: String - field :chemblid, type: String field :png_id, type: BSON::ObjectId field :svg_id, type: BSON::ObjectId field :sdf_id, type: BSON::ObjectId @@ -35,13 +32,11 @@ module OpenTox def fingerprint type=DEFAULT_FINGERPRINT unless fingerprints[type] return [] unless self.smiles - #http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format - if type == "MP2D" + if type == "MP2D" # http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format fp = obconversion(smiles,"smi","mpd").strip.split("\t") name = fp.shift # remove Title fingerprints[type] = fp.uniq # no fingerprint counts - #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html - elsif type== "MNA" + elsif type== "MNA" # http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html level = 2 # TODO: level as parameter, evaluate level 1, see paper fp = obconversion(smiles,"smi","mna","xL\"#{level}\"").split("\n") fp.shift # remove Title @@ -130,30 +125,17 @@ module OpenTox # @param [String] smiles # @return [OpenTox::Compound] def self.from_smiles smiles - if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles - $logger.warn "SMILES parsing failed for '#{smiles}'', SMILES string contains whitespaces." - return nil - end + return nil if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles smiles = obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons) - if smiles.empty? - $logger.warn "SMILES parsing failed for '#{smiles}'', this may be caused by an incorrect SMILES string." - return nil - else - Compound.find_or_create_by :smiles => smiles - end + smiles.empty? ? nil : Compound.find_or_create_by(:smiles => smiles) end # Create a compound from InChI string # @param [String] InChI # @return [OpenTox::Compound] def self.from_inchi inchi - #smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip smiles = obconversion(inchi,"inchi","can") - if smiles.empty? - Compound.find_or_create_by(:warnings => ["InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries."]) - else - Compound.find_or_create_by(:smiles => smiles, :inchi => inchi) - end + smiles.empty? ? nil : Compound.find_or_create_by(:smiles => smiles) end # Create a compound from SDF @@ -170,7 +152,7 @@ module OpenTox # @param [String] name, can be also an InChI/InChiKey, CAS number, etc # @return [OpenTox::Compound] def self.from_name name - Compound.from_smiles RestClientWrapper.get(File.join(CACTUS_URI,URI.escape(name),"smiles")) + Compound.from_smiles RestClientWrapper.get(File.join(PUBCHEM_URI,"compound","name",URI.escape(name),"property","CanonicalSMILES","TXT")).chomp end # Get InChI @@ -238,56 +220,16 @@ module OpenTox # names = compound.names # @return [Array<String>] def names - update(:names => RestClientWrapper.get("#{CACTUS_URI}#{inchi}/names").split("\n")) unless self["names"] + update(:names => RestClientWrapper.get(File.join(PUBCHEM_URI,"compound","smiles",URI.escape(smiles),"synonyms","TXT")).split("\n")) #unless self["names"] self["names"] end # Get PubChem Compound Identifier (CID), obtained via REST call to PubChem # @return [String] def cid - pug_uri = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/" - update(:cid => RestClientWrapper.post(File.join(pug_uri, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"] + update(:cid => RestClientWrapper.post(File.join(PUBCHEM_URI, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"] self["cid"] end - - # Get ChEMBL database compound id, obtained via REST call to ChEMBL - # @return [String] - def chemblid - # https://www.ebi.ac.uk/chembldb/ws#individualCompoundByInChiKey - uri = "https://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json" - update(:chemblid => JSON.parse(RestClientWrapper.get(uri))["compounds"].first["chemblId"]) unless self["chemblid"] - self["chemblid"] - end - - def db_neighbors min_sim: 0.1, dataset_id: - #p fingerprints[DEFAULT_FINGERPRINT] - # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb - - #qn = default_fingerprint_size - #qmin = qn * threshold - #qmax = qn / threshold - #not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...) - #reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)] - aggregate = [ - #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}}, - #{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self - {'$project' => { - 'similarity' => {'$let' => { - 'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}}, - 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]} - }}, - '_id' => 1, - #'measurements' => 1, - 'dataset_ids' => 1 - }}, - {'$match' => {'similarity' => {'$gte' => min_sim}}}, - {'$sort' => {'similarity' => -1}} - ] - - # TODO move into aggregate pipeline, see http://stackoverflow.com/questions/30537317/mongodb-aggregation-match-if-value-in-array - $mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? dataset_id} - - end # Convert mmol to mg # @return [Float] value in mg @@ -319,7 +261,7 @@ module OpenTox obconversion.read_string obmol, identifier case output_format when /smi|can|inchi/ - obconversion.write_string(obmol).gsub(/\s/,'').chomp + obconversion.write_string(obmol).split(/\s/).first when /sdf/ # TODO: find disconnected structures # strip_salts @@ -332,11 +274,11 @@ module OpenTox print sdf if sdf.match(/.nan/) - $logger.warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure" + #warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure" obconversion.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS) sdf = obconversion.write_string(obmol) if sdf.match(/.nan/) - $logger.warn "2D generation failed for compound #{identifier}, rendering without coordinates." + #warn "2D generation failed for compound #{identifier}, rendering without coordinates." obconversion.remove_option("gen2D", OpenBabel::OBConversion::GENOPTIONS) sdf = obconversion.write_string(obmol) end |