From ca2bb0f90335b1f2c4ecc28ee423e85b281ffcf0 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 4 Nov 2015 17:50:17 +0100 Subject: neighbor search delegated to database backend --- lib/compound.rb | 52 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 21 deletions(-) (limited to 'lib/compound.rb') diff --git a/lib/compound.rb b/lib/compound.rb index a26528b..c5e7f02 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -23,13 +23,16 @@ module OpenTox field :sdf_id, type: BSON::ObjectId field :fingerprints, type: Hash, default: {} field :default_fingerprint_size, type: Integer + field :dataset_ids, type: Array, default: [] + field :features, type: Hash, default: {} index({smiles: 1}, {unique: true}) + #index({default_fingerprint: 1}, {unique: false}) # Overwrites standard Mongoid method to create fingerprints before database insertion def self.find_or_create_by params compound = self.find_or_initialize_by params - compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT) + compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT).size compound.save compound end @@ -41,7 +44,7 @@ module OpenTox if type == "MP2D" fp = obconversion(smiles,"smi","mpd").strip.split("\t") name = fp.shift # remove Title - fingerprints[type] = fp + fingerprints[type] = fp.uniq # no fingerprint counts #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html elsif type== "MNA" level = 2 # TODO: level as parameter, evaluate level 1, see paper @@ -244,20 +247,23 @@ module OpenTox def fingerprint_neighbors params bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim] neighbors = [] - #if params[:type] == DEFAULT_FINGERPRINT - #neighbors = db_neighbors params - #p neighbors - #else + if params[:type] == DEFAULT_FINGERPRINT + neighbors = db_neighbors params + else query_fingerprint = self.fingerprint params[:type] - training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound| - unless self == compound + training_dataset = Dataset.find(params[:training_dataset_id]) + prediction_feature = training_dataset.features.first + training_dataset.compounds.each do |compound| + #unless self == compound candidate_fingerprint = compound.fingerprint params[:type] sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f - neighbors << [compound.id, sim] if sim >= params[:min_sim] - end + feature_values = training_dataset.values(compound,prediction_feature) + neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim] + #end end - #end - neighbors.sort{|a,b| b.last <=> a.last} + neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]} + end + neighbors end def fminer_neighbors params @@ -299,30 +305,34 @@ module OpenTox end def db_neighbors params - p "DB NEIGHBORS" - p params - # TODO restrict to dataset # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb - qn = fingerprint(params[:type]).size + + #qn = default_fingerprint_size #qmin = qn * threshold #qmax = qn / threshold #not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...) #reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)] aggregate = [ #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}}, - {'$match' => {'_id' => {'$ne' => self.id}}}, # remove self + #{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self {'$project' => { 'tanimoto' => {'$let' => { - 'vars' => {'common' => {'$size' => {'$setIntersection' => ["'$#{DEFAULT_FINGERPRINT}'", DEFAULT_FINGERPRINT]}}}, - 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [qn, '$default_fingerprint_size']}, '$$common']}]} + 'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}}, + #'vars' => {'common' => {'$size' => {'$setIntersection' => ["$default_fingerprint", default_fingerprint]}}}, + 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]} }}, - '_id' => 1 + '_id' => 1, + 'features' => 1, + 'dataset_ids' => 1 }}, {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}}, {'$sort' => {'tanimoto' => -1}} ] - $mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] } + $mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]} + + + #$mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] } end -- cgit v1.2.3 From e63e97086ac05e7a86f1a53bdcbc72eec0cabf16 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 9 Nov 2015 14:58:34 +0100 Subject: leave one out validation implemented --- lib/compound.rb | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'lib/compound.rb') diff --git a/lib/compound.rb b/lib/compound.rb index ad0eaba..d5a4cbb 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -344,16 +344,20 @@ module OpenTox return mg end - # Get mg from mmol - # @return [Float] value in mg - def mmol_to_mg(value, mw) + # Get mg from mmol + # @return [Float] value in mg + def mmol_to_mg(value, mw) mg = (value.to_f)*(mw.to_f) return mg end - # Get mg from logmg - # @return [Float] value in mg - def logmg_to_mg(value) + def mg_to_mmol mg + mg.to_f/molecular_weight + end + + # Get mg from logmg + # @return [Float] value in mg + def logmg_to_mg(value) mg = 10**value.to_f return mg end @@ -364,7 +368,7 @@ module OpenTox if self["molecular_weight"]==0.0 || self["molecular_weight"].nil? update(:molecular_weight => OpenTox::Algorithm::Descriptor.physchem(self, ["Openbabel.MW"]).first) end - self["molecular_weight"] + self["molecular_weight"].to_f end -- cgit v1.2.3 From d6eced29e104b9bc1923b2ac89b2700a48adf07a Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 8 Jan 2016 11:00:20 +0100 Subject: mg-mmol conversion fixed --- lib/compound.rb | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) (limited to 'lib/compound.rb') diff --git a/lib/compound.rb b/lib/compound.rb index d5a4cbb..040fd6f 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -337,30 +337,15 @@ module OpenTox end - # Get mg from logmmol (for nch LOAEL/pTD50 data) - # @return [Float] value in mg - def logmmol_to_mg(value, mw) - mg = (10**(-1.0*value.to_f)*(mw.to_f*1000)) - return mg - end - # Get mg from mmol # @return [Float] value in mg - def mmol_to_mg(value, mw) - mg = (value.to_f)*(mw.to_f) - return mg + def mmol_to_mg mmol + mmol.to_f*molecular_weight end def mg_to_mmol mg mg.to_f/molecular_weight end - - # Get mg from logmg - # @return [Float] value in mg - def logmg_to_mg(value) - mg = 10**value.to_f - return mg - end # Calculate molecular weight of Compound with OB and store it in object # @return [Float] molecular weight @@ -371,7 +356,6 @@ module OpenTox self["molecular_weight"].to_f end - private def self.obconversion(identifier,input_format,output_format,option=nil) -- cgit v1.2.3 From f61b7d3c65d084747dc1bf87214e5ec0c57326be Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 9 Feb 2016 11:04:00 +0100 Subject: pls regression --- lib/compound.rb | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'lib/compound.rb') diff --git a/lib/compound.rb b/lib/compound.rb index 040fd6f..8f37247 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -38,7 +38,7 @@ module OpenTox compound end - def fingerprint type="MP2D" + def fingerprint type=DEFAULT_FINGERPRINT unless fingerprints[type] return [] unless self.smiles #http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format @@ -337,12 +337,14 @@ module OpenTox end - # Get mg from mmol + # Convert mg to mmol # @return [Float] value in mg def mmol_to_mg mmol mmol.to_f*molecular_weight end + # Convert mmol to mg + # @return [Float] value in mg def mg_to_mmol mg mg.to_f/molecular_weight end -- cgit v1.2.3 From b90720cc26d789a96fa6f7a054fe06fc8b4ef33d Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sat, 27 Feb 2016 16:47:48 +0100 Subject: local pls regression as default regression algorithm --- lib/compound.rb | 1 + 1 file changed, 1 insertion(+) (limited to 'lib/compound.rb') diff --git a/lib/compound.rb b/lib/compound.rb index 8f37247..d5d6aa9 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -23,6 +23,7 @@ module OpenTox field :sdf_id, type: BSON::ObjectId field :molecular_weight, type: Float field :fingerprints, type: Hash, default: {} + field :physchem, type: Hash, default: {} field :default_fingerprint_size, type: Integer field :dataset_ids, type: Array, default: [] field :features, type: Hash, default: {} -- cgit v1.2.3 From 8c973e16028cb95c978bb08cf79369a5c3520c31 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sun, 28 Feb 2016 12:43:38 +0100 Subject: physchem feature class --- lib/compound.rb | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'lib/compound.rb') diff --git a/lib/compound.rb b/lib/compound.rb index d5d6aa9..4ea4db4 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -7,7 +7,9 @@ CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/" module OpenTox class Compound + require_relative "unique_descriptors.rb" include OpenTox + include OpenTox::Descriptor DEFAULT_FINGERPRINT = "MP2D" @@ -15,7 +17,7 @@ module OpenTox field :smiles, type: String field :inchikey, type: String field :names, type: Array - field :warning, type: String + #field :warnings, type: Array, default: [] field :cid, type: String field :chemblid, type: String field :png_id, type: BSON::ObjectId @@ -23,8 +25,8 @@ module OpenTox field :sdf_id, type: BSON::ObjectId field :molecular_weight, type: Float field :fingerprints, type: Hash, default: {} - field :physchem, type: Hash, default: {} field :default_fingerprint_size, type: Integer + field :physchem_descriptors, type: Hash, default: {} field :dataset_ids, type: Array, default: [] field :features, type: Hash, default: {} @@ -86,19 +88,34 @@ module OpenTox fingerprints[type] end + def physchem descriptor_ids + calculated_descriptor_ids = self[:physchem_descriptors].keys + p names + new = UNIQUEDESCRIPTORS-names + p new + d = self.physchem(self, new) + #p d + #self[:physchem_descriptors].merge! d + self.update_attribute(:physchem_descriptors, self[:physchem_descriptors].merge(d)) + save + self[:physchem_descriptors] + end + # Create a compound from smiles string # @example # compound = OpenTox::Compound.from_smiles("c1ccccc1") # @param [String] smiles Smiles string # @return [OpenTox::Compound] Compound def self.from_smiles smiles - return nil if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles + if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles + $logger.warn "SMILES parsing failed for '#{smiles}'', SMILES string contains whitespaces." + return nil + end smiles = obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons) if smiles.empty? + $logger.warn "SMILES parsing failed for '#{smiles}'', this may be caused by an incorrect SMILES string." return nil - #Compound.find_or_create_by(:warning => "SMILES parsing failed for '#{smiles}', this may be caused by an incorrect SMILES string.") else - #Compound.find_or_create_by :smiles => obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons) Compound.find_or_create_by :smiles => smiles end end @@ -113,7 +130,7 @@ module OpenTox #smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip smiles = obconversion(inchi,"inchi","can") if smiles.empty? - Compound.find_or_create_by(:warning => "InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries.") + Compound.find_or_create_by(:warnings => ["InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries."]) else Compound.find_or_create_by(:smiles => smiles, :inchi => inchi) end -- cgit v1.2.3 From d0c6234fed7d45227fcf9309cb6dc0854d17e647 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sun, 28 Feb 2016 16:00:15 +0100 Subject: physchem calculation and storage in compouds --- lib/compound.rb | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'lib/compound.rb') diff --git a/lib/compound.rb b/lib/compound.rb index 4ea4db4..8c11831 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -17,7 +17,6 @@ module OpenTox field :smiles, type: String field :inchikey, type: String field :names, type: Array - #field :warnings, type: Array, default: [] field :cid, type: String field :chemblid, type: String field :png_id, type: BSON::ObjectId @@ -88,17 +87,26 @@ module OpenTox fingerprints[type] end - def physchem descriptor_ids - calculated_descriptor_ids = self[:physchem_descriptors].keys - p names - new = UNIQUEDESCRIPTORS-names - p new - d = self.physchem(self, new) - #p d - #self[:physchem_descriptors].merge! d - self.update_attribute(:physchem_descriptors, self[:physchem_descriptors].merge(d)) + def physchem descriptors=PhysChem.openbabel_descriptors + # TODO: speedup java descriptors + calculated_ids = physchem_descriptors.keys + # BSON::ObjectId instances are not allowed as keys in a BSON document. + new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids + descs = {} + algos = {} + new_ids.each do |id| + descriptor = PhysChem.find id + descs[[descriptor.library, descriptor.descriptor]] = descriptor + algos[descriptor.name] = descriptor + end + # avoid recalculating Cdk features with multiple values + descs.keys.uniq.each do |k| + descs[k].send(k[0].downcase,k[1],self).each do |n,v| + physchem_descriptors[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. + end + end save - self[:physchem_descriptors] + physchem_descriptors end # Create a compound from smiles string -- cgit v1.2.3 From 0c5d2e678908a2d4aea43efbedbedc2c0439be30 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 14 Mar 2016 15:25:50 +0100 Subject: descriptor tests --- lib/compound.rb | 67 +++++++++++++++++++++------------------------------------ 1 file changed, 24 insertions(+), 43 deletions(-) (limited to 'lib/compound.rb') diff --git a/lib/compound.rb b/lib/compound.rb index 8c11831..2a79fd6 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -1,7 +1,3 @@ -# TODO: check -# *** Open Babel Error in ParseFile -# Could not find contribution data file. - CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/" module OpenTox @@ -9,7 +5,6 @@ module OpenTox class Compound require_relative "unique_descriptors.rb" include OpenTox - include OpenTox::Descriptor DEFAULT_FINGERPRINT = "MP2D" @@ -22,7 +17,6 @@ module OpenTox field :png_id, type: BSON::ObjectId field :svg_id, type: BSON::ObjectId field :sdf_id, type: BSON::ObjectId - field :molecular_weight, type: Float field :fingerprints, type: Hash, default: {} field :default_fingerprint_size, type: Integer field :physchem_descriptors, type: Hash, default: {} @@ -30,7 +24,6 @@ module OpenTox field :features, type: Hash, default: {} index({smiles: 1}, {unique: true}) - #index({default_fingerprint: 1}, {unique: false}) # Overwrites standard Mongoid method to create fingerprints before database insertion def self.find_or_create_by params @@ -106,7 +99,24 @@ module OpenTox end end save - physchem_descriptors + physchem_descriptors.select{|id,v| descriptors.collect{|d| d.id.to_s}.include? id} + end + + def smarts_match smarts, count=false + obconversion = OpenBabel::OBConversion.new + obmol = OpenBabel::OBMol.new + obconversion.set_in_format('smi') + obconversion.read_string(obmol,self.smiles) + smarts_pattern = OpenBabel::OBSmartsPattern.new + smarts.collect do |sma| + smarts_pattern.init(sma.smarts) + if smarts_pattern.match(obmol) + count ? value = smarts_pattern.get_map_list.to_a.size : value = 1 + else + value = 0 + end + value + end end # Create a compound from smiles string @@ -281,34 +291,16 @@ module OpenTox training_dataset = Dataset.find(params[:training_dataset_id]) prediction_feature = training_dataset.features.first training_dataset.compounds.each do |compound| - #unless self == compound - candidate_fingerprint = compound.fingerprint params[:type] - sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f - feature_values = training_dataset.values(compound,prediction_feature) - neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim] - #end + candidate_fingerprint = compound.fingerprint params[:type] + sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f + feature_values = training_dataset.values(compound,prediction_feature) + neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim] end neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]} end neighbors end - def fminer_neighbors params - bad_request_error "Incorrect parameters for Compound#fminer_neighbors. Please provide :feature_dataset_id, :min_sim." unless params[:feature_dataset_id] and params[:min_sim] - feature_dataset = Dataset.find params[:feature_dataset_id] - query_fingerprint = Algorithm::Descriptor.smarts_match(self, feature_dataset.features) - neighbors = [] - - # find neighbors - feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i| - sim = Algorithm::Similarity.tanimoto candidate_fingerprint, query_fingerprint - if sim >= params[:min_sim] - neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming - end - end - neighbors - end - def physchem_neighbors params feature_dataset = Dataset.find params[:feature_dataset_id] query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors] @@ -317,13 +309,7 @@ module OpenTox # TODO implement pearson and cosine similarity separatly R.assign "x", query_fingerprint R.assign "y", candidate_fingerprint - # pearson r - #sim = R.eval("cor(x,y,use='complete.obs',method='pearson')").to_ruby - #p "pearson" - #p sim - #p "cosine" sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first - #p sim if sim >= params[:min_sim] neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming end @@ -357,9 +343,6 @@ module OpenTox ] $mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]} - - - #$mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] } end @@ -378,10 +361,8 @@ module OpenTox # Calculate molecular weight of Compound with OB and store it in object # @return [Float] molecular weight def molecular_weight - if self["molecular_weight"]==0.0 || self["molecular_weight"].nil? - update(:molecular_weight => OpenTox::Algorithm::Descriptor.physchem(self, ["Openbabel.MW"]).first) - end - self["molecular_weight"].to_f + mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW") + physchem([mw_feature])[mw_feature.id.to_s] end private -- cgit v1.2.3