From 6ab86c253ba0eb79b9e6a20effa2d18626accf2b Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 13 Aug 2015 11:56:40 +0200 Subject: OpenBabel can (canonical smiles) instead of inchi as internal identifier to avoid OpenBabel InChi bug. --- lib/compound.rb | 54 +++++++--- lib/descriptor.rb | 8 +- lib/lazar-model.rb | 287 +++++++++++++++++++++++++++++++++++++++++++++++++++++ lib/lazar.rb | 2 +- lib/neighbor.rb | 2 +- lib/overwrite.rb | 6 ++ 6 files changed, 339 insertions(+), 20 deletions(-) create mode 100644 lib/lazar-model.rb (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index 3418fcc..5343aa0 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -10,13 +10,13 @@ module OpenTox include OpenTox field :inchi, type: String - attr_readonly :inchi field :smiles, type: String field :inchikey, type: String field :names, type: Array field :cid, type: String field :chemblid, type: String - field :image_id, type: BSON::ObjectId + field :png_id, type: BSON::ObjectId + field :svg_id, type: BSON::ObjectId field :sdf_id, type: BSON::ObjectId field :fp4, type: Array field :fp4_size, type: Integer @@ -46,14 +46,18 @@ module OpenTox # @return [OpenTox::Compound] Compound def self.from_smiles smiles # do not store smiles because it might be noncanonical - Compound.find_or_create_by :inchi => obconversion(smiles,"smi","inchi") + Compound.find_or_create_by :smiles => obconversion(smiles,"smi","can") end # Create a compound from inchi string # @param inchi [String] smiles InChI string # @return [OpenTox::Compound] Compound def self.from_inchi inchi - Compound.find_or_create_by :inchi => inchi + # Temporary workaround for OpenBabels Inchi bug + # http://sourceforge.net/p/openbabel/bugs/957/ + # bug has not been fixed in latest git/development version + smiles = `echo "#{inchi}" | babel -iinchi - -ocan`.chomp.strip + smiles.empty? ? nil : Compound.find_or_create_by(:smiles => smiles, :inchi => inchi) end # Create a compound from sdf string @@ -61,7 +65,7 @@ module OpenTox # @return [OpenTox::Compound] Compound def self.from_sdf sdf # do not store sdf because it might be 2D - Compound.find_or_create_by :inchi => obconversion(sdf,"sdf","inchi") + Compound.find_or_create_by :smiles => obconversion(sdf,"sdf","can") end # Create a compound from name. Relies on an external service for name lookups. @@ -70,20 +74,30 @@ module OpenTox # @param name [String] can be also an InChI/InChiKey, CAS number, etc # @return [OpenTox::Compound] Compound def self.from_name name - Compound.find_or_create_by :inchi => RestClientWrapper.get(File.join(CACTUS_URI,URI.escape(name),"stdinchi")) + Compound.find_or_create_by :smiles => RestClientWrapper.get(File.join(CACTUS_URI,URI.escape(name),"smiles")) end - # Get InChIKey + # Get InChI # @return [String] InChI string + def inchi + unless self["inchi"] + result = `echo "#{self.smiles}" | babel -ismi - -oinchi`.chomp + update(:inchi => result.chomp) unless result.empty? + end + self["inchi"] + end + + # Get InChIKey + # @return [String] InChIKey string def inchikey - update(:inchikey => obconversion(inchi,"inchi","inchikey")) unless self["inchikey"] + update(:inchikey => obconversion(smiles,"smi","inchikey")) unless self["inchikey"] self["inchikey"] end # Get (canonical) smiles # @return [String] Smiles string def smiles - update(:smiles => obconversion(inchi,"inchi","smi")) unless self["smiles"] # should give canonical smiles, "can" seems to give incorrect results + update(:smiles => obconversion(self["smiles"],"smi","can")) #unless self["smiles"] # should give canonical smiles, "can" seems to give incorrect results self["smiles"] end @@ -91,7 +105,7 @@ module OpenTox # @return [String] SDF string def sdf if self.sdf_id.nil? - sdf = obconversion(inchi,"inchi","sdf") + sdf = obconversion(smiles,"smi","sdf") file = Mongo::Grid::File.new(sdf, :filename => "#{id}.sdf",:content_type => "chemical/x-mdl-sdfile") sdf_id = $gridfs.insert_one file update :sdf_id => sdf_id @@ -99,17 +113,29 @@ module OpenTox $gridfs.find_one(_id: self.sdf_id).data end + # Get SVG image + # @return [image/svg] Image data + def svg + if self.svg_id.nil? + svg = obconversion(smiles,"smi","svg") + file = Mongo::Grid::File.new(svg, :filename => "#{id}.svg", :content_type => "image/svg") + update(:image_id => $gridfs.insert_one(file)) + end + $gridfs.find_one(_id: self.svg_id).data + + end + # Get png image # @example # image = compound.png # @return [image/png] Image data def png - if self.image_id.nil? - png = obconversion(inchi,"inchi","_png2") + if self.png_id.nil? + png = obconversion(smiles,"smi","_png2") file = Mongo::Grid::File.new(Base64.encode64(png), :filename => "#{id}.png", :content_type => "image/png") - update(:image_id => $gridfs.insert_one(file)) + update(:png_id => $gridfs.insert_one(file)) end - Base64.decode64($gridfs.find_one(_id: self.image_id).data) + Base64.decode64($gridfs.find_one(_id: self.png_id).data) end diff --git a/lib/descriptor.rb b/lib/descriptor.rb index 335f3dc..f0492a2 100644 --- a/lib/descriptor.rb +++ b/lib/descriptor.rb @@ -64,7 +64,7 @@ module OpenTox @count = count obconversion = OpenBabel::OBConversion.new obmol = OpenBabel::OBMol.new - obconversion.set_in_format('inchi') + obconversion.set_in_format('smi') smarts_pattern = OpenBabel::OBSmartsPattern.new smarts_features = [smarts_features] if smarts_features.is_a?(Feature) @smarts = smarts_features.collect{|f| f.smarts} @@ -77,7 +77,7 @@ module OpenTox # which worked with opentox-client # (but no smarts_match) #p "'#{compound.inchi}'" - obconversion.read_string(obmol,compound.inchi) + obconversion.read_string(obmol,compound.smiles) @smarts.each_with_index do |smart,s| smarts_pattern.init(smart) if smarts_pattern.match(obmol) @@ -123,10 +123,10 @@ module OpenTox obdescriptors = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d} obmol = OpenBabel::OBMol.new obconversion = OpenBabel::OBConversion.new - obconversion.set_in_format 'inchi' + obconversion.set_in_format 'smi' last_feature_idx = @physchem_descriptors.size @compounds.each_with_index do |compound,c| - obconversion.read_string obmol, compound.inchi + obconversion.read_string obmol, compound.smiles obdescriptors.each_with_index do |descriptor,d| @data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol)) end diff --git a/lib/lazar-model.rb b/lib/lazar-model.rb new file mode 100644 index 0000000..4ca3403 --- /dev/null +++ b/lib/lazar-model.rb @@ -0,0 +1,287 @@ +module OpenTox + + module Model + + class Lazar + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + store_in collection: "models" + + field :title, type: String + field :endpoint, type: String + field :creator, type: String, default: __FILE__ + # datasets + field :training_dataset_id, type: BSON::ObjectId + # algorithms + field :prediction_algorithm, type: String + field :neighbor_algorithm, type: String + field :neighbor_algorithm_parameters, type: Hash + # prediction feature + field :prediction_feature_id, type: BSON::ObjectId + + attr_accessor :prediction_dataset + attr_accessor :training_dataset + + # Create a lazar model from a training_dataset and a feature_dataset + # @param [OpenTox::Dataset] training_dataset + # @return [OpenTox::Model::Lazar] Regression or classification model + def self.create training_dataset + + bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 + + # TODO document convention + prediction_feature = training_dataset.features.first + prediction_feature.nominal ? lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new + lazar.training_dataset_id = training_dataset.id + lazar.prediction_feature_id = prediction_feature.id + lazar.title = prediction_feature.title + + lazar.save + lazar + end + + def predict object + + t = Time.now + at = Time.now + + training_dataset = Dataset.find training_dataset_id + prediction_feature = Feature.find prediction_feature_id + + # parse data + compounds = [] + case object.class.to_s + when "OpenTox::Compound" + compounds = [object] + when "Array" + compounds = object + when "OpenTox::Dataset" + compounds = object.compounds + else + bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter." + end + + # make predictions + predictions = [] + compounds.each_with_index do |compound,c| + t = Time.new + neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters) + # add activities + # TODO: improve efficiency, takes 3 times longer than previous version + # TODO database activity?? + neighbors.collect! do |n| + rows = training_dataset.compound_ids.each_index.select{|i| training_dataset.compound_ids[i] == n.first} + acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact + acts.empty? ? nil : n << acts + end + neighbors.compact! # remove neighbors without training activities + predictions << Algorithm.run(prediction_algorithm, neighbors) + end + + # serialize result + case object.class.to_s + when "OpenTox::Compound" + return predictions.first + when "Array" + return predictions + when "OpenTox::Dataset" + # prepare prediction dataset + prediction_dataset = LazarPrediction.new( + :title => "Lazar prediction for #{prediction_feature.title}", + :creator => __FILE__, + :prediction_feature_id => prediction_feature.id + + ) + confidence_feature = OpenTox::NumericFeature.find_or_create_by( "title" => "Prediction confidence" ) + # TODO move into warnings field + warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings") + prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ] + prediction_dataset.compounds = compounds + prediction_dataset.data_entries = predictions + prediction_dataset.save_all + return prediction_dataset + end + + end + + def training_activities + i = training_dataset.feature_ids.index prediction_feature_id + training_dataset.data_entries.collect{|de| de[i]} + end + + end + + class LazarClassification < Lazar + def initialize + super + self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" + self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity" + self.neighbor_algorithm_parameters = {:min_sim => 0.7} + end + end + + class LazarFminerClassification < LazarClassification + #field :feature_dataset_id, type: BSON::ObjectId + #field :feature_calculation_algorithm, type: String + + def self.create training_dataset + model = super(training_dataset) + model.update "_type" => self.to_s # adjust class + model = self.find model.id # adjust class + model.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fminer_similarity" + model.neighbor_algorithm_parameters = { + :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match", + :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset).id, + :min_sim => 0.3 + } + model.save + model + end + +=begin + def predict object + + t = Time.now + at = Time.now + + @training_dataset = OpenTox::Dataset.find(training_dataset_id) + @feature_dataset = OpenTox::Dataset.find(feature_dataset_id) + + compounds = [] + case object.class.to_s + when "OpenTox::Compound" + compounds = [object] + when "Array" + compounds = object + when "OpenTox::Dataset" + compounds = object.compounds + else + bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter." + end + + $logger.debug "Setup: #{Time.now-t}" + t = Time.now + + @query_fingerprint = Algorithm.run(feature_calculation_algorithm, compounds, @feature_dataset.features.collect{|f| f.name} ) + + $logger.debug "Query fingerprint calculation: #{Time.now-t}" + t = Time.now + + predictions = [] + prediction_feature = OpenTox::Feature.find prediction_feature_id + tt = 0 + pt = 0 + nt = 0 + st = 0 + nit = 0 + @training_fingerprints ||= @feature_dataset.data_entries + compounds.each_with_index do |compound,c| + t = Time.new + + $logger.debug "predict compound #{c+1}/#{compounds.size} #{compound.inchi}" + + database_activities = @training_dataset.values(compound,prediction_feature) + if database_activities and !database_activities.empty? + database_activities = database_activities.first if database_activities.size == 1 + $logger.debug "Compound #{compound.inchi} occurs in training dataset with activity #{database_activities}" + predictions << {:compound => compound, :value => database_activities, :confidence => "measured"} + next + else + + #training_fingerprints = @feature_dataset.data_entries + query_fingerprint = @query_fingerprint[c] + neighbors = [] + tt += Time.now-t + t = Time.new + + + # find neighbors + @training_fingerprints.each_with_index do |fingerprint, i| + ts = Time.new + sim = Algorithm.run(similarity_algorithm,fingerprint, query_fingerprint) + st += Time.now-ts + ts = Time.new + if sim > self.min_sim + if prediction_algorithm =~ /Regression/ + neighbors << [@feature_dataset.compound_ids[i],sim,training_activities[i], fingerprint] + else + neighbors << [@feature_dataset.compound_ids[i],sim,training_activities[i]] # use compound_ids, instantiation of Compounds is too time consuming + end + end + nit += Time.now-ts + end + + if neighbors.empty? + predictions << {:compound => compound, :value => nil, :confidence => nil, :warning => "No neighbors with similarity > #{min_sim} in dataset #{training_dataset.id}"} + next + end + nt += Time.now-t + t = Time.new + + if prediction_algorithm =~ /Regression/ + prediction = Algorithm.run(prediction_algorithm, neighbors, :min_train_performance => self.min_train_performance) + else + prediction = Algorithm.run(prediction_algorithm, neighbors) + end + prediction[:compound] = compound + prediction[:neighbors] = neighbors.sort{|a,b| b[1] <=> a[1]} # sort with ascending similarities + + + # AM: transform to original space (TODO) + #confidence_value = ((confidence_value+1.0)/2.0).abs if prediction.first and similarity_algorithm =~ /cosine/ + + + $logger.debug "predicted value: #{prediction[:value]}, confidence: #{prediction[:confidence]}" + predictions << prediction + pt += Time.now-t + end + + end + $logger.debug "Transform time: #{tt}" + $logger.debug "Neighbor search time: #{nt} (Similarity calculation: #{st}, Neighbor insert: #{nit})" + $logger.debug "Prediction time: #{pt}" + $logger.debug "Total prediction time: #{Time.now-at}" + + # serialize result + case object.class.to_s + when "OpenTox::Compound" + return predictions.first + when "Array" + return predictions + when "OpenTox::Dataset" + # prepare prediction dataset + prediction_dataset = LazarPrediction.new( + :title => "Lazar prediction for #{prediction_feature.title}", + :creator => __FILE__, + :prediction_feature_id => prediction_feature.id + + ) + confidence_feature = OpenTox::NumericFeature.find_or_create_by( "title" => "Prediction confidence" ) + warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings") + prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ] + prediction_dataset.compounds = compounds + prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence],p[:warning]]} + prediction_dataset.save_all + return prediction_dataset + end + + end +=end + end + + class LazarRegression < Lazar + + def initialize + super + self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity" + self.prediction_algorithm = "OpenTox::Algorithm::Regression.weighted_average" + self.neighbor_algorithm_parameters = {:min_sim => 0.7} + end + + end + + end + +end + diff --git a/lib/lazar.rb b/lib/lazar.rb index 2e7e7c2..0c5e18b 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -58,7 +58,7 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation"]# Algor "algorithm.rb", "descriptor.rb", "bbrc.rb", - "lazar.rb", + "lazar-model.rb", "similarity.rb", "neighbor.rb", "classification.rb", diff --git a/lib/neighbor.rb b/lib/neighbor.rb index a2c28d4..d849cbf 100644 --- a/lib/neighbor.rb +++ b/lib/neighbor.rb @@ -8,7 +8,7 @@ module OpenTox def self.fminer_similarity compound, params feature_dataset = Dataset.find params[:feature_dataset_id] - query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features.collect{|f| f.smarts} ) + query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features) neighbors = [] # find neighbors diff --git a/lib/overwrite.rb b/lib/overwrite.rb index 2eb0b39..a27d685 100644 --- a/lib/overwrite.rb +++ b/lib/overwrite.rb @@ -11,6 +11,12 @@ class Object end end +class Numeric + def percent_of(n) + self.to_f / n.to_f * 100.0 + end +end + module Enumerable # @return [Array] only the duplicates of an enumerable def duplicates -- cgit v1.2.3