summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2015-08-13 11:56:40 +0200
committerChristoph Helma <helma@in-silico.ch>2015-08-13 12:04:52 +0200
commit6ab86c253ba0eb79b9e6a20effa2d18626accf2b (patch)
tree508eef99b34eb495493444663af2dd72e138bba6 /lib
parentb7cd3ebbb858a8891c35c45896f1bdd525f3534e (diff)
Use OpenBabel "can" (canonical SMILES) instead of InChI as the internal identifier to avoid an OpenBabel InChI bug.
Diffstat (limited to 'lib')
-rw-r--r--lib/compound.rb54
-rw-r--r--lib/descriptor.rb8
-rw-r--r--lib/lazar-model.rb287
-rw-r--r--lib/lazar.rb2
-rw-r--r--lib/neighbor.rb2
-rw-r--r--lib/overwrite.rb6
6 files changed, 339 insertions, 20 deletions
diff --git a/lib/compound.rb b/lib/compound.rb
index 3418fcc..5343aa0 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -10,13 +10,13 @@ module OpenTox
include OpenTox
field :inchi, type: String
- attr_readonly :inchi
field :smiles, type: String
field :inchikey, type: String
field :names, type: Array
field :cid, type: String
field :chemblid, type: String
- field :image_id, type: BSON::ObjectId
+ field :png_id, type: BSON::ObjectId
+ field :svg_id, type: BSON::ObjectId
field :sdf_id, type: BSON::ObjectId
field :fp4, type: Array
field :fp4_size, type: Integer
@@ -46,14 +46,18 @@ module OpenTox
# @return [OpenTox::Compound] Compound
def self.from_smiles smiles
# do not store smiles because it might be noncanonical
- Compound.find_or_create_by :inchi => obconversion(smiles,"smi","inchi")
+ Compound.find_or_create_by :smiles => obconversion(smiles,"smi","can")
end
# Create a compound from inchi string
# @param inchi [String] smiles InChI string
# @return [OpenTox::Compound] Compound
def self.from_inchi inchi
- Compound.find_or_create_by :inchi => inchi
+ # Temporary workaround for OpenBabel's InChI bug
+ # http://sourceforge.net/p/openbabel/bugs/957/
+ # bug has not been fixed in latest git/development version
+ smiles = `echo "#{inchi}" | babel -iinchi - -ocan`.chomp.strip
+ smiles.empty? ? nil : Compound.find_or_create_by(:smiles => smiles, :inchi => inchi)
end
# Create a compound from sdf string
@@ -61,7 +65,7 @@ module OpenTox
# @return [OpenTox::Compound] Compound
def self.from_sdf sdf
# do not store sdf because it might be 2D
- Compound.find_or_create_by :inchi => obconversion(sdf,"sdf","inchi")
+ Compound.find_or_create_by :smiles => obconversion(sdf,"sdf","can")
end
# Create a compound from name. Relies on an external service for name lookups.
@@ -70,20 +74,30 @@ module OpenTox
# @param name [String] can be also an InChI/InChiKey, CAS number, etc
# @return [OpenTox::Compound] Compound
def self.from_name name
- Compound.find_or_create_by :inchi => RestClientWrapper.get(File.join(CACTUS_URI,URI.escape(name),"stdinchi"))
+ Compound.find_or_create_by :smiles => RestClientWrapper.get(File.join(CACTUS_URI,URI.escape(name),"smiles"))
end
- # Get InChIKey
+ # Get InChI
# @return [String] InChI string
+ def inchi
+ unless self["inchi"]
+ result = `echo "#{self.smiles}" | babel -ismi - -oinchi`.chomp
+ update(:inchi => result.chomp) unless result.empty?
+ end
+ self["inchi"]
+ end
+
+ # Get InChIKey
+ # @return [String] InChIKey string
def inchikey
- update(:inchikey => obconversion(inchi,"inchi","inchikey")) unless self["inchikey"]
+ update(:inchikey => obconversion(smiles,"smi","inchikey")) unless self["inchikey"]
self["inchikey"]
end
# Get (canonical) smiles
# @return [String] Smiles string
def smiles
- update(:smiles => obconversion(inchi,"inchi","smi")) unless self["smiles"] # should give canonical smiles, "can" seems to give incorrect results
+ update(:smiles => obconversion(self["smiles"],"smi","can")) #unless self["smiles"] # should give canonical smiles, "can" seems to give incorrect results
self["smiles"]
end
@@ -91,7 +105,7 @@ module OpenTox
# @return [String] SDF string
def sdf
if self.sdf_id.nil?
- sdf = obconversion(inchi,"inchi","sdf")
+ sdf = obconversion(smiles,"smi","sdf")
file = Mongo::Grid::File.new(sdf, :filename => "#{id}.sdf",:content_type => "chemical/x-mdl-sdfile")
sdf_id = $gridfs.insert_one file
update :sdf_id => sdf_id
@@ -99,17 +113,29 @@ module OpenTox
$gridfs.find_one(_id: self.sdf_id).data
end
+ # Get SVG image
+ # @return [image/svg] Image data
+ def svg
+ if self.svg_id.nil?
+ svg = obconversion(smiles,"smi","svg")
+ file = Mongo::Grid::File.new(svg, :filename => "#{id}.svg", :content_type => "image/svg")
+ update(:image_id => $gridfs.insert_one(file))
+ end
+ $gridfs.find_one(_id: self.svg_id).data
+
+ end
+
# Get png image
# @example
# image = compound.png
# @return [image/png] Image data
def png
- if self.image_id.nil?
- png = obconversion(inchi,"inchi","_png2")
+ if self.png_id.nil?
+ png = obconversion(smiles,"smi","_png2")
file = Mongo::Grid::File.new(Base64.encode64(png), :filename => "#{id}.png", :content_type => "image/png")
- update(:image_id => $gridfs.insert_one(file))
+ update(:png_id => $gridfs.insert_one(file))
end
- Base64.decode64($gridfs.find_one(_id: self.image_id).data)
+ Base64.decode64($gridfs.find_one(_id: self.png_id).data)
end
diff --git a/lib/descriptor.rb b/lib/descriptor.rb
index 335f3dc..f0492a2 100644
--- a/lib/descriptor.rb
+++ b/lib/descriptor.rb
@@ -64,7 +64,7 @@ module OpenTox
@count = count
obconversion = OpenBabel::OBConversion.new
obmol = OpenBabel::OBMol.new
- obconversion.set_in_format('inchi')
+ obconversion.set_in_format('smi')
smarts_pattern = OpenBabel::OBSmartsPattern.new
smarts_features = [smarts_features] if smarts_features.is_a?(Feature)
@smarts = smarts_features.collect{|f| f.smarts}
@@ -77,7 +77,7 @@ module OpenTox
# which worked with opentox-client
# (but no smarts_match)
#p "'#{compound.inchi}'"
- obconversion.read_string(obmol,compound.inchi)
+ obconversion.read_string(obmol,compound.smiles)
@smarts.each_with_index do |smart,s|
smarts_pattern.init(smart)
if smarts_pattern.match(obmol)
@@ -123,10 +123,10 @@ module OpenTox
obdescriptors = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d}
obmol = OpenBabel::OBMol.new
obconversion = OpenBabel::OBConversion.new
- obconversion.set_in_format 'inchi'
+ obconversion.set_in_format 'smi'
last_feature_idx = @physchem_descriptors.size
@compounds.each_with_index do |compound,c|
- obconversion.read_string obmol, compound.inchi
+ obconversion.read_string obmol, compound.smiles
obdescriptors.each_with_index do |descriptor,d|
@data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol))
end
diff --git a/lib/lazar-model.rb b/lib/lazar-model.rb
new file mode 100644
index 0000000..4ca3403
--- /dev/null
+++ b/lib/lazar-model.rb
@@ -0,0 +1,287 @@
+module OpenTox
+
+ module Model
+
+ class Lazar
+ include OpenTox
+ include Mongoid::Document
+ include Mongoid::Timestamps
+ store_in collection: "models"
+
+ field :title, type: String
+ field :endpoint, type: String
+ field :creator, type: String, default: __FILE__
+ # datasets
+ field :training_dataset_id, type: BSON::ObjectId
+ # algorithms
+ field :prediction_algorithm, type: String
+ field :neighbor_algorithm, type: String
+ field :neighbor_algorithm_parameters, type: Hash
+ # prediction feature
+ field :prediction_feature_id, type: BSON::ObjectId
+
+ attr_accessor :prediction_dataset
+ attr_accessor :training_dataset
+
+ # Create a lazar model from a training_dataset and a feature_dataset
+ # @param [OpenTox::Dataset] training_dataset
+ # @return [OpenTox::Model::Lazar] Regression or classification model
+ def self.create training_dataset
+
+ bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1
+
+ # TODO document convention
+ prediction_feature = training_dataset.features.first
+ prediction_feature.nominal ? lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new
+ lazar.training_dataset_id = training_dataset.id
+ lazar.prediction_feature_id = prediction_feature.id
+ lazar.title = prediction_feature.title
+
+ lazar.save
+ lazar
+ end
+
+ def predict object
+
+ t = Time.now
+ at = Time.now
+
+ training_dataset = Dataset.find training_dataset_id
+ prediction_feature = Feature.find prediction_feature_id
+
+ # parse data
+ compounds = []
+ case object.class.to_s
+ when "OpenTox::Compound"
+ compounds = [object]
+ when "Array"
+ compounds = object
+ when "OpenTox::Dataset"
+ compounds = object.compounds
+ else
+ bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter."
+ end
+
+ # make predictions
+ predictions = []
+ compounds.each_with_index do |compound,c|
+ t = Time.new
+ neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters)
+ # add activities
+ # TODO: improve efficiency, takes 3 times longer than previous version
+ # TODO database activity??
+ neighbors.collect! do |n|
+ rows = training_dataset.compound_ids.each_index.select{|i| training_dataset.compound_ids[i] == n.first}
+ acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact
+ acts.empty? ? nil : n << acts
+ end
+ neighbors.compact! # remove neighbors without training activities
+ predictions << Algorithm.run(prediction_algorithm, neighbors)
+ end
+
+ # serialize result
+ case object.class.to_s
+ when "OpenTox::Compound"
+ return predictions.first
+ when "Array"
+ return predictions
+ when "OpenTox::Dataset"
+ # prepare prediction dataset
+ prediction_dataset = LazarPrediction.new(
+ :title => "Lazar prediction for #{prediction_feature.title}",
+ :creator => __FILE__,
+ :prediction_feature_id => prediction_feature.id
+
+ )
+ confidence_feature = OpenTox::NumericFeature.find_or_create_by( "title" => "Prediction confidence" )
+ # TODO move into warnings field
+ warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings")
+ prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ]
+ prediction_dataset.compounds = compounds
+ prediction_dataset.data_entries = predictions
+ prediction_dataset.save_all
+ return prediction_dataset
+ end
+
+ end
+
+ def training_activities
+ i = training_dataset.feature_ids.index prediction_feature_id
+ training_dataset.data_entries.collect{|de| de[i]}
+ end
+
+ end
+
+ class LazarClassification < Lazar
+ def initialize
+ super
+ self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote"
+ self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
+ self.neighbor_algorithm_parameters = {:min_sim => 0.7}
+ end
+ end
+
+ class LazarFminerClassification < LazarClassification
+ #field :feature_dataset_id, type: BSON::ObjectId
+ #field :feature_calculation_algorithm, type: String
+
+ def self.create training_dataset
+ model = super(training_dataset)
+ model.update "_type" => self.to_s # adjust class
+ model = self.find model.id # adjust class
+ model.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fminer_similarity"
+ model.neighbor_algorithm_parameters = {
+ :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match",
+ :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset).id,
+ :min_sim => 0.3
+ }
+ model.save
+ model
+ end
+
+=begin
+ def predict object
+
+ t = Time.now
+ at = Time.now
+
+ @training_dataset = OpenTox::Dataset.find(training_dataset_id)
+ @feature_dataset = OpenTox::Dataset.find(feature_dataset_id)
+
+ compounds = []
+ case object.class.to_s
+ when "OpenTox::Compound"
+ compounds = [object]
+ when "Array"
+ compounds = object
+ when "OpenTox::Dataset"
+ compounds = object.compounds
+ else
+ bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter."
+ end
+
+ $logger.debug "Setup: #{Time.now-t}"
+ t = Time.now
+
+ @query_fingerprint = Algorithm.run(feature_calculation_algorithm, compounds, @feature_dataset.features.collect{|f| f.name} )
+
+ $logger.debug "Query fingerprint calculation: #{Time.now-t}"
+ t = Time.now
+
+ predictions = []
+ prediction_feature = OpenTox::Feature.find prediction_feature_id
+ tt = 0
+ pt = 0
+ nt = 0
+ st = 0
+ nit = 0
+ @training_fingerprints ||= @feature_dataset.data_entries
+ compounds.each_with_index do |compound,c|
+ t = Time.new
+
+ $logger.debug "predict compound #{c+1}/#{compounds.size} #{compound.inchi}"
+
+ database_activities = @training_dataset.values(compound,prediction_feature)
+ if database_activities and !database_activities.empty?
+ database_activities = database_activities.first if database_activities.size == 1
+ $logger.debug "Compound #{compound.inchi} occurs in training dataset with activity #{database_activities}"
+ predictions << {:compound => compound, :value => database_activities, :confidence => "measured"}
+ next
+ else
+
+ #training_fingerprints = @feature_dataset.data_entries
+ query_fingerprint = @query_fingerprint[c]
+ neighbors = []
+ tt += Time.now-t
+ t = Time.new
+
+
+ # find neighbors
+ @training_fingerprints.each_with_index do |fingerprint, i|
+ ts = Time.new
+ sim = Algorithm.run(similarity_algorithm,fingerprint, query_fingerprint)
+ st += Time.now-ts
+ ts = Time.new
+ if sim > self.min_sim
+ if prediction_algorithm =~ /Regression/
+ neighbors << [@feature_dataset.compound_ids[i],sim,training_activities[i], fingerprint]
+ else
+ neighbors << [@feature_dataset.compound_ids[i],sim,training_activities[i]] # use compound_ids, instantiation of Compounds is too time consuming
+ end
+ end
+ nit += Time.now-ts
+ end
+
+ if neighbors.empty?
+ predictions << {:compound => compound, :value => nil, :confidence => nil, :warning => "No neighbors with similarity > #{min_sim} in dataset #{training_dataset.id}"}
+ next
+ end
+ nt += Time.now-t
+ t = Time.new
+
+ if prediction_algorithm =~ /Regression/
+ prediction = Algorithm.run(prediction_algorithm, neighbors, :min_train_performance => self.min_train_performance)
+ else
+ prediction = Algorithm.run(prediction_algorithm, neighbors)
+ end
+ prediction[:compound] = compound
+ prediction[:neighbors] = neighbors.sort{|a,b| b[1] <=> a[1]} # sort by descending similarity (most similar neighbor first)
+
+
+ # AM: transform to original space (TODO)
+ #confidence_value = ((confidence_value+1.0)/2.0).abs if prediction.first and similarity_algorithm =~ /cosine/
+
+
+ $logger.debug "predicted value: #{prediction[:value]}, confidence: #{prediction[:confidence]}"
+ predictions << prediction
+ pt += Time.now-t
+ end
+
+ end
+ $logger.debug "Transform time: #{tt}"
+ $logger.debug "Neighbor search time: #{nt} (Similarity calculation: #{st}, Neighbor insert: #{nit})"
+ $logger.debug "Prediction time: #{pt}"
+ $logger.debug "Total prediction time: #{Time.now-at}"
+
+ # serialize result
+ case object.class.to_s
+ when "OpenTox::Compound"
+ return predictions.first
+ when "Array"
+ return predictions
+ when "OpenTox::Dataset"
+ # prepare prediction dataset
+ prediction_dataset = LazarPrediction.new(
+ :title => "Lazar prediction for #{prediction_feature.title}",
+ :creator => __FILE__,
+ :prediction_feature_id => prediction_feature.id
+
+ )
+ confidence_feature = OpenTox::NumericFeature.find_or_create_by( "title" => "Prediction confidence" )
+ warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings")
+ prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ]
+ prediction_dataset.compounds = compounds
+ prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence],p[:warning]]}
+ prediction_dataset.save_all
+ return prediction_dataset
+ end
+
+ end
+=end
+ end
+
+ class LazarRegression < Lazar
+
+ def initialize
+ super
+ self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
+ self.prediction_algorithm = "OpenTox::Algorithm::Regression.weighted_average"
+ self.neighbor_algorithm_parameters = {:min_sim => 0.7}
+ end
+
+ end
+
+ end
+
+end
+
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 2e7e7c2..0c5e18b 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -58,7 +58,7 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation"]# Algor
"algorithm.rb",
"descriptor.rb",
"bbrc.rb",
- "lazar.rb",
+ "lazar-model.rb",
"similarity.rb",
"neighbor.rb",
"classification.rb",
diff --git a/lib/neighbor.rb b/lib/neighbor.rb
index a2c28d4..d849cbf 100644
--- a/lib/neighbor.rb
+++ b/lib/neighbor.rb
@@ -8,7 +8,7 @@ module OpenTox
def self.fminer_similarity compound, params
feature_dataset = Dataset.find params[:feature_dataset_id]
- query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features.collect{|f| f.smarts} )
+ query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features)
neighbors = []
# find neighbors
diff --git a/lib/overwrite.rb b/lib/overwrite.rb
index 2eb0b39..a27d685 100644
--- a/lib/overwrite.rb
+++ b/lib/overwrite.rb
@@ -11,6 +11,12 @@ class Object
end
end
+class Numeric
+ def percent_of(n)
+ self.to_f / n.to_f * 100.0
+ end
+end
+
module Enumerable
# @return [Array] only the duplicates of an enumerable
def duplicates