From 8c973e16028cb95c978bb08cf79369a5c3520c31 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sun, 28 Feb 2016 12:43:38 +0100 Subject: physchem feature class --- lib/compound.rb | 29 ++++++++-- lib/descriptor.rb | 35 +++++++----- lib/feature.rb | 8 +-- lib/lazar.rb | 3 +- lib/physchem.rb | 138 ++++++++++++++++++++++++++++++++++++++++++++++ lib/unique_descriptors.rb | 9 ++- 6 files changed, 188 insertions(+), 34 deletions(-) create mode 100644 lib/physchem.rb (limited to 'lib') diff --git a/lib/compound.rb b/lib/compound.rb index d5d6aa9..4ea4db4 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -7,7 +7,9 @@ CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/" module OpenTox class Compound + require_relative "unique_descriptors.rb" include OpenTox + include OpenTox::Descriptor DEFAULT_FINGERPRINT = "MP2D" @@ -15,7 +17,7 @@ module OpenTox field :smiles, type: String field :inchikey, type: String field :names, type: Array - field :warning, type: String + #field :warnings, type: Array, default: [] field :cid, type: String field :chemblid, type: String field :png_id, type: BSON::ObjectId @@ -23,8 +25,8 @@ module OpenTox field :sdf_id, type: BSON::ObjectId field :molecular_weight, type: Float field :fingerprints, type: Hash, default: {} - field :physchem, type: Hash, default: {} field :default_fingerprint_size, type: Integer + field :physchem_descriptors, type: Hash, default: {} field :dataset_ids, type: Array, default: [] field :features, type: Hash, default: {} @@ -86,19 +88,34 @@ module OpenTox fingerprints[type] end + def physchem descriptor_ids + calculated_descriptor_ids = self[:physchem_descriptors].keys + p names + new = UNIQUEDESCRIPTORS-names + p new + d = self.physchem(self, new) + #p d + #self[:physchem_descriptors].merge! d + self.update_attribute(:physchem_descriptors, self[:physchem_descriptors].merge(d)) + save + self[:physchem_descriptors] + end + # Create a compound from smiles string # @example # compound = OpenTox::Compound.from_smiles("c1ccccc1") # @param [String] smiles Smiles string # @return [OpenTox::Compound] Compound def self.from_smiles smiles - return nil if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles + if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles + $logger.warn "SMILES parsing failed for '#{smiles}'', SMILES string contains whitespaces." + return nil + end smiles = obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons) if smiles.empty? + $logger.warn "SMILES parsing failed for '#{smiles}'', this may be caused by an incorrect SMILES string." return nil - #Compound.find_or_create_by(:warning => "SMILES parsing failed for '#{smiles}', this may be caused by an incorrect SMILES string.") else - #Compound.find_or_create_by :smiles => obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons) Compound.find_or_create_by :smiles => smiles end end @@ -113,7 +130,7 @@ module OpenTox #smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip smiles = obconversion(inchi,"inchi","can") if smiles.empty? - Compound.find_or_create_by(:warning => "InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries.") + Compound.find_or_create_by(:warnings => ["InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries."]) else Compound.find_or_create_by(:smiles => smiles, :inchi => inchi) end diff --git a/lib/descriptor.rb b/lib/descriptor.rb index 93ce591..d6b2e85 100644 --- a/lib/descriptor.rb +++ b/lib/descriptor.rb @@ -4,10 +4,10 @@ ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk" module OpenTox - module Algorithm + #module Algorithm # Class for descriptor calculations - class Descriptor + module Descriptor include OpenTox JAVA_DIR = File.join(File.dirname(__FILE__),"..","java") @@ -19,20 +19,19 @@ module OpenTox obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"] OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| name,description = d.split(/\s+/,2) - ["Openbabel."+name,description] unless obexclude.include? name + ["Openbabel_"+name,description] unless obexclude.include? name end.compact.sort{|a,b| a[0] <=> b[0]}] cdk_desc = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`) - CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| ["Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}] - CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''); d[:names].collect{ |name| prefix+"."+name } }.flatten + CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| ["Cdk_"+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}] + CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix="Cdk_"+d[:java_class].split('.').last.sub(/Descriptor/,''); d[:names].collect{ |name| prefix+"_"+name } }.flatten # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug) joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"] # strip Joelib messages from stdout JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d| - name = d[:java_class].sub(/^joelib2.feature.types./,'') - # impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java - ["Joelib."+name, "no description available"] unless joelibexclude.include? name + name = d[:java_class].sub(/^joelib2.feature.types./,'').gsub(/\./,"_") + ["Joelib_"+name, "impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java"] unless joelibexclude.include? name end.compact.sort{|a,b| a[0] <=> b[0]}] DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS)) @@ -42,12 +41,12 @@ module OpenTox # Description of available descriptors def self.description descriptor - lib = descriptor.split('.').first + lib = descriptor.split('_').first case lib when "Openbabel" OBDESCRIPTORS[descriptor] when "Cdk" - name = descriptor.split('.')[0..-2].join('.') + name = descriptor.split('_')[0..-2].join('_') CDKDESCRIPTORS[name] when "Joelib" JOELIBDESCRIPTORS[descriptor] @@ -101,7 +100,7 @@ module OpenTox @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features des = {} @descriptors.each do |d| - lib, descriptor = d.split(".",2) + lib, descriptor = d.split("_",2) lib = lib.downcase.to_sym des[lib] ||= [] des[lib] << descriptor @@ -125,7 +124,7 @@ module OpenTox @data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol)) end end - @physchem_descriptors += descriptors.collect{|d| "Openbabel.#{d}"} + @physchem_descriptors += descriptors.collect{|d| "Openbabel_#{d}"} end def self.java_descriptors descriptors, lib @@ -208,10 +207,16 @@ module OpenTox end def self.serialize - @data_entries.collect!{|de| de.collect{|v| v.round(5) unless v.nil?}} + #@data_entries.collect!{|de| de.collect{|v| v.round(5) unless v.nil?}} case @input_class + # TODO beautify and fix for other objects when "OpenTox::Compound" - @data_entries.first + r = {} + @data_entries.first.each_with_index do |d,i| + # TODO fix @ source + r[@physchem_descriptors[i].gsub(/\./,'_')] = d + end + r when "Array" @data_entries when "OpenTox::Dataset" @@ -243,5 +248,5 @@ module OpenTox end private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize end - end + #end end diff --git a/lib/feature.rb b/lib/feature.rb index a308a55..21572ca 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -5,6 +5,7 @@ module OpenTox field :nominal, type: Boolean field :numeric, type: Boolean field :measured, type: Boolean + field :calculated, type: Boolean end # Feature for categorical variables @@ -42,13 +43,6 @@ module OpenTox field :dataset_id end - # Feature for physico-chemical descriptors - class PhysChemDescriptor < NumericFeature - field :algorithm, type: String, default: "OpenTox::Algorithm::Descriptor.physchem" - field :parameters, type: Hash - field :creator, type: String - end - # Feature for categorical bioassay results class NominalBioAssay < NominalFeature end diff --git a/lib/lazar.rb b/lib/lazar.rb index e5c1609..c43dae7 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -69,11 +69,12 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveO "error.rb", "opentox.rb", "feature.rb", + "physchem.rb", + "descriptor.rb", "compound.rb", "dataset.rb", "descriptor.rb", "algorithm.rb", - "descriptor.rb", "bbrc.rb", "model.rb", "similarity.rb", diff --git a/lib/physchem.rb b/lib/physchem.rb new file mode 100644 index 0000000..1126e69 --- /dev/null +++ b/lib/physchem.rb @@ -0,0 +1,138 @@ +module OpenTox + + # Feature for physico-chemical descriptors + class PhysChem < NumericFeature + + field :library, type: String + field :descriptor, type: String + field :description, type: String + + JAVA_DIR = File.join(File.dirname(__FILE__),"..","java") + CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last + JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar") + LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar") + JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar") + + obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"] + OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| + name,description = d.split(/\s+/,2) + ["Openbabel."+name,description] unless obexclude.include? name + end.compact.sort{|a,b| a[0] <=> b[0]}] + + cdkdescriptors = {} + CDK_DESCRIPTIONS = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`) + CDK_DESCRIPTIONS.each do |d| + prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,'') + d[:names].each { |name| cdkdescriptors[prefix+"."+name] = d[:description] } + end + CDKDESCRIPTORS = cdkdescriptors + + # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug) + joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"] + # strip Joelib messages from stdout + JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d| + name = d[:java_class].sub(/^joelib2.feature.types./,'') + ["Joelib."+name, "JOELIb does not provide meaningful descriptions, see java/JoelibDescriptors.java for details."] unless joelibexclude.include? name + end.compact.sort{|a,b| a[0] <=> b[0]}] + + DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS)) + + require_relative "unique_descriptors.rb" + + def self.descriptors + DESCRIPTORS.collect do |name,description| + lib,desc = name.split('.',2) + self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + end + end + + def self.unique_descriptors + udesc = [] + UNIQUEDESCRIPTORS.each do |name| + lib,desc = name.split('.',2) + if lib == "Cdk" + CDK_DESCRIPTIONS.select{|d| desc == d[:java_class].split('.').last.sub('Descriptor','') }.first[:names].each do |n| + dname = "#{name}.#{n}" + description = DESCRIPTORS[dname] + udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + end + else + description = DESCRIPTORS[name] + udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false) + end + end + udesc + end + + # Description of available descriptors + def self.description descriptor + lib = descriptor.split('_').first + case lib + when "Openbabel" + OBDESCRIPTORS[descriptor] + when "Cdk" + name = descriptor.split('_')[0..-2].join('_') + CDKDESCRIPTORS[name] + when "Joelib" + JOELIBDESCRIPTORS[descriptor] + when "lookup" + "Read feature values from a dataset" + end + end + + def calculate compound + result = send library.downcase,descriptor,compound + p result + result[self.name] + end + + def openbabel descriptor, compound + obdescriptor = OpenBabel::OBDescriptor.find_type descriptor + obmol = OpenBabel::OBMol.new + obconversion = OpenBabel::OBConversion.new + obconversion.set_in_format 'smi' + obconversion.read_string obmol, compound.smiles + {"#{library.capitalize}.#{descriptor}" => fix_value(obdescriptor.predict(obmol))} + end + + def cdk descriptor, compound + java_descriptor "cdk", descriptor, compound + end + + def joelib descriptor, compound + java_descriptor "joelib", descriptor, compound + end + + private + + def java_descriptor lib, descriptor, compound + + sdf_3d = "/tmp/#{SecureRandom.uuid}.sdf" + File.open(sdf_3d,"w+"){|f| f.print compound.sdf} + + # use java system call (rjb blocks within tasks) + # use Tempfiles to avoid "Argument list too long" error + case lib + when "cdk" + `java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf_3d} #{descriptor}` + when "joelib" + `java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf_3d} #{descriptor}` + end + result = YAML.load_file("#{sdf_3d}#{lib}.yaml").first + result.keys.each{|k| result[k] = result.delete(k)} + result + end + + def fix_value val + val = val.first if val.is_a? Array and val.size == 1 + val = nil if val == "NaN" + if val.numeric? + val = Float(val) + val = nil if val.nan? or val.infinite? + end + val + end + + end + +end diff --git a/lib/unique_descriptors.rb b/lib/unique_descriptors.rb index cf9cbf3..03a9b08 100644 --- a/lib/unique_descriptors.rb +++ b/lib/unique_descriptors.rb @@ -12,7 +12,7 @@ UNIQUEDESCRIPTORS = [ "Openbabel.HBA1", #Number of Hydrogen Bond Acceptors 1 (JoelLib) "Openbabel.HBA2", #Number of Hydrogen Bond Acceptors 2 (JoelLib) "Openbabel.HBD", #Number of Hydrogen Bond Donors (JoelLib) - #"Openbabel.L5", #Lipinski Rule of Five# TODO Openbabel.L5 returns nil, investigate!!! + #"Openbabe..L5", #Lipinski Rule of Five# TODO Openbabel.L5 returns nil, investigate!!! "Openbabel.logP", #octanol/water partition coefficient "Openbabel.MP", #Melting point "Openbabel.MR", #molar refractivity @@ -75,7 +75,7 @@ UNIQUEDESCRIPTORS = [ "Joelib.count.NumberOfP", #no description available "Joelib.count.NumberOfO", #no description available "Joelib.count.NumberOfN", #no description available - #"Joelib.count.AromaticBonds", #no description available + #"Joeli#.count.AromaticBonds", #no description available "Joelib.count.NumberOfI", #no description available "Joelib.count.NumberOfF", #no description available "Joelib.count.NumberOfC", #no description available @@ -91,7 +91,7 @@ UNIQUEDESCRIPTORS = [ "Joelib.GeometricalShapeCoefficient", #no description available #"Joelib.MolecularWeight", #no description available "Joelib.FractionRotatableBonds", #no description available - #"Joelib.count.HBD2", #no description available + #"Joeli..count.HBD2", #no description available #"Joelib.count.HBD1", #no description available "Joelib.LogP", #no description available "Joelib.GraphShapeCoefficient", #no description available @@ -116,5 +116,4 @@ UNIQUEDESCRIPTORS = [ "Joelib.count.SOGroups", #no description available "Joelib.TopologicalDiameter", #no description available "Joelib.count.NumberOfHal", #no description available - -].sort +] -- cgit v1.2.3