summary refs log tree commit diff
path: root/lib
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2016-02-28 12:43:38 +0100
committer Christoph Helma <helma@in-silico.ch> 2016-02-28 12:43:38 +0100
commit 8c973e16028cb95c978bb08cf79369a5c3520c31 (patch)
tree 96909cb936fbbf2c3bc43776278953394b93b94f /lib
parent b90720cc26d789a96fa6f7a054fe06fc8b4ef33d (diff)
physchem feature class
Diffstat (limited to 'lib')
-rw-r--r-- lib/compound.rb 29
-rw-r--r-- lib/descriptor.rb 35
-rw-r--r-- lib/feature.rb 8
-rw-r--r-- lib/lazar.rb 3
-rw-r--r-- lib/physchem.rb 138
-rw-r--r-- lib/unique_descriptors.rb 9
6 files changed, 188 insertions, 34 deletions
diff --git a/lib/compound.rb b/lib/compound.rb
index d5d6aa9..4ea4db4 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -7,7 +7,9 @@ CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/"
module OpenTox
class Compound
+ require_relative "unique_descriptors.rb"
include OpenTox
+ include OpenTox::Descriptor
DEFAULT_FINGERPRINT = "MP2D"
@@ -15,7 +17,7 @@ module OpenTox
field :smiles, type: String
field :inchikey, type: String
field :names, type: Array
- field :warning, type: String
+ #field :warnings, type: Array, default: []
field :cid, type: String
field :chemblid, type: String
field :png_id, type: BSON::ObjectId
@@ -23,8 +25,8 @@ module OpenTox
field :sdf_id, type: BSON::ObjectId
field :molecular_weight, type: Float
field :fingerprints, type: Hash, default: {}
- field :physchem, type: Hash, default: {}
field :default_fingerprint_size, type: Integer
+ field :physchem_descriptors, type: Hash, default: {}
field :dataset_ids, type: Array, default: []
field :features, type: Hash, default: {}
@@ -86,19 +88,34 @@ module OpenTox
fingerprints[type]
end
+ def physchem descriptor_ids
+ calculated_descriptor_ids = self[:physchem_descriptors].keys
+ p names
+ new = UNIQUEDESCRIPTORS-names
+ p new
+ d = self.physchem(self, new)
+ #p d
+ #self[:physchem_descriptors].merge! d
+ self.update_attribute(:physchem_descriptors, self[:physchem_descriptors].merge(d))
+ save
+ self[:physchem_descriptors]
+ end
+
# Create a compound from smiles string
# @example
# compound = OpenTox::Compound.from_smiles("c1ccccc1")
# @param [String] smiles Smiles string
# @return [OpenTox::Compound] Compound
def self.from_smiles smiles
- return nil if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles
+ if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles
+ $logger.warn "SMILES parsing failed for '#{smiles}'', SMILES string contains whitespaces."
+ return nil
+ end
smiles = obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons)
if smiles.empty?
+ $logger.warn "SMILES parsing failed for '#{smiles}'', this may be caused by an incorrect SMILES string."
return nil
- #Compound.find_or_create_by(:warning => "SMILES parsing failed for '#{smiles}', this may be caused by an incorrect SMILES string.")
else
- #Compound.find_or_create_by :smiles => obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons)
Compound.find_or_create_by :smiles => smiles
end
end
@@ -113,7 +130,7 @@ module OpenTox
#smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip
smiles = obconversion(inchi,"inchi","can")
if smiles.empty?
- Compound.find_or_create_by(:warning => "InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries.")
+ Compound.find_or_create_by(:warnings => ["InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries."])
else
Compound.find_or_create_by(:smiles => smiles, :inchi => inchi)
end
diff --git a/lib/descriptor.rb b/lib/descriptor.rb
index 93ce591..d6b2e85 100644
--- a/lib/descriptor.rb
+++ b/lib/descriptor.rb
@@ -4,10 +4,10 @@ ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk"
module OpenTox
- module Algorithm
+ #module Algorithm
# Class for descriptor calculations
- class Descriptor
+ module Descriptor
include OpenTox
JAVA_DIR = File.join(File.dirname(__FILE__),"..","java")
@@ -19,20 +19,19 @@ module OpenTox
obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"]
OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
name,description = d.split(/\s+/,2)
- ["Openbabel."+name,description] unless obexclude.include? name
+ ["Openbabel_"+name,description] unless obexclude.include? name
end.compact.sort{|a,b| a[0] <=> b[0]}]
cdk_desc = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`)
- CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| ["Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}]
- CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''); d[:names].collect{ |name| prefix+"."+name } }.flatten
+ CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| ["Cdk_"+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}]
+ CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix="Cdk_"+d[:java_class].split('.').last.sub(/Descriptor/,''); d[:names].collect{ |name| prefix+"_"+name } }.flatten
# exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"]
# strip Joelib messages from stdout
JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
- name = d[:java_class].sub(/^joelib2.feature.types./,'')
- # impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java
- ["Joelib."+name, "no description available"] unless joelibexclude.include? name
+ name = d[:java_class].sub(/^joelib2.feature.types./,'').gsub(/\./,"_")
+ ["Joelib_"+name, "impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java"] unless joelibexclude.include? name
end.compact.sort{|a,b| a[0] <=> b[0]}]
DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS))
@@ -42,12 +41,12 @@ module OpenTox
# Description of available descriptors
def self.description descriptor
- lib = descriptor.split('.').first
+ lib = descriptor.split('_').first
case lib
when "Openbabel"
OBDESCRIPTORS[descriptor]
when "Cdk"
- name = descriptor.split('.')[0..-2].join('.')
+ name = descriptor.split('_')[0..-2].join('_')
CDKDESCRIPTORS[name]
when "Joelib"
JOELIBDESCRIPTORS[descriptor]
@@ -101,7 +100,7 @@ module OpenTox
@physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features
des = {}
@descriptors.each do |d|
- lib, descriptor = d.split(".",2)
+ lib, descriptor = d.split("_",2)
lib = lib.downcase.to_sym
des[lib] ||= []
des[lib] << descriptor
@@ -125,7 +124,7 @@ module OpenTox
@data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol))
end
end
- @physchem_descriptors += descriptors.collect{|d| "Openbabel.#{d}"}
+ @physchem_descriptors += descriptors.collect{|d| "Openbabel_#{d}"}
end
def self.java_descriptors descriptors, lib
@@ -208,10 +207,16 @@ module OpenTox
end
def self.serialize
- @data_entries.collect!{|de| de.collect{|v| v.round(5) unless v.nil?}}
+ #@data_entries.collect!{|de| de.collect{|v| v.round(5) unless v.nil?}}
case @input_class
+ # TODO beautify and fix for other objects
when "OpenTox::Compound"
- @data_entries.first
+ r = {}
+ @data_entries.first.each_with_index do |d,i|
+ # TODO fix @ source
+ r[@physchem_descriptors[i].gsub(/\./,'_')] = d
+ end
+ r
when "Array"
@data_entries
when "OpenTox::Dataset"
@@ -243,5 +248,5 @@ module OpenTox
end
private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize
end
- end
+ #end
end
diff --git a/lib/feature.rb b/lib/feature.rb
index a308a55..21572ca 100644
--- a/lib/feature.rb
+++ b/lib/feature.rb
@@ -5,6 +5,7 @@ module OpenTox
field :nominal, type: Boolean
field :numeric, type: Boolean
field :measured, type: Boolean
+ field :calculated, type: Boolean
end
# Feature for categorical variables
@@ -42,13 +43,6 @@ module OpenTox
field :dataset_id
end
- # Feature for physico-chemical descriptors
- class PhysChemDescriptor < NumericFeature
- field :algorithm, type: String, default: "OpenTox::Algorithm::Descriptor.physchem"
- field :parameters, type: Hash
- field :creator, type: String
- end
-
# Feature for categorical bioassay results
class NominalBioAssay < NominalFeature
end
diff --git a/lib/lazar.rb b/lib/lazar.rb
index e5c1609..c43dae7 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -69,11 +69,12 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveO
"error.rb",
"opentox.rb",
"feature.rb",
+ "physchem.rb",
+ "descriptor.rb",
"compound.rb",
"dataset.rb",
"descriptor.rb",
"algorithm.rb",
- "descriptor.rb",
"bbrc.rb",
"model.rb",
"similarity.rb",
diff --git a/lib/physchem.rb b/lib/physchem.rb
new file mode 100644
index 0000000..1126e69
--- /dev/null
+++ b/lib/physchem.rb
@@ -0,0 +1,138 @@
+module OpenTox
+
+ # Feature for physico-chemical descriptors
+ class PhysChem < NumericFeature
+
+ field :library, type: String
+ field :descriptor, type: String
+ field :description, type: String
+
+ JAVA_DIR = File.join(File.dirname(__FILE__),"..","java")
+ CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last
+ JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar")
+ LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar")
+ JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar")
+
+ obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"]
+ OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
+ name,description = d.split(/\s+/,2)
+ ["Openbabel."+name,description] unless obexclude.include? name
+ end.compact.sort{|a,b| a[0] <=> b[0]}]
+
+ cdkdescriptors = {}
+ CDK_DESCRIPTIONS = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`)
+ CDK_DESCRIPTIONS.each do |d|
+ prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,'')
+ d[:names].each { |name| cdkdescriptors[prefix+"."+name] = d[:description] }
+ end
+ CDKDESCRIPTORS = cdkdescriptors
+
+ # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
+ joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"]
+ # strip Joelib messages from stdout
+ JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
+ name = d[:java_class].sub(/^joelib2.feature.types./,'')
+ ["Joelib."+name, "JOELIb does not provide meaningful descriptions, see java/JoelibDescriptors.java for details."] unless joelibexclude.include? name
+ end.compact.sort{|a,b| a[0] <=> b[0]}]
+
+ DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS))
+
+ require_relative "unique_descriptors.rb"
+
+ def self.descriptors
+ DESCRIPTORS.collect do |name,description|
+ lib,desc = name.split('.',2)
+ self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false)
+ end
+ end
+
+ def self.unique_descriptors
+ udesc = []
+ UNIQUEDESCRIPTORS.each do |name|
+ lib,desc = name.split('.',2)
+ if lib == "Cdk"
+ CDK_DESCRIPTIONS.select{|d| desc == d[:java_class].split('.').last.sub('Descriptor','') }.first[:names].each do |n|
+ dname = "#{name}.#{n}"
+ description = DESCRIPTORS[dname]
+ udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false)
+ end
+ else
+ description = DESCRIPTORS[name]
+ udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true, :numeric => true, :nominal => false)
+ end
+ end
+ udesc
+ end
+
+ # Description of available descriptors
+ def self.description descriptor
+ lib = descriptor.split('_').first
+ case lib
+ when "Openbabel"
+ OBDESCRIPTORS[descriptor]
+ when "Cdk"
+ name = descriptor.split('_')[0..-2].join('_')
+ CDKDESCRIPTORS[name]
+ when "Joelib"
+ JOELIBDESCRIPTORS[descriptor]
+ when "lookup"
+ "Read feature values from a dataset"
+ end
+ end
+
+ def calculate compound
+ result = send library.downcase,descriptor,compound
+ p result
+ result[self.name]
+ end
+
+ def openbabel descriptor, compound
+ obdescriptor = OpenBabel::OBDescriptor.find_type descriptor
+ obmol = OpenBabel::OBMol.new
+ obconversion = OpenBabel::OBConversion.new
+ obconversion.set_in_format 'smi'
+ obconversion.read_string obmol, compound.smiles
+ {"#{library.capitalize}.#{descriptor}" => fix_value(obdescriptor.predict(obmol))}
+ end
+
+ def cdk descriptor, compound
+ java_descriptor "cdk", descriptor, compound
+ end
+
+ def joelib descriptor, compound
+ java_descriptor "joelib", descriptor, compound
+ end
+
+ private
+
+ def java_descriptor lib, descriptor, compound
+
+ sdf_3d = "/tmp/#{SecureRandom.uuid}.sdf"
+ File.open(sdf_3d,"w+"){|f| f.print compound.sdf}
+
+ # use java system call (rjb blocks within tasks)
+ # use Tempfiles to avoid "Argument list too long" error
+ case lib
+ when "cdk"
+ `java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf_3d} #{descriptor}`
+ when "joelib"
+ `java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf_3d} #{descriptor}`
+ end
+ result = YAML.load_file("#{sdf_3d}#{lib}.yaml").first
+ result.keys.each{|k| result[k] = result.delete(k)}
+ result
+ end
+
+ def fix_value val
+ val = val.first if val.is_a? Array and val.size == 1
+ val = nil if val == "NaN"
+ if val.numeric?
+ val = Float(val)
+ val = nil if val.nan? or val.infinite?
+ end
+ val
+ end
+
+ end
+
+end
diff --git a/lib/unique_descriptors.rb b/lib/unique_descriptors.rb
index cf9cbf3..03a9b08 100644
--- a/lib/unique_descriptors.rb
+++ b/lib/unique_descriptors.rb
@@ -12,7 +12,7 @@ UNIQUEDESCRIPTORS = [
"Openbabel.HBA1", #Number of Hydrogen Bond Acceptors 1 (JoelLib)
"Openbabel.HBA2", #Number of Hydrogen Bond Acceptors 2 (JoelLib)
"Openbabel.HBD", #Number of Hydrogen Bond Donors (JoelLib)
- #"Openbabel.L5", #Lipinski Rule of Five# TODO Openbabel.L5 returns nil, investigate!!!
+ #"Openbabe..L5", #Lipinski Rule of Five# TODO Openbabel.L5 returns nil, investigate!!!
"Openbabel.logP", #octanol/water partition coefficient
"Openbabel.MP", #Melting point
"Openbabel.MR", #molar refractivity
@@ -75,7 +75,7 @@ UNIQUEDESCRIPTORS = [
"Joelib.count.NumberOfP", #no description available
"Joelib.count.NumberOfO", #no description available
"Joelib.count.NumberOfN", #no description available
- #"Joelib.count.AromaticBonds", #no description available
+ #"Joeli#.count.AromaticBonds", #no description available
"Joelib.count.NumberOfI", #no description available
"Joelib.count.NumberOfF", #no description available
"Joelib.count.NumberOfC", #no description available
@@ -91,7 +91,7 @@ UNIQUEDESCRIPTORS = [
"Joelib.GeometricalShapeCoefficient", #no description available
#"Joelib.MolecularWeight", #no description available
"Joelib.FractionRotatableBonds", #no description available
- #"Joelib.count.HBD2", #no description available
+ #"Joeli..count.HBD2", #no description available
#"Joelib.count.HBD1", #no description available
"Joelib.LogP", #no description available
"Joelib.GraphShapeCoefficient", #no description available
@@ -116,5 +116,4 @@ UNIQUEDESCRIPTORS = [
"Joelib.count.SOGroups", #no description available
"Joelib.TopologicalDiameter", #no description available
"Joelib.count.NumberOfHal", #no description available
-
-].sort
+]