summaryrefslogtreecommitdiff
path: root/lib/descriptor.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/descriptor.rb')
-rw-r--r--lib/descriptor.rb253
1 files changed, 253 insertions, 0 deletions
diff --git a/lib/descriptor.rb b/lib/descriptor.rb
new file mode 100644
index 0000000..68bc7a2
--- /dev/null
+++ b/lib/descriptor.rb
@@ -0,0 +1,253 @@
+require 'digest/md5'
+ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk"
+BABEL_3D_CACHE_DIR = File.join(File.dirname(__FILE__),"..",'/babel_3d_cache')
+# TODO store descriptors in mongodb
+
+module OpenTox
+
+ module Algorithm
+ class Descriptor
+ include OpenTox
+
+ JAVA_DIR = File.join(File.dirname(__FILE__),"..","java")
+ CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last
+ JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar")
+ LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar")
+ JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar")
+
+ obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title"]
+ OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
+ name,description = d.split(/\s+/,2)
+ ["Openbabel."+name,description] unless obexclude.include? name
+ end.compact.sort{|a,b| a[0] <=> b[0]}]
+
+ cdk_desc = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`)
+ CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| ["Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}]
+ CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''); d[:names].collect{ |name| prefix+"."+name } }.flatten
+
+ # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
+ joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"]
+ # strip Joelib messages from stdout
+ JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
+ name = d[:java_class].sub(/^joelib2.feature.types./,'')
+ # impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java
+ ["Joelib."+name, "no description available"] unless joelibexclude.include? name
+ end.compact.sort{|a,b| a[0] <=> b[0]}]
+
+ DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS))
+ DESCRIPTOR_VALUES = OBDESCRIPTORS.keys + CDKDESCRIPTOR_VALUES + JOELIBDESCRIPTORS.keys
+
+ require_relative "unique_descriptors.rb"
+
+ def self.description descriptor
+ lib = descriptor.split('.').first
+ case lib
+ when "Openbabel"
+ OBDESCRIPTORS[descriptor]
+ when "Cdk"
+ name = descriptor.split('.')[0..-2].join('.')
+ CDKDESCRIPTORS[name]
+ when "Joelib"
+ JOELIBDESCRIPTORS[descriptor]
+ when "lookup"
+ "Read feature values from a dataset"
+ end
+ end
+
+ def self.smarts_match compounds, smarts_features, count=false
+ bad_request_error "Compounds for smarts_match are empty" unless compounds
+ bad_request_error "Smarts features for smarts_match are empty" unless smarts_features
+ parse compounds
+ @count = count
+ obconversion = OpenBabel::OBConversion.new
+ obmol = OpenBabel::OBMol.new
+ obconversion.set_in_format('inchi')
+ smarts_pattern = OpenBabel::OBSmartsPattern.new
+ smarts_features = [smarts_features] if smarts_features.is_a?(Feature)
+ @smarts = smarts_features.collect{|f| f.smarts}
+ @physchem_descriptors = nil
+ @data_entries = Array.new(@compounds.size){Array.new(@smarts.size,false)}
+ @compounds.each_with_index do |compound,c|
+ # TODO OpenBabel may segfault here
+ # catch inchi errors in compound.rb
+ # eg. at line 249 of rat_feature_dataset
+ # which worked with opentox-client
+ # (but no smarts_match)
+ p "'#{compound.inchi}'"
+ obconversion.read_string(obmol,compound.inchi)
+ @smarts.each_with_index do |smart,s|
+ smarts_pattern.init(smart)
+ if smarts_pattern.match(obmol)
+ count ? value = smarts_pattern.get_map_list.to_a.size : value = 1
+ else
+ value = 0
+ end
+ @data_entries[c][s] = value
+ end
+ end
+ serialize
+ end
+
+ def self.smarts_count compounds, smarts
+ smarts_match compounds,smarts,true
+ end
+
+ def self.serialize
+ case @input_class
+ when "OpenTox::Compound"
+ if @with_names and @physchem_descriptors
+ [@physchem_descriptors,@data_entries.first]
+ else
+ @data_entries.first
+ end
+ when "Array"
+ if @with_names and @physchem_descriptors
+ [@physchem_descriptors,@data_entries.first]
+ else
+ @data_entries
+ end
+ when "OpenTox::Dataset"
+ dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect{|c| c.id})
+ if @smarts
+ dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id}
+ @count ? algo = "count" : algo = "match"
+ dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}"
+
+ elsif @physchem_descriptors
+ dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id}
+ dataset.data_entries = @data_entries
+ dataset.feature_calculation_algorithm = "#{self}.physchem"
+ #TODO params?
+ end
+ dataset.save_all
+ dataset
+ end
+ end
+
+ def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS, with_names=false
+ parse compounds
+ @data_entries = Array.new(@compounds.size){[]}
+ @descriptors = descriptors
+ @smarts = nil
+ @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features
+ @with_names = with_names
+ des = {}
+ @descriptors.each do |d|
+ lib, descriptor = d.split(".",2)
+ lib = lib.downcase.to_sym
+ des[lib] ||= []
+ des[lib] << descriptor
+ end
+ des.each do |lib,descriptors|
+ send(lib, descriptors)
+ end
+ serialize
+ end
+
+ def self.openbabel descriptors
+ $logger.debug "compute #{descriptors.size} openbabel descriptors for #{@compounds.size} compounds"
+ obdescriptors = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d}
+ obmol = OpenBabel::OBMol.new
+ obconversion = OpenBabel::OBConversion.new
+ obconversion.set_in_format 'inchi'
+ last_feature_idx = @physchem_descriptors.size
+ @compounds.each_with_index do |compound,c|
+ obconversion.read_string obmol, compound.inchi
+ obdescriptors.each_with_index do |descriptor,d|
+ @data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol))
+ end
+ end
+ @physchem_descriptors += descriptors.collect{|d| "Openbabel.#{d}"}
+ end
+
+ def self.java_descriptors descriptors, lib
+ $logger.debug "compute #{descriptors.size} cdk descriptors for #{@compounds.size} compounds"
+ sdf = sdf_3d
+ # use java system call (rjb blocks within tasks)
+ # use Tempfiles to avoid "Argument list too long" error
+ case lib
+ when "cdk"
+ run_cmd "java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}"
+ when "joelib"
+ run_cmd "java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}"
+ end
+ last_feature_idx = @physchem_descriptors.size
+ YAML.load_file("#{sdf}#{lib}.yaml").each_with_index do |calculation,i|
+ $logger.error "Descriptor calculation failed for compound #{compounds[i].inchi}." if calculation.empty?
+ # CDK Descriptors may calculate multiple values, they are stored in separate features
+ @physchem_descriptors += calculation.keys if i == 0
+ calculation.keys.each_with_index do |name,j|
+ @data_entries[i][j+last_feature_idx] = fix_value(calculation[name])
+ end
+ end
+ FileUtils.rm "#{sdf}#{lib}.yaml"
+ end
+
+ def self.cdk descriptors
+ java_descriptors descriptors, "cdk"
+ end
+
+ def self.joelib descriptors
+ java_descriptors descriptors, "joelib"
+ end
+
+ def self.lookup compounds, features, dataset
+ parse compounds
+ fingerprint = []
+ compounds.each do |compound|
+ fingerprint << []
+ features.each do |feature|
+ end
+ end
+ end
+
+ def self.run_cmd cmd
+ cmd = "#{cmd} 2>&1"
+ $logger.debug "running external cmd: '#{cmd}'"
+ p = IO.popen(cmd) do |io|
+ while line = io.gets
+ $logger.debug "> #{line.chomp}"
+ end
+ io.close
+ raise "external cmd failed '#{cmd}' (see log file for error msg)" unless $?.to_i == 0
+ end
+ end
+
+ def self.sdf_3d
+ # TODO check if 3d sdfs are stored in GridFS
+ sdf = ""
+ @compounds.each do |compound|
+ sdf << compound.sdf
+ end
+ sdf_file = "/tmp/#{SecureRandom.uuid}.sdf"
+ File.open(sdf_file,"w+"){|f| f.print sdf}
+ sdf_file
+ end
+
+ def self.parse compounds
+ @input_class = compounds.class.to_s
+ case @input_class
+ when "OpenTox::Compound"
+ @compounds = [compounds]
+ when "Array"
+ @compounds = compounds
+ when "OpenTox::Dataset"
+ @compounds = compounds.compounds
+ else
+ bad_request_error "Cannot calculate descriptors for #{compounds.class} objects."
+ end
+ end
+
+ def self.fix_value val
+ val = val.first if val.is_a? Array and val.size == 1
+ val = nil if val == "NaN"
+ if val.numeric?
+ val = Float(val)
+ val = nil if val.nan? or val.infinite?
+ end
+ val
+ end
+ private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize
+ end
+ end
+end