From 23ecfc6fa5ae4913e5cd17b7d58432d1f88d780c Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 10 Aug 2015 09:48:57 +0200 Subject: transfer to new git project started --- lib/descriptor.rb | 253 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 253 insertions(+) create mode 100644 lib/descriptor.rb (limited to 'lib/descriptor.rb') diff --git a/lib/descriptor.rb b/lib/descriptor.rb new file mode 100644 index 0000000..68bc7a2 --- /dev/null +++ b/lib/descriptor.rb @@ -0,0 +1,253 @@ +require 'digest/md5' +ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk" +BABEL_3D_CACHE_DIR = File.join(File.dirname(__FILE__),"..",'/babel_3d_cache') +# TODO store descriptors in mongodb + +module OpenTox + + module Algorithm + class Descriptor + include OpenTox + + JAVA_DIR = File.join(File.dirname(__FILE__),"..","java") + CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last + JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar") + LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar") + JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar") + + obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title"] + OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| + name,description = d.split(/\s+/,2) + ["Openbabel."+name,description] unless obexclude.include? name + end.compact.sort{|a,b| a[0] <=> b[0]}] + + cdk_desc = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`) + CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| ["Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}] + CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''); d[:names].collect{ |name| prefix+"."+name } }.flatten + + # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug) + joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"] + # strip Joelib messages from stdout + JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d| + name = d[:java_class].sub(/^joelib2.feature.types./,'') + # impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java + ["Joelib."+name, "no description available"] unless joelibexclude.include? name + end.compact.sort{|a,b| a[0] <=> b[0]}] + + DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS)) + DESCRIPTOR_VALUES = OBDESCRIPTORS.keys + CDKDESCRIPTOR_VALUES + JOELIBDESCRIPTORS.keys + + require_relative "unique_descriptors.rb" + + def self.description descriptor + lib = descriptor.split('.').first + case lib + when "Openbabel" + OBDESCRIPTORS[descriptor] + when "Cdk" + name = descriptor.split('.')[0..-2].join('.') + CDKDESCRIPTORS[name] + when "Joelib" + JOELIBDESCRIPTORS[descriptor] + when "lookup" + "Read feature values from a dataset" + end + end + + def self.smarts_match compounds, smarts_features, count=false + bad_request_error "Compounds for smarts_match are empty" unless compounds + bad_request_error "Smarts features for smarts_match are empty" unless smarts_features + parse compounds + @count = count + obconversion = OpenBabel::OBConversion.new + obmol = OpenBabel::OBMol.new + obconversion.set_in_format('inchi') + smarts_pattern = OpenBabel::OBSmartsPattern.new + smarts_features = [smarts_features] if smarts_features.is_a?(Feature) + @smarts = smarts_features.collect{|f| f.smarts} + @physchem_descriptors = nil + @data_entries = Array.new(@compounds.size){Array.new(@smarts.size,false)} + @compounds.each_with_index do |compound,c| + # TODO OpenBabel may segfault here + # catch inchi errors in compound.rb + # eg. at line 249 of rat_feature_dataset + # which worked with opentox-client + # (but no smarts_match) + p "'#{compound.inchi}'" + obconversion.read_string(obmol,compound.inchi) + @smarts.each_with_index do |smart,s| + smarts_pattern.init(smart) + if smarts_pattern.match(obmol) + count ? value = smarts_pattern.get_map_list.to_a.size : value = 1 + else + value = 0 + end + @data_entries[c][s] = value + end + end + serialize + end + + def self.smarts_count compounds, smarts + smarts_match compounds,smarts,true + end + + def self.serialize + case @input_class + when "OpenTox::Compound" + if @with_names and @physchem_descriptors + [@physchem_descriptors,@data_entries.first] + else + @data_entries.first + end + when "Array" + if @with_names and @physchem_descriptors + [@physchem_descriptors,@data_entries.first] + else + @data_entries + end + when "OpenTox::Dataset" + dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect{|c| c.id}) + if @smarts + dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id} + @count ? algo = "count" : algo = "match" + dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}" + + elsif @physchem_descriptors + dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id} + dataset.data_entries = @data_entries + dataset.feature_calculation_algorithm = "#{self}.physchem" + #TODO params? + end + dataset.save_all + dataset + end + end + + def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS, with_names=false + parse compounds + @data_entries = Array.new(@compounds.size){[]} + @descriptors = descriptors + @smarts = nil + @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features + @with_names = with_names + des = {} + @descriptors.each do |d| + lib, descriptor = d.split(".",2) + lib = lib.downcase.to_sym + des[lib] ||= [] + des[lib] << descriptor + end + des.each do |lib,descriptors| + send(lib, descriptors) + end + serialize + end + + def self.openbabel descriptors + $logger.debug "compute #{descriptors.size} openbabel descriptors for #{@compounds.size} compounds" + obdescriptors = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d} + obmol = OpenBabel::OBMol.new + obconversion = OpenBabel::OBConversion.new + obconversion.set_in_format 'inchi' + last_feature_idx = @physchem_descriptors.size + @compounds.each_with_index do |compound,c| + obconversion.read_string obmol, compound.inchi + obdescriptors.each_with_index do |descriptor,d| + @data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol)) + end + end + @physchem_descriptors += descriptors.collect{|d| "Openbabel.#{d}"} + end + + def self.java_descriptors descriptors, lib + $logger.debug "compute #{descriptors.size} cdk descriptors for #{@compounds.size} compounds" + sdf = sdf_3d + # use java system call (rjb blocks within tasks) + # use Tempfiles to avoid "Argument list too long" error + case lib + when "cdk" + run_cmd "java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}" + when "joelib" + run_cmd "java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}" + end + last_feature_idx = @physchem_descriptors.size + YAML.load_file("#{sdf}#{lib}.yaml").each_with_index do |calculation,i| + $logger.error "Descriptor calculation failed for compound #{compounds[i].inchi}." if calculation.empty? + # CDK Descriptors may calculate multiple values, they are stored in separate features + @physchem_descriptors += calculation.keys if i == 0 + calculation.keys.each_with_index do |name,j| + @data_entries[i][j+last_feature_idx] = fix_value(calculation[name]) + end + end + FileUtils.rm "#{sdf}#{lib}.yaml" + end + + def self.cdk descriptors + java_descriptors descriptors, "cdk" + end + + def self.joelib descriptors + java_descriptors descriptors, "joelib" + end + + def self.lookup compounds, features, dataset + parse compounds + fingerprint = [] + compounds.each do |compound| + fingerprint << [] + features.each do |feature| + end + end + end + + def self.run_cmd cmd + cmd = "#{cmd} 2>&1" + $logger.debug "running external cmd: '#{cmd}'" + p = IO.popen(cmd) do |io| + while line = io.gets + $logger.debug "> #{line.chomp}" + end + io.close + raise "external cmd failed '#{cmd}' (see log file for error msg)" unless $?.to_i == 0 + end + end + + def self.sdf_3d + # TODO check if 3d sdfs are stored in GridFS + sdf = "" + @compounds.each do |compound| + sdf << compound.sdf + end + sdf_file = "/tmp/#{SecureRandom.uuid}.sdf" + File.open(sdf_file,"w+"){|f| f.print sdf} + sdf_file + end + + def self.parse compounds + @input_class = compounds.class.to_s + case @input_class + when "OpenTox::Compound" + @compounds = [compounds] + when "Array" + @compounds = compounds + when "OpenTox::Dataset" + @compounds = compounds.compounds + else + bad_request_error "Cannot calculate descriptors for #{compounds.class} objects." + end + end + + def self.fix_value val + val = val.first if val.is_a? Array and val.size == 1 + val = nil if val == "NaN" + if val.numeric? + val = Float(val) + val = nil if val.nan? or val.infinite? + end + val + end + private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize + end + end +end -- cgit v1.2.3