From c5cbcd9b617047c0933e465d4cb247618920ec6d Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 31 Jul 2015 10:59:35 +0200 Subject: descriptor tests working --- lib/descriptor.rb | 214 ++++++++++++++++++++++-------------------------------- 1 file changed, 87 insertions(+), 127 deletions(-) diff --git a/lib/descriptor.rb b/lib/descriptor.rb index 1b04ebf..1d43d7d 100644 --- a/lib/descriptor.rb +++ b/lib/descriptor.rb @@ -58,122 +58,128 @@ module OpenTox def self.smarts_match compounds, smarts, count=false bad_request_error "Compounds for smarts_match are empty" unless compounds bad_request_error "Smarts for smarts_match are empty" unless smarts - compounds = parse compounds + parse compounds obconversion = OpenBabel::OBConversion.new obmol = OpenBabel::OBMol.new obconversion.set_in_format('inchi') smarts_pattern = OpenBabel::OBSmartsPattern.new - smarts = [smarts] unless smarts.is_a? Array - fingerprint = Array.new(compounds.size){Array.new(smarts.size,false)} - compounds.each_with_index do |compound,c| + smarts.is_a?(String) ? @smarts = [smarts] : @smarts = smarts + @physchem_descriptors = nil + @data_entries = Array.new(@compounds.size){Array.new(@smarts.size,false)} + @compounds.each_with_index do |compound,c| obconversion.read_string(obmol,compound.inchi) - smarts.each_with_index do |smart,s| + @smarts.each_with_index do |smart,s| smarts_pattern.init(smart) if smarts_pattern.match(obmol) count ? value = smarts_pattern.get_map_list.to_a.size : value = 1 else value = 0 end - fingerprint[c][s] = value + @data_entries[c][s] = value end end - fingerprint + serialize end def self.smarts_count compounds, smarts smarts_match compounds,smarts,true end - def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS - compounds = parse compounds - dataset = OpenTox::CalculatedDataset.new - dataset.compounds = compounds + def self.serialize + case @input_class + when "OpenTox::Compound" + if @with_names and @physchem_descriptors + [@physchem_descriptors,@data_entries.first] + else + @data_entries.first + end + when "Array" + if @with_names and @physchem_descriptors + [@physchem_descriptors,@data_entries.first] + else + @data_entries + end + when "OpenTox::Dataset" + dataset = OpenTox::Dataset.new(:compound_ids => @compounds.collect{|c| c.id}) + if @smarts + dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id} + elsif @physchem_descriptors + dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id} + dataset.data_entries = @data_entries + end + dataset + end + end + + def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS, with_names=false + parse compounds + @data_entries = Array.new(@compounds.size){[]} + @descriptors = descriptors + @smarts = nil + @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features + @with_names = with_names des = {} - descriptors.each do |d| + @descriptors.each do |d| lib, descriptor = d.split(".",2) lib = lib.downcase.to_sym des[lib] ||= [] des[lib] << descriptor end - result = {} - features = [] - data_entries = Array.new(compounds.size){Array.new(des.size)} - n = 0 des.each do |lib,descriptors| - features += descriptors.collect do |d| - OpenTox::Feature.find_or_create_by( - :title => "#{lib}.#{d}", - :creator => __FILE__ - ) - end - r = send(lib, compounds, descriptors) - #p r - r.each_with_index do |values,i| - data_entries[i][n] = values - end - n += 1 + send(lib, descriptors) end - #dataset.features = features - #dataset.data_entries = data_entries - #dataset - data_entries + serialize end - def self.openbabel compounds, descriptors - compounds = parse compounds - $logger.debug "compute #{descriptors.size} openbabel descriptors for #{compounds.size} compounds" + def self.openbabel descriptors + $logger.debug "compute #{descriptors.size} openbabel descriptors for #{@compounds.size} compounds" obdescriptors = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d} obmol = OpenBabel::OBMol.new obconversion = OpenBabel::OBConversion.new obconversion.set_in_format 'inchi' - fingerprint = Array.new(compounds.size){Array.new(obdescriptors.size)} - compounds.each_with_index do |compound,c| + last_feature_idx = @physchem_descriptors.size + @compounds.each_with_index do |compound,c| obconversion.read_string obmol, compound.inchi obdescriptors.each_with_index do |descriptor,d| - fingerprint[c][d] = fix_value(descriptor.predict(obmol)) + @data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol)) end end - fingerprint + @physchem_descriptors += descriptors.collect{|d| "Openbabel.#{d}"} end - def self.cdk compounds, descriptors - compounds = parse compounds - $logger.debug "compute #{descriptors.size} cdk descriptors for #{compounds.size} compounds" - sdf = sdf_3d compounds + def self.java_descriptors descriptors, lib + $logger.debug "compute #{descriptors.size} cdk descriptors for #{@compounds.size} compounds" + sdf = sdf_3d # use java system call (rjb blocks within tasks) # use Tempfiles to avoid "Argument list too long" error - run_cmd "java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}" - fingerprint = {} - YAML.load_file(sdf+"cdk.yaml").each_with_index do |calculation,i| + case lib + when "cdk" + run_cmd "java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}" + when "joelib" + run_cmd "java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}" + end + last_feature_idx = @physchem_descriptors.size + YAML.load_file("#{sdf}#{lib}.yaml").each_with_index do |calculation,i| $logger.error "Descriptor calculation failed for compound #{compounds[i].inchi}." if calculation.empty? - descriptors.each do |descriptor| - fingerprint[compounds[i]] = calculation + # CDK Descriptors may calculate multiple values, they are stored in separate features + @physchem_descriptors += calculation.keys if i == 0 + calculation.keys.each_with_index do |name,j| + @data_entries[i][j+last_feature_idx] = fix_value(calculation[name]) end end - FileUtils.rm sdf+"cdk.yaml" - fingerprint + FileUtils.rm "#{sdf}#{lib}.yaml" end - def self.joelib compounds, descriptors - compounds = parse compounds - $logger.debug "compute #{descriptors.size} joelib descriptors for #{compounds.size} compounds" - # use java system call (rjb blocks within tasks) - # use Tempfiles to avoid "Argument list too long" error - sdf = sdf_3d compounds - run_cmd "java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}" - fingerprint = {} - YAML.load_file(sdf+"joelib.yaml").each_with_index do |calculation,i| - $logger.error "Descriptor calculation failed for compound #{compounds[i].inchi}." if calculation.empty? - descriptors.each do |descriptor| - fingerprint[compounds[i]] = calculation - end - end - FileUtils.rm sdf+"joelib.yaml" - fingerprint + def self.cdk descriptors + java_descriptors descriptors, "cdk" + end + + def self.joelib descriptors + java_descriptors descriptors, "joelib" end def self.lookup compounds, features, dataset - compounds = parse compounds + parse compounds fingerprint = [] compounds.each do |compound| fingerprint << [] @@ -194,71 +200,26 @@ module OpenTox end end - def self.sdf_3d compounds - compounds = parse compounds - obconversion = OpenBabel::OBConversion.new - obmol = OpenBabel::OBMol.new - obconversion.set_in_format 'inchi' - obconversion.set_out_format 'sdf' - - digest = Digest::MD5.hexdigest compounds.collect{|c| c.inchi}.inspect - sdf_file = "/tmp/#{digest}.sdf" - if File.exists? sdf_file # do not recreate existing 3d sdfs - $logger.debug "re-using cached 3d structures from #{sdf_file}" - else - tmp_file = Tempfile.new('sdf') - # create 3d sdf file (faster in Openbabel than in CDK) - # MG: moreover, CDK 3d generation is faulty - # MG: WARNING: Openbabel 3d generation is not deterministic - # MG: WARNING: Openbabel 3D generation does not work for mixtures - c = 0 - compounds.each do |compound| - c += 1 - cmp_file = File.join(BABEL_3D_CACHE_DIR,Digest::MD5.hexdigest(compound.inchi)+".sdf") - cmp_sdf = nil - if File.exists? cmp_file - $logger.debug "read cached 3d structure for compound #{c}/#{compounds.size}" - cmp_sdf = File.read(cmp_file) - else - $logger.debug "compute 3d structure for compound #{c}/#{compounds.size}" - obconversion.read_string obmol, compound.inchi - sdf_2d = obconversion.write_string(obmol) - error = nil - if compound.inchi.include?(";") # component includes multiple compounds (; in inchi, . in smiles) - error = "OpenBabel 3D generation failes for multi-compound #{compound.inchi}, trying to calculate descriptors from 2D structure." - else - OpenBabel::OBOp.find_type("Gen3D").do(obmol) - sdf_3d = obconversion.write_string(obmol) - error = "3D generation failed for compound #{compound.inchi}, trying to calculate descriptors from 2D structure." if sdf_3d.match(/.nan/) - end - if error - $logger.warn error - # TODO - # @feature_dataset[RDF::OT.Warnings] ? @feature_dataset[RDF::OT.Warnings] << error : @feature_dataset[RDF::OT.Warnings] = error - cmp_sdf = sdf_2d - else - cmp_sdf = sdf_3d - File.open(cmp_file,"w") do |f| - f.write(cmp_sdf) - end - end - end - tmp_file.write cmp_sdf - end - tmp_file.close - File.rename(tmp_file, sdf_file) + def self.sdf_3d + # TODO check if 3d sdfs are stored in GridFS + sdf = "" + @compounds.each do |compound| + sdf << compound.sdf end + sdf_file = "/tmp/#{SecureRandom.uuid}.sdf" + File.open(sdf_file,"w+"){|f| f.print sdf} sdf_file end def self.parse compounds - case compounds.class.to_s + @input_class = compounds.class.to_s + case @input_class when "OpenTox::Compound" - compounds = [compounds] + @compounds = [compounds] when "Array" - compounds + @compounds = compounds when "OpenTox::Dataset" - compounds = compounds.compounds + @compounds = compounds.compounds else bad_request_error "Cannot calculate descriptors for #{compounds.class} objects." end @@ -266,15 +227,14 @@ module OpenTox def self.fix_value val val = val.first if val.is_a? Array and val.size == 1 + val = nil if val == "NaN" if val.numeric? val = Float(val) val = nil if val.nan? or val.infinite? - else - val = nil if val == "NaN" end val end - private_class_method :sdf_3d, :fix_value, :parse, :run_cmd + private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize end end end -- cgit v1.2.3