From ca2bb0f90335b1f2c4ecc28ee423e85b281ffcf0 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 4 Nov 2015 17:50:17 +0100 Subject: neighbor search delegated to database backend --- lib/descriptor.rb | 1 - 1 file changed, 1 deletion(-) (limited to 'lib/descriptor.rb') diff --git a/lib/descriptor.rb b/lib/descriptor.rb index 9733bde..93ce591 100644 --- a/lib/descriptor.rb +++ b/lib/descriptor.rb @@ -107,7 +107,6 @@ module OpenTox des[lib] << descriptor end des.each do |lib,descriptors| - p lib, descriptors send(lib, descriptors) end serialize -- cgit v1.2.3 From 8c973e16028cb95c978bb08cf79369a5c3520c31 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sun, 28 Feb 2016 12:43:38 +0100 Subject: physchem feature class --- lib/descriptor.rb | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) (limited to 'lib/descriptor.rb') diff --git a/lib/descriptor.rb b/lib/descriptor.rb index 93ce591..d6b2e85 100644 --- a/lib/descriptor.rb +++ b/lib/descriptor.rb @@ -4,10 +4,10 @@ ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk" module OpenTox - module Algorithm + #module Algorithm # Class for descriptor calculations - class Descriptor + module Descriptor include OpenTox JAVA_DIR = File.join(File.dirname(__FILE__),"..","java") @@ -19,20 +19,19 @@ module OpenTox obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"] OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| name,description = d.split(/\s+/,2) - ["Openbabel."+name,description] unless obexclude.include? name + ["Openbabel_"+name,description] unless obexclude.include? name end.compact.sort{|a,b| a[0] <=> b[0]}] cdk_desc = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`) - CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| ["Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}] - CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''); d[:names].collect{ |name| prefix+"."+name } }.flatten + CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| ["Cdk_"+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}] + CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix="Cdk_"+d[:java_class].split('.').last.sub(/Descriptor/,''); d[:names].collect{ |name| prefix+"_"+name } }.flatten # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug) joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"] # strip Joelib messages from stdout JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d| - name = d[:java_class].sub(/^joelib2.feature.types./,'') - # impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java - ["Joelib."+name, "no description available"] unless joelibexclude.include? name + name = d[:java_class].sub(/^joelib2.feature.types./,'').gsub(/\./,"_") + ["Joelib_"+name, "impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java"] unless joelibexclude.include? name end.compact.sort{|a,b| a[0] <=> b[0]}] DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS)) @@ -42,12 +41,12 @@ module OpenTox # Description of available descriptors def self.description descriptor - lib = descriptor.split('.').first + lib = descriptor.split('_').first case lib when "Openbabel" OBDESCRIPTORS[descriptor] when "Cdk" - name = descriptor.split('.')[0..-2].join('.') + name = descriptor.split('_')[0..-2].join('_') CDKDESCRIPTORS[name] when "Joelib" JOELIBDESCRIPTORS[descriptor] @@ -101,7 +100,7 @@ module OpenTox @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features des = {} @descriptors.each do |d| - lib, descriptor = d.split(".",2) + lib, descriptor = d.split("_",2) lib = lib.downcase.to_sym des[lib] ||= [] des[lib] << descriptor @@ -125,7 +124,7 @@ module OpenTox @data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol)) end end - @physchem_descriptors += descriptors.collect{|d| "Openbabel.#{d}"} + @physchem_descriptors += descriptors.collect{|d| "Openbabel_#{d}"} end def self.java_descriptors descriptors, lib @@ -208,10 +207,16 @@ module OpenTox end def self.serialize - @data_entries.collect!{|de| de.collect{|v| v.round(5) unless v.nil?}} + #@data_entries.collect!{|de| de.collect{|v| v.round(5) unless v.nil?}} case @input_class + # TODO beautify and fix for other objects when "OpenTox::Compound" - @data_entries.first + r = {} + @data_entries.first.each_with_index do |d,i| + # TODO fix @ source + r[@physchem_descriptors[i].gsub(/\./,'_')] = d + end + r when "Array" @data_entries when "OpenTox::Dataset" @@ -243,5 +248,5 @@ module OpenTox end private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize end - end + #end end -- cgit v1.2.3 From c4b56b22fd6e65633deb7e52bd99865e3bee8f00 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 29 Feb 2016 13:02:37 +0100 Subject: crossvalidation folds fixed for duplicates --- lib/descriptor.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/descriptor.rb') diff --git a/lib/descriptor.rb b/lib/descriptor.rb index d6b2e85..14a123b 100644 --- a/lib/descriptor.rb +++ b/lib/descriptor.rb @@ -232,7 +232,7 @@ module OpenTox dataset.feature_calculation_algorithm = "#{self}.physchem" #TODO params? end - dataset.save_all + dataset.save dataset end end -- cgit v1.2.3 From 0c5d2e678908a2d4aea43efbedbedc2c0439be30 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 14 Mar 2016 15:25:50 +0100 Subject: descriptor tests --- lib/descriptor.rb | 252 ------------------------------------------------------ 1 file changed, 252 deletions(-) delete mode 100644 lib/descriptor.rb (limited to 'lib/descriptor.rb') diff --git a/lib/descriptor.rb b/lib/descriptor.rb deleted file mode 100644 index 14a123b..0000000 --- a/lib/descriptor.rb +++ /dev/null @@ -1,252 +0,0 @@ -require 'digest/md5' -ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk" -# TODO store descriptors in mongodb - -module OpenTox - - #module Algorithm - - # Class for descriptor calculations - module Descriptor - include OpenTox - - JAVA_DIR = File.join(File.dirname(__FILE__),"..","java") - CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last - JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar") - LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar") - JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar") - - obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"] - OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| - name,description = d.split(/\s+/,2) - ["Openbabel_"+name,description] unless obexclude.include? name - end.compact.sort{|a,b| a[0] <=> b[0]}] - - cdk_desc = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`) - CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| ["Cdk_"+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}] - CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix="Cdk_"+d[:java_class].split('.').last.sub(/Descriptor/,''); d[:names].collect{ |name| prefix+"_"+name } }.flatten - - # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug) - joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"] - # strip Joelib messages from stdout - JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d| - name = d[:java_class].sub(/^joelib2.feature.types./,'').gsub(/\./,"_") - ["Joelib_"+name, "impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java"] unless joelibexclude.include? name - end.compact.sort{|a,b| a[0] <=> b[0]}] - - DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS)) - DESCRIPTOR_VALUES = OBDESCRIPTORS.keys + CDKDESCRIPTOR_VALUES + JOELIBDESCRIPTORS.keys - - require_relative "unique_descriptors.rb" - - # Description of available descriptors - def self.description descriptor - lib = descriptor.split('_').first - case lib - when "Openbabel" - OBDESCRIPTORS[descriptor] - when "Cdk" - name = descriptor.split('_')[0..-2].join('_') - CDKDESCRIPTORS[name] - when "Joelib" - JOELIBDESCRIPTORS[descriptor] - when "lookup" - "Read feature values from a dataset" - end - end - - # Match an array of smarts features - def self.smarts_match compounds, smarts_features, count=false - bad_request_error "Compounds for smarts_match are empty" unless compounds - bad_request_error "Smarts features for smarts_match are empty" unless smarts_features - parse compounds - @count = count - obconversion = OpenBabel::OBConversion.new - obmol = OpenBabel::OBMol.new - obconversion.set_in_format('smi') - smarts_pattern = OpenBabel::OBSmartsPattern.new - smarts_features = [smarts_features] if smarts_features.is_a?(Feature) - @smarts = smarts_features.collect{|f| f.smarts} - @physchem_descriptors = nil - @data_entries = Array.new(@compounds.size){Array.new(@smarts.size,false)} - @compounds.each_with_index do |compound,c| - obconversion.read_string(obmol,compound.smiles) - @smarts.each_with_index do |smart,s| - smarts_pattern.init(smart) - if smarts_pattern.match(obmol) - count ? value = smarts_pattern.get_map_list.to_a.size : value = 1 - else - value = 0 - end - @data_entries[c][s] = value - end - end - serialize - end - - # Count matches of an array with smarts features - def self.smarts_count compounds, smarts - # TODO: non-overlapping matches? - smarts_match compounds,smarts,true - end - - # Calculate physchem descriptors - # @param [OpenTox::Compound,Array,OpenTox::Dataset] input object, either a compound, an array of compounds or a dataset - def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS - parse compounds - @data_entries = Array.new(@compounds.size){[]} - @descriptors = descriptors - @smarts = nil - @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features - des = {} - @descriptors.each do |d| - lib, descriptor = d.split("_",2) - lib = lib.downcase.to_sym - des[lib] ||= [] - des[lib] << descriptor - end - des.each do |lib,descriptors| - send(lib, descriptors) - end - serialize - end - - def self.openbabel descriptors - $logger.debug "compute #{descriptors.size} openbabel descriptors for #{@compounds.size} compounds" - obdescriptors = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d} - obmol = OpenBabel::OBMol.new - obconversion = OpenBabel::OBConversion.new - obconversion.set_in_format 'smi' - last_feature_idx = @physchem_descriptors.size - @compounds.each_with_index do |compound,c| - obconversion.read_string obmol, compound.smiles - obdescriptors.each_with_index do |descriptor,d| - @data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol)) - end - end - @physchem_descriptors += descriptors.collect{|d| "Openbabel_#{d}"} - end - - def self.java_descriptors descriptors, lib - $logger.debug "compute #{descriptors.size} cdk descriptors for #{@compounds.size} compounds" - sdf = sdf_3d - # use java system call (rjb blocks within tasks) - # use Tempfiles to avoid "Argument list too long" error - case lib - when "cdk" - run_cmd "java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}" - when "joelib" - run_cmd "java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}" - end - last_feature_idx = @physchem_descriptors.size - YAML.load_file("#{sdf}#{lib}.yaml").each_with_index do |calculation,i| - # TODO create warnings - #$logger.error "Descriptor calculation failed for compound #{@compounds[i].inchi}." if calculation.empty? - # CDK Descriptors may calculate multiple values, they are stored in separate features - @physchem_descriptors += calculation.keys if i == 0 - calculation.keys.each_with_index do |name,j| - @data_entries[i][j+last_feature_idx] = fix_value(calculation[name]) - end - end - FileUtils.rm "#{sdf}#{lib}.yaml" - end - - def self.cdk descriptors - java_descriptors descriptors, "cdk" - end - - def self.joelib descriptors - java_descriptors descriptors, "joelib" - end - - def self.lookup compounds, features, dataset - parse compounds - fingerprint = [] - compounds.each do |compound| - fingerprint << [] - features.each do |feature| - end - end - end - - def self.run_cmd cmd - cmd = "#{cmd} 2>&1" - $logger.debug "running external cmd: '#{cmd}'" - p = IO.popen(cmd) do |io| - while line = io.gets - $logger.debug "> #{line.chomp}" - end - io.close - raise "external cmd failed '#{cmd}' (see log file for error msg)" unless $?.to_i == 0 - end - end - - def self.sdf_3d - # TODO check if 3d sdfs are stored in GridFS - sdf = "" - @compounds.each do |compound| - sdf << compound.sdf - end - sdf_file = "/tmp/#{SecureRandom.uuid}.sdf" - File.open(sdf_file,"w+"){|f| f.print sdf} - sdf_file - end - - def self.parse compounds - @input_class = compounds.class.to_s - case @input_class - when "OpenTox::Compound" - @compounds = [compounds] - when "Array" - @compounds = compounds - when "OpenTox::Dataset" - @compounds = compounds.compounds - else - bad_request_error "Cannot calculate descriptors for #{compounds.class} objects." - end - end - - def self.serialize - #@data_entries.collect!{|de| de.collect{|v| v.round(5) unless v.nil?}} - case @input_class - # TODO beautify and fix for other objects - when "OpenTox::Compound" - r = {} - @data_entries.first.each_with_index do |d,i| - # TODO fix @ source - r[@physchem_descriptors[i].gsub(/\./,'_')] = d - end - r - when "Array" - @data_entries - when "OpenTox::Dataset" - dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect{|c| c.id}) - if @smarts - dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id} - @count ? algo = "count" : algo = "match" - dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}" - - elsif @physchem_descriptors - dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id} - dataset.data_entries = @data_entries - dataset.feature_calculation_algorithm = "#{self}.physchem" - #TODO params? - end - dataset.save - dataset - end - end - - def self.fix_value val - val = val.first if val.is_a? Array and val.size == 1 - val = nil if val == "NaN" - if val.numeric? - val = Float(val) - val = nil if val.nan? or val.infinite? - end - val - end - private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize - end - #end -end -- cgit v1.2.3