summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2015-07-31 10:59:35 +0200
committerChristoph Helma <helma@in-silico.ch>2015-07-31 10:59:35 +0200
commitc5cbcd9b617047c0933e465d4cb247618920ec6d (patch)
treecbe8e9f7074885a169f5db4b91f39d676f8ccd4b
parente55ed6f22e07ff36e74a8a5838ddb7e51524c89c (diff)
descriptor tests working
-rw-r--r--lib/descriptor.rb214
1 files changed, 87 insertions, 127 deletions
diff --git a/lib/descriptor.rb b/lib/descriptor.rb
index 1b04ebf..1d43d7d 100644
--- a/lib/descriptor.rb
+++ b/lib/descriptor.rb
@@ -58,122 +58,128 @@ module OpenTox
def self.smarts_match compounds, smarts, count=false
bad_request_error "Compounds for smarts_match are empty" unless compounds
bad_request_error "Smarts for smarts_match are empty" unless smarts
- compounds = parse compounds
+ parse compounds
obconversion = OpenBabel::OBConversion.new
obmol = OpenBabel::OBMol.new
obconversion.set_in_format('inchi')
smarts_pattern = OpenBabel::OBSmartsPattern.new
- smarts = [smarts] unless smarts.is_a? Array
- fingerprint = Array.new(compounds.size){Array.new(smarts.size,false)}
- compounds.each_with_index do |compound,c|
+ smarts.is_a?(String) ? @smarts = [smarts] : @smarts = smarts
+ @physchem_descriptors = nil
+ @data_entries = Array.new(@compounds.size){Array.new(@smarts.size,false)}
+ @compounds.each_with_index do |compound,c|
obconversion.read_string(obmol,compound.inchi)
- smarts.each_with_index do |smart,s|
+ @smarts.each_with_index do |smart,s|
smarts_pattern.init(smart)
if smarts_pattern.match(obmol)
count ? value = smarts_pattern.get_map_list.to_a.size : value = 1
else
value = 0
end
- fingerprint[c][s] = value
+ @data_entries[c][s] = value
end
end
- fingerprint
+ serialize
end
def self.smarts_count compounds, smarts
smarts_match compounds,smarts,true
end
- def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS
- compounds = parse compounds
- dataset = OpenTox::CalculatedDataset.new
- dataset.compounds = compounds
+ def self.serialize
+ case @input_class
+ when "OpenTox::Compound"
+ if @with_names and @physchem_descriptors
+ [@physchem_descriptors,@data_entries.first]
+ else
+ @data_entries.first
+ end
+ when "Array"
+ if @with_names and @physchem_descriptors
+ [@physchem_descriptors,@data_entries.first]
+ else
+ @data_entries
+ end
+ when "OpenTox::Dataset"
+ dataset = OpenTox::Dataset.new(:compound_ids => @compounds.collect{|c| c.id})
+ if @smarts
+ dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id}
+ elsif @physchem_descriptors
+ dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id}
+ dataset.data_entries = @data_entries
+ end
+ dataset
+ end
+ end
+
+ def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS, with_names=false
+ parse compounds
+ @data_entries = Array.new(@compounds.size){[]}
+ @descriptors = descriptors
+ @smarts = nil
+ @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features
+ @with_names = with_names
des = {}
- descriptors.each do |d|
+ @descriptors.each do |d|
lib, descriptor = d.split(".",2)
lib = lib.downcase.to_sym
des[lib] ||= []
des[lib] << descriptor
end
- result = {}
- features = []
- data_entries = Array.new(compounds.size){Array.new(des.size)}
- n = 0
des.each do |lib,descriptors|
- features += descriptors.collect do |d|
- OpenTox::Feature.find_or_create_by(
- :title => "#{lib}.#{d}",
- :creator => __FILE__
- )
- end
- r = send(lib, compounds, descriptors)
- #p r
- r.each_with_index do |values,i|
- data_entries[i][n] = values
- end
- n += 1
+ send(lib, descriptors)
end
- #dataset.features = features
- #dataset.data_entries = data_entries
- #dataset
- data_entries
+ serialize
end
- def self.openbabel compounds, descriptors
- compounds = parse compounds
- $logger.debug "compute #{descriptors.size} openbabel descriptors for #{compounds.size} compounds"
+ def self.openbabel descriptors
+ $logger.debug "compute #{descriptors.size} openbabel descriptors for #{@compounds.size} compounds"
obdescriptors = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d}
obmol = OpenBabel::OBMol.new
obconversion = OpenBabel::OBConversion.new
obconversion.set_in_format 'inchi'
- fingerprint = Array.new(compounds.size){Array.new(obdescriptors.size)}
- compounds.each_with_index do |compound,c|
+ last_feature_idx = @physchem_descriptors.size
+ @compounds.each_with_index do |compound,c|
obconversion.read_string obmol, compound.inchi
obdescriptors.each_with_index do |descriptor,d|
- fingerprint[c][d] = fix_value(descriptor.predict(obmol))
+ @data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol))
end
end
- fingerprint
+ @physchem_descriptors += descriptors.collect{|d| "Openbabel.#{d}"}
end
- def self.cdk compounds, descriptors
- compounds = parse compounds
- $logger.debug "compute #{descriptors.size} cdk descriptors for #{compounds.size} compounds"
- sdf = sdf_3d compounds
+ def self.java_descriptors descriptors, lib
+ $logger.debug "compute #{descriptors.size} cdk descriptors for #{@compounds.size} compounds"
+ sdf = sdf_3d
# use java system call (rjb blocks within tasks)
# use Tempfiles to avoid "Argument list too long" error
- run_cmd "java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}"
- fingerprint = {}
- YAML.load_file(sdf+"cdk.yaml").each_with_index do |calculation,i|
+ case lib
+ when "cdk"
+ run_cmd "java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}"
+ when "joelib"
+ run_cmd "java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}"
+ end
+ last_feature_idx = @physchem_descriptors.size
+ YAML.load_file("#{sdf}#{lib}.yaml").each_with_index do |calculation,i|
$logger.error "Descriptor calculation failed for compound #{compounds[i].inchi}." if calculation.empty?
- descriptors.each do |descriptor|
- fingerprint[compounds[i]] = calculation
+ # CDK Descriptors may calculate multiple values, they are stored in separate features
+ @physchem_descriptors += calculation.keys if i == 0
+ calculation.keys.each_with_index do |name,j|
+ @data_entries[i][j+last_feature_idx] = fix_value(calculation[name])
end
end
- FileUtils.rm sdf+"cdk.yaml"
- fingerprint
+ FileUtils.rm "#{sdf}#{lib}.yaml"
end
- def self.joelib compounds, descriptors
- compounds = parse compounds
- $logger.debug "compute #{descriptors.size} joelib descriptors for #{compounds.size} compounds"
- # use java system call (rjb blocks within tasks)
- # use Tempfiles to avoid "Argument list too long" error
- sdf = sdf_3d compounds
- run_cmd "java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}"
- fingerprint = {}
- YAML.load_file(sdf+"joelib.yaml").each_with_index do |calculation,i|
- $logger.error "Descriptor calculation failed for compound #{compounds[i].inchi}." if calculation.empty?
- descriptors.each do |descriptor|
- fingerprint[compounds[i]] = calculation
- end
- end
- FileUtils.rm sdf+"joelib.yaml"
- fingerprint
+ def self.cdk descriptors
+ java_descriptors descriptors, "cdk"
+ end
+
+ def self.joelib descriptors
+ java_descriptors descriptors, "joelib"
end
def self.lookup compounds, features, dataset
- compounds = parse compounds
+ parse compounds
fingerprint = []
compounds.each do |compound|
fingerprint << []
@@ -194,71 +200,26 @@ module OpenTox
end
end
- def self.sdf_3d compounds
- compounds = parse compounds
- obconversion = OpenBabel::OBConversion.new
- obmol = OpenBabel::OBMol.new
- obconversion.set_in_format 'inchi'
- obconversion.set_out_format 'sdf'
-
- digest = Digest::MD5.hexdigest compounds.collect{|c| c.inchi}.inspect
- sdf_file = "/tmp/#{digest}.sdf"
- if File.exists? sdf_file # do not recreate existing 3d sdfs
- $logger.debug "re-using cached 3d structures from #{sdf_file}"
- else
- tmp_file = Tempfile.new('sdf')
- # create 3d sdf file (faster in Openbabel than in CDK)
- # MG: moreover, CDK 3d generation is faulty
- # MG: WARNING: Openbabel 3d generation is not deterministic
- # MG: WARNING: Openbabel 3D generation does not work for mixtures
- c = 0
- compounds.each do |compound|
- c += 1
- cmp_file = File.join(BABEL_3D_CACHE_DIR,Digest::MD5.hexdigest(compound.inchi)+".sdf")
- cmp_sdf = nil
- if File.exists? cmp_file
- $logger.debug "read cached 3d structure for compound #{c}/#{compounds.size}"
- cmp_sdf = File.read(cmp_file)
- else
- $logger.debug "compute 3d structure for compound #{c}/#{compounds.size}"
- obconversion.read_string obmol, compound.inchi
- sdf_2d = obconversion.write_string(obmol)
- error = nil
- if compound.inchi.include?(";") # component includes multiple compounds (; in inchi, . in smiles)
- error = "OpenBabel 3D generation failes for multi-compound #{compound.inchi}, trying to calculate descriptors from 2D structure."
- else
- OpenBabel::OBOp.find_type("Gen3D").do(obmol)
- sdf_3d = obconversion.write_string(obmol)
- error = "3D generation failed for compound #{compound.inchi}, trying to calculate descriptors from 2D structure." if sdf_3d.match(/.nan/)
- end
- if error
- $logger.warn error
- # TODO
- # @feature_dataset[RDF::OT.Warnings] ? @feature_dataset[RDF::OT.Warnings] << error : @feature_dataset[RDF::OT.Warnings] = error
- cmp_sdf = sdf_2d
- else
- cmp_sdf = sdf_3d
- File.open(cmp_file,"w") do |f|
- f.write(cmp_sdf)
- end
- end
- end
- tmp_file.write cmp_sdf
- end
- tmp_file.close
- File.rename(tmp_file, sdf_file)
+ def self.sdf_3d
+ # TODO check if 3d sdfs are stored in GridFS
+ sdf = ""
+ @compounds.each do |compound|
+ sdf << compound.sdf
end
+ sdf_file = "/tmp/#{SecureRandom.uuid}.sdf"
+ File.open(sdf_file,"w+"){|f| f.print sdf}
sdf_file
end
def self.parse compounds
- case compounds.class.to_s
+ @input_class = compounds.class.to_s
+ case @input_class
when "OpenTox::Compound"
- compounds = [compounds]
+ @compounds = [compounds]
when "Array"
- compounds
+ @compounds = compounds
when "OpenTox::Dataset"
- compounds = compounds.compounds
+ @compounds = compounds.compounds
else
bad_request_error "Cannot calculate descriptors for #{compounds.class} objects."
end
@@ -266,15 +227,14 @@ module OpenTox
def self.fix_value val
val = val.first if val.is_a? Array and val.size == 1
+ val = nil if val == "NaN"
if val.numeric?
val = Float(val)
val = nil if val.nan? or val.infinite?
- else
- val = nil if val == "NaN"
end
val
end
- private_class_method :sdf_3d, :fix_value, :parse, :run_cmd
+ private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize
end
end
end