diff options
author | Christoph Helma <helma@in-silico.ch> | 2013-04-11 21:49:28 +0200 |
---|---|---|
committer | Christoph Helma <helma@in-silico.ch> | 2013-04-11 21:49:28 +0200 |
commit | 35b9f1c814c8e38ca3e418e92b294078996c7193 (patch) | |
tree | 4f5df89c75a714471cdba4a61bfa702b9e18969f | |
parent | 2cfe47dbadeb9b013c875500e4f14e9d287f1a1f (diff) |
descriptor calculation works within tasks. rjb removed.
-rw-r--r-- | algorithm.gemspec | 1 | ||||
-rw-r--r-- | descriptor.rb | 192 | ||||
-rw-r--r-- | java/CdkDescriptorInfo.class | bin | 0 -> 1702 bytes | |||
-rw-r--r-- | java/CdkDescriptorInfo.java | 22 | ||||
-rw-r--r-- | java/CdkDescriptors.class | bin | 0 -> 3307 bytes | |||
-rw-r--r-- | java/CdkDescriptors.java | 66 | ||||
-rw-r--r-- | java/JoelibDescriptorInfo.class | bin | 0 -> 1039 bytes | |||
-rw-r--r-- | java/JoelibDescriptorInfo.java | 15 | ||||
-rw-r--r-- | java/JoelibDescriptors.class | bin | 0 -> 2833 bytes | |||
-rw-r--r-- | java/JoelibDescriptors.java | 66 | ||||
-rw-r--r-- | java/log4j.jar | bin | 0 -> 391834 bytes |
11 files changed, 248 insertions, 114 deletions
diff --git a/algorithm.gemspec b/algorithm.gemspec index b9d3241..3bda834 100644 --- a/algorithm.gemspec +++ b/algorithm.gemspec @@ -24,6 +24,5 @@ Gem::Specification.new do |s| s.add_runtime_dependency 'statsample'#, "~>1.1" s.add_runtime_dependency 'gsl'#, "~>1.14" s.add_runtime_dependency "openbabel"#, "~>2.3.1.5" - s.add_runtime_dependency "rjb" ,"1.4.6" # error in 1.4.5#,"1.4.3" s.post_install_message = "Please configure your service in ~/.opentox/config/algorithm.rb" end diff --git a/descriptor.rb b/descriptor.rb index 18b25a5..2d0844f 100644 --- a/descriptor.rb +++ b/descriptor.rb @@ -1,7 +1,7 @@ # descriptors.rb # Calculation of physico-chemical descriptors # Author: Andreas Maunz, Christoph Helma -require 'rjb' +#require 'rjb' require 'openbabel' module OpenTox @@ -9,22 +9,11 @@ module OpenTox class Application < Service ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk" - java_dir = File.join(File.dirname(__FILE__),"java") - jars = Dir[File.join(ENV["JAVA_HOME"],"lib","*.jar")] - jars += Dir[File.join(java_dir,"*jar")] - ENV["CLASSPATH"] = ([java_dir]+jars).join(":") - jars.each { |jar| Rjb::load jar } - - StringReader ||= Rjb::import "java.io.StringReader" - CDKMdlReader ||= Rjb::import "org.openscience.cdk.io.MDLReader" - CDKMolecule ||= Rjb::import "org.openscience.cdk.Molecule" - CDKDescriptorEngine ||= Rjb::import "org.openscience.cdk.qsar.DescriptorEngine" - #AromaticityDetector = Rjb::import 'org.openscience.cdk.aromaticity.CDKHueckelAromaticityDetector' - JOELIBHelper ||= Rjb::import 'joelib2.feature.FeatureHelper' - JOELIBFactory ||= Rjb::import 'joelib2.feature.FeatureFactory' - JOELIBSmilesParser ||= Rjb::import "joelib2.smiles.SMILESParser" - JOELIBTypeHolder ||= Rjb::import "joelib2.io.BasicIOTypeHolder" - JOELIBMolecule ||= Rjb::import "joelib2.molecule.BasicConformerMolecule" + JAVA_DIR = File.join(File.dirname(__FILE__),"java") + CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last + JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar") + LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar") + JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar") unless defined? DESCRIPTORS @@ -34,7 +23,6 @@ module OpenTox @@obmol = OpenBabel::OBMol.new @@obconversion = OpenBabel::OBConversion.new @@obconversion.set_in_format 'inchi' - @@cdk_engine = CDKDescriptorEngine.new(CDKDescriptorEngine.MOLECULAR) # OpenBabel OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").each do |d| @@ -56,50 +44,34 @@ module OpenTox end # CDK - @@cdk_engine.getDescriptorClassNames.toArray.each do |d| - cdk_class = d.toString - title = "CDK "+cdk_class.split('.').last - description = @@cdk_engine.getDictionaryDefinition(cdk_class).gsub(/\s+/,' ').strip + " (Class: " + @@cdk_engine.getDictionaryClass(cdk_class).join(", ") + ")" - descriptor = { - :title => title, - :description => description, - :calculator => Rjb::import(cdk_class).new, - :features => [] - } - # CDK Descriptors may return more than one value - descriptor[:features] = descriptor[:calculator].getDescriptorNames.collect do |name| - feature = OpenTox::Feature.find_or_create({ - RDF::DC.title => "#{title} #{name}", + cdk_descriptors = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`) + cdk_descriptors.each do |descriptor| + descriptor[:title] = "Cdk " + descriptor[:java_class].split('.').last.sub(/Descriptor/,'') + descriptor[:features] = [] + descriptor[:names].each do |name| + descriptor[:features] << OpenTox::Feature.find_or_create({ + RDF::DC.title => "#{descriptor[:title]} #{name}", RDF.type => [RDF::OT.Feature, RDF::OT.NumericFeature], - RDF::DC.description => description + RDF::DC.description => descriptor[:description] }, @subjectid) end - descriptors[:cdk] << descriptor end - - # JOELIB - factory = JOELIBFactory.instance - JOELIBHelper.instance.getNativeFeatures.toArray.each do |f| - joelib_class = f.toString - unless joelib_class == "joelib2.feature.types.GlobalTopologicalChargeIndex" - # CH: returns "joelib2.feature.types.atomlabel.AtomValence\n#{numeric value}" - # unsure if numeric_value is GlobalTopologicalChargeIndex or AtomValence - # excluded from descriptor list - title = "JOELib "+joelib_class.split('.').last - description = title # feature.getDescription.hasText returns false, feature.getDescription.getHtml returns unparsable content - feature = OpenTox::Feature.find_or_create({ - RDF::DC.title => title, - RDF.type => [RDF::OT.Feature, RDF::OT.NumericFeature], - RDF::DC.description => description, - }, @subjectid) - descriptors[:joelib] << { - :title => title, - :description => description, - :calculator => Rjb::import(joelib_class).new, - :feature => feature - } - end + descriptors[:cdk] = cdk_descriptors + + # Joelib + joelib_descriptors = YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`) # strip Joelib messages at stdout + joelib_descriptors.each do |descriptor| + # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug) + next if descriptor[:java_class] == "joelib2.feature.types.MoleculeHashcode" or descriptor[:java_class] == "joelib2.feature.types.GlobalTopologicalChargeIndex" + descriptor[:title] = "Joelib " + descriptor[:java_class].split('.').last + descriptor[:feature] = OpenTox::Feature.find_or_create({ + RDF::DC.title => descriptor[:title], + RDF.type => [RDF::OT.Feature, RDF::OT.NumericFeature], + #RDF::DC.description => descriptor[:title], # impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java + }, @subjectid) end + descriptors[:joelib] = joelib_descriptors.select{|d| d[:title]} + DESCRIPTORS = descriptors end @@ -110,65 +82,66 @@ module OpenTox compounds.each do |compound| @@obconversion.read_string @@obmol, compound.inchi descriptors.each do |descriptor| - puts descriptor[:title] @feature_dataset.add_data_entry compound, descriptor[:feature], fix_value(descriptor[:calculator].predict(@@obmol)) end end end def cdk compounds, descriptors - @@obconversion.set_out_format 'sdf' - compounds.each do |compound| - @@obconversion.read_string @@obmol, compound.inchi - sdf = @@obconversion.write_string(@@obmol) - OpenBabel::OBOp.find_type("Gen3D").do(@@obmol) - sdf_3D = @@obconversion.write_string(@@obmol) - if sdf_3D.match(/.nan/) - warning = "3D generation failed for compound #{compound.uri} (using 2D structure)." - $logger.warn warning - @feature_dataset[RDF::OT.Warnings] ? @feature_dataset[RDF::OT.Warnings] << warning : @feature_dataset[RDF::OT.Warnings] = warning - else - sdf = sdf_3D - end - reader = CDKMdlReader.new(StringReader.new(sdf)) - cdk_compound = reader.read(CDKMolecule.new) - #AromaticityDetector.detectAromaticity(cdk_compound) - values = [] - descriptors.each do |descriptor| - puts descriptor[:title] - begin - result = descriptor[:calculator].calculate cdk_compound - result.getValue.toString.split(",").each_with_index do |value,i| - @feature_dataset.add_data_entry compound, descriptor[:features][i], fix_value(value) - end - rescue - $logger.error "#{descriptor[:title]} calculation failed with #{$!.message} for compound #{compound.uri}." - end + sdf_3d compounds + # rjb blocks within tasks + # Avoid "Argument list too long" error by sending only short descriptor names + yaml = `echo "#{@sdf}" |java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{descriptors.collect{|d| d[:title].split("\s").last}.join(" ")}` + YAML.load(yaml).each_with_index do |calculation,i| + $logger.error "Descriptor calculation failed with #{$!.message} for compound #{compounds[i].uri}." if calculation.empty? + calculation.each do |name,value| + feature = DESCRIPTORS[:cdk].collect{|d| d[:features]}.flatten.select{|f| f[RDF::DC.title].split("\s").last == name.to_s}.first + @feature_dataset.add_data_entry compounds[i], feature, fix_value(value) end end end def joelib compounds, descriptors - @@obconversion.set_out_format 'smi' - compounds.each do |compound| - mol = JOELIBMolecule.new(JOELIBTypeHolder.instance.getIOType("SMILES"), JOELIBTypeHolder.instance.getIOType("SMILES")) - @@obconversion.read_string @@obmol, compound.inchi - JOELIBSmilesParser.smiles2molecule mol, @@obconversion.write_string(@@obmol).strip, "Smiles: #{@@obconversion.write_string(@@obmol).strip}" - mol.addHydrogens - descriptors.each do |descriptor| - puts descriptor[:title] - puts descriptor[:calculator].toString#java_methods.inspect - puts descriptor[:calculator].calculate(mol).toString - @feature_dataset.add_data_entry compound, descriptor[:feature], fix_value(descriptor[:calculator].calculate(mol).toString) + sdf_3d compounds + # rjb blocks within tasks + yaml = `echo "#{@sdf}" |java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{descriptors.collect{|d| d[:java_class]}.join(" ")}|grep "^[- ]"` + YAML.load(yaml).each_with_index do |calculation,i| + $logger.error "Descriptor calculation failed with #{$!.message} for compound #{compounds[i].uri}." if calculation.empty? + calculation.each do |java_class,value| + feature = DESCRIPTORS[:joelib].select{|d| d[:java_class] == java_class}.first[:feature] + @feature_dataset.add_data_entry compounds[i], feature, fix_value(value) + end + end + end + + def sdf_3d compounds + unless @sdf + @sdf = "" + @@obconversion.set_out_format 'sdf' + # create 3d sdf file (faster in Openbabel than in CDK) + compounds.each do |compound| + @@obconversion.read_string @@obmol, compound.inchi + sdf_2d = @@obconversion.write_string(@@obmol) + OpenBabel::OBOp.find_type("Gen3D").do(@@obmol) + sdf_3d = @@obconversion.write_string(@@obmol) + if sdf_3d.match(/.nan/) + warning = "3D generation failed for compound #{compound.uri}, using 2D structure." + $logger.warn warning + @feature_dataset[RDF::OT.Warnings] ? @feature_dataset[RDF::OT.Warnings] << warning : @feature_dataset[RDF::OT.Warnings] = warning + @sdf << sdf_2d + else + @sdf << sdf_3d + end end end end def fix_value val - #unless val.numeric? if val.numeric? val = Float(val) val = nil if val.nan? or val.infinite? + else + val = nil if val == "NaN" end val end @@ -207,31 +180,30 @@ module OpenTox before '/descriptor/:lib/:descriptor/?' do @descriptors = DESCRIPTORS[params[:lib].to_sym].select{|d| d[:title].split(" ").last == params[:descriptor]} - bad_request_error "Unknown descriptor #{@uri}. See #{uri('descriptors')} for a complete list of supported descriptors.", @uri if @descriptors.empty? + bad_request_error "Unknown descriptor #{@uri}. See #{uri('descriptor')} for a complete list of supported descriptors.", @uri if @descriptors.empty? @descriptor = @descriptors.first end # Get a list of descriptor calculation # @return [text/uri-list] URIs get '/descriptor/?' do - DESCRIPTORS.collect{|lib,d| d.collect{|n| uri("/descriptors/#{lib}/#{n[:title].split(" ").last}")}}.flatten.sort.join("\n") + DESCRIPTORS.collect{|lib,d| d.collect{|n| uri("/descriptor/#{lib}/#{n[:title].split(" ").last}")}}.flatten.sort.join("\n") end get '/descriptor/:lib/?' do - DESCRIPTORS[params[:lib].to_sym].collect{|n| uri("/descriptors/#{params[:lib].to_sym}/#{n[:title].split(" ").last}")}.sort.join("\n") + DESCRIPTORS[params[:lib].to_sym].collect{|n| uri("/descriptor/#{params[:lib].to_sym}/#{n[:title].split(" ").last}")}.sort.join("\n") end # Get representation of descriptor calculation # @return [String] Representation get '/descriptor/:lib/:descriptor/?' do @algorithm[RDF::DC.title] = @descriptor[:title] - @algorithm[RDF::DC.description] = @descriptor[:description] + @algorithm[RDF::DC.description] = @descriptor[:description] if @descriptor[:description] format_output(@algorithm) end post '/descriptor/?' do - #task = OpenTox::Task.run "Calculating PC descriptors", @uri, @subjectid do |task| - puts "Task created" + task = OpenTox::Task.run "Calculating PC descriptors", @uri, @subjectid do |task| if params[:descriptors] descriptors = {} params[:descriptors].each do |descriptor| @@ -247,18 +219,12 @@ module OpenTox elsif params[:dataset_uri] compounds = Dataset.new(params[:dataset_uri]).compounds end - puts "Calculating" - [:openbabel, :cdk, :joelib].each{ |lib| puts lib; send lib, compounds, descriptors[lib]; puts lib.to_s+" finished" } - #[:joelib].each{ |lib| send lib, compounds, descriptors[lib]; puts lib.to_s+" finished" } - puts "saving file" - File.open("/home/ch/tmp.nt","w+"){|f| f.puts @feature_dataset.to_ntriples} - puts "saving "+@feature_dataset.uri + [:openbabel, :cdk, :joelib].each{ |lib| send lib, compounds, descriptors[lib] } @feature_dataset.put - puts "finished" @feature_dataset.uri - #end - #response['Content-Type'] = 'text/uri-list' - #halt 202, task.uri + end + response['Content-Type'] = 'text/uri-list' + halt 202, task.uri end post '/descriptor/:lib/:descriptor/?' do diff --git a/java/CdkDescriptorInfo.class b/java/CdkDescriptorInfo.class Binary files differnew file mode 100644 index 0000000..922c779 --- /dev/null +++ b/java/CdkDescriptorInfo.class diff --git a/java/CdkDescriptorInfo.java b/java/CdkDescriptorInfo.java new file mode 100644 index 0000000..73a65ac --- /dev/null +++ b/java/CdkDescriptorInfo.java @@ -0,0 +1,22 @@ +import java.util.*; +import org.openscience.cdk.qsar.descriptors.molecular.*; +import org.openscience.cdk.qsar.*; + +class CdkDescriptorInfo { + public static void main(String[] args) { + + DescriptorEngine engine = new DescriptorEngine(DescriptorEngine.MOLECULAR); + + for (Iterator<IDescriptor> it = engine.getDescriptorInstances().iterator(); it.hasNext(); ) { + IDescriptor descriptor = it.next(); + String cdk_class = descriptor.getClass().toString().replaceAll("class ",""); + System.out.println("- :java_class: \""+cdk_class+"\""); + String description = engine.getDictionaryDefinition(cdk_class).replaceAll("^\\s+", "" ).replaceAll("\\s+$", "").replaceAll("\\s+", " "); + System.out.println(" :description: \""+description+"\""); + System.out.println(" :names:"); + for (String name : descriptor.getDescriptorNames()) { + System.out.println(" - \""+name+"\""); + } + } + } +} diff --git a/java/CdkDescriptors.class b/java/CdkDescriptors.class Binary files differnew file mode 100644 index 0000000..21f82c4 --- /dev/null +++ b/java/CdkDescriptors.class diff --git a/java/CdkDescriptors.java b/java/CdkDescriptors.java new file mode 100644 index 0000000..5635507 --- /dev/null +++ b/java/CdkDescriptors.java @@ -0,0 +1,66 @@ +import java.util.*; +import java.io.*; +import org.openscience.cdk.DefaultChemObjectBuilder; +import org.openscience.cdk.interfaces.IMolecule; +import org.openscience.cdk.io.iterator.IteratingMDLReader; +import org.openscience.cdk.qsar.*; +import org.openscience.cdk.qsar.DescriptorValue; + +class CdkDescriptors { + public static void main(String[] args) { + + // parse command line arguments (descriptors) + DescriptorEngine engine; + if (args.length > 0) { + for (int i =0; i < args.length; i++) { + args[i] = "org.openscience.cdk.qsar.descriptors.molecular." + args[i] + "Descriptor"; + } + List<String> classNames = Arrays.asList(args); + engine = new DescriptorEngine(classNames); + List<IDescriptor> instances = engine.instantiateDescriptors(classNames); + List<DescriptorSpecification> specs = engine.initializeSpecifications(instances); + engine.setDescriptorInstances(instances); + engine.setDescriptorSpecifications(specs); + } else { + engine = new DescriptorEngine(DescriptorEngine.MOLECULAR); + } + + // parse 3d sdf from stdin and calculate descriptors + BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); + IteratingMDLReader reader = new IteratingMDLReader( br, DefaultChemObjectBuilder.getInstance()); + while (reader.hasNext()) { + IMolecule molecule = (IMolecule)reader.next(); + try { + engine.process(molecule); + Iterator it = molecule.getProperties().values().iterator(); + Boolean first = true; + while (it.hasNext()) { + try { + DescriptorValue value = (DescriptorValue)it.next(); + int size = value.getValue().length(); + if (size == 1) { + if (first) { System.out.print("- "); } + else { System.out.print(" "); } + System.out.println(":"+value.getNames()[0].toString() + ": " + value.getValue()); + first = false; + } + else { + String[] values = value.getValue().toString().split(","); + for (int i = 0; i < size; i++) { + if (first) { System.out.print("- "); } + else { System.out.print(" "); } + System.out.println(":"+value.getNames()[i].toString() + ": " + values[i]); + first = false; + } + } + } + catch (ClassCastException e) { } // sdf properties are stored as molecules properties (strings), ignore them + } + } + catch (Exception e) { + System.out.println("- {}"); + System.err.println(e.toString()); + } + } + } +} diff --git a/java/JoelibDescriptorInfo.class b/java/JoelibDescriptorInfo.class Binary files differnew file mode 100644 index 0000000..293cb72 --- /dev/null +++ b/java/JoelibDescriptorInfo.class diff --git a/java/JoelibDescriptorInfo.java b/java/JoelibDescriptorInfo.java new file mode 100644 index 0000000..851d650 --- /dev/null +++ b/java/JoelibDescriptorInfo.java @@ -0,0 +1,15 @@ +import joelib2.feature.FeatureHelper; + +class JoelibDescriptorInfo { + public static void main(String[] args) { + FeatureHelper helper = FeatureHelper.instance(); + System.out.println("---"); // document separator for Joelib debug messages + for (Object feature : helper.getNativeFeatures() ) { + System.out.println("- :java_class: \""+feature.toString()+"\""); + // methods for accessing feature descriptions e.g. with + // FeatureFactory.instance().getFeature(feature.toString()).getDescription().getText() or + // FeatureFactory.instance().getFeature(feature.toString()).getDescription().getHtml() + // are defunct + } + } +} diff --git a/java/JoelibDescriptors.class b/java/JoelibDescriptors.class Binary files differnew file mode 100644 index 0000000..4a23d26 --- /dev/null +++ b/java/JoelibDescriptors.class diff --git a/java/JoelibDescriptors.java b/java/JoelibDescriptors.java new file mode 100644 index 0000000..fceb2a9 --- /dev/null +++ b/java/JoelibDescriptors.java @@ -0,0 +1,66 @@ +import java.util.*; +import java.io.*; +import joelib2.feature.Feature; +import joelib2.feature.FeatureHelper; +import joelib2.feature.FeatureFactory; +import joelib2.feature.FeatureResult; +import joelib2.io.BasicIOType; +import joelib2.io.BasicIOTypeHolder; +import joelib2.io.BasicReader; +import joelib2.io.MoleculeFileHelper; +import joelib2.io.MoleculeFileIO; +import joelib2.io.MoleculeIOException; +import joelib2.molecule.BasicConformerMolecule; + +class JoelibDescriptors { + public static void main(String[] args) { + + // set args to all descriptors + if (args.length == 0) { + FeatureHelper helper = FeatureHelper.instance(); + args = (String[]) helper.getNativeFeatures().toArray(new String[0]); + } + + FeatureFactory factory = FeatureFactory.instance(); + MoleculeFileIO loader = null; + BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); + String line = new String(); + String sdf = new String(); + try { + while ((line = br.readLine()) != null) { sdf += line + "\n"; } + br.close(); + InputStream is = null; + is = new ByteArrayInputStream(sdf.getBytes("UTF-8")); + BasicIOType inType = BasicIOTypeHolder.instance().getIOType("SDF"); + loader = MoleculeFileHelper.getMolReader(is, inType); + //BasicIOType outType = BasicIOTypeHolder.instance().getIOType("SMILES"); + //JOEMol mol = new JOEMol(inType, inType); + BasicConformerMolecule mol = new BasicConformerMolecule(inType, inType); + while (true) { + try { + Boolean success = loader.read(mol); + if (!success) { break; } + //System.err.println( mol ); + for (int i =0; i < args.length; i++) { + Feature feature = factory.getFeature(args[i]); + FeatureResult result = feature.calculate(mol); + if (i == 0) { System.out.print("- "); } + else { System.out.print(" "); } + System.out.print( args[i]+": " ); + System.out.println( result.toString() ); + } + + } + catch (Exception e) { + System.err.println(e.toString()); + e.printStackTrace(); + //next; + } + } + } + catch (Exception e) { + e.printStackTrace(); + //System.err.println(e.toString()); + } + } +} diff --git a/java/log4j.jar b/java/log4j.jar Binary files differnew file mode 100644 index 0000000..c930a6a --- /dev/null +++ b/java/log4j.jar |