diff options
author | Christoph Helma <helma@in-silico.ch> | 2013-07-04 16:38:34 +0200 |
---|---|---|
committer | Christoph Helma <helma@in-silico.ch> | 2013-07-04 16:38:34 +0200 |
commit | 23ac5ebadb9eabc666cbc322ab03dbc61937d5dd (patch) | |
tree | dc1a45419c403e011645124249010750df8cfef5 | |
parent | 404c644ee52bac97ff737fcb2057df3f1ec18b76 (diff) |
descriptor calculation fixed
-rw-r--r-- | application.rb | 8 | ||||
-rw-r--r-- | descriptor.rb | 451 | ||||
-rw-r--r-- | java/CdkDescriptors.class | bin | 3401 -> 3781 bytes | |||
-rw-r--r-- | java/CdkDescriptors.java | 46 | ||||
-rw-r--r-- | java/JoelibDescriptors.class | bin | 2831 -> 2578 bytes | |||
-rw-r--r-- | java/JoelibDescriptors.java | 12 | ||||
-rw-r--r-- | lib/descriptor.rb | 115 | ||||
-rw-r--r-- | lib/lazar.rb | 3 | ||||
-rw-r--r-- | lib/transform.rb | 2 | ||||
-rw-r--r-- | lib/unique_descriptors.rb | 120 |
10 files changed, 279 insertions, 478 deletions
diff --git a/application.rb b/application.rb index 547da08..9f1f12f 100644 --- a/application.rb +++ b/application.rb @@ -33,15 +33,11 @@ module OpenTox end get '/?' do - list = [ to('/lazar', :full), + render [ to('/lazar', :full), to('/fminer/bbrc', :full), - #to('/fminer/bbrc/sample', :full), to('/fminer/last', :full), - #to('/fminer/bbrc/match', :full), - #to('/fminer/last/match', :full), - to('/feature-selection/recursive-feature-elimination', :full), + #to('/feature-selection/recursive-feature-elimination', :full), to('/descriptor') ].join("\n") + "\n" - render list end end end diff --git a/descriptor.rb b/descriptor.rb index 13a6fa3..ce3ec54 100644 --- a/descriptor.rb +++ b/descriptor.rb @@ -7,389 +7,126 @@ module OpenTox class Application < Service - before '/descriptor/:method' do - if params[:compound_uri] - @compounds = [params[:compound_uri]].flatten.collect{|u| OpenTox::Compound.new u} - elsif params[:dataset_uri] - @compounds = OpenTox::Dataset.new(params[:dataset_uri], @subjectid).compounds - else - bad_request_error "Please provide a dataset_uri or compound_uri parameter", @uri - end -=begin - new_params = {} - delete = [] - params.each do |k,v| - if k.match(/_uri$/) - klass = k.sub(/_uri$/,'') - v = [v] if v.is_a? String - new_params[klass] = v.collect{|u| OpenTox.const_get(klass.capitalize).new(u)} - delete << k - end - end - delete.each{|k| params.delete k} - params.merge! new_params -=end - end -=begin - before '/descriptor/:lib/:descriptor/?' do - #if request.get? - #lib = @uri.split("/")[-2].capitalize - @klass = OpenTox::Descriptor.const_get params[:lib].capitalize - #@algorithm = klass.new @uri, @subjectid unless params[:lib] == "smarts" - @method = params[:descriptor].to_sym - elsif request.post? - @feature_dataset = Dataset.new nil, @subjectid - @feature_dataset.metadata = { - RDF::DC.title => "Physico-chemical descriptors", - RDF::DC.creator => @uri, - RDF::OT.hasSource => @uri, + before '/descriptor/:method/?' do + if request.get? + @algorithm = OpenTox::Algorithm::Descriptor.new @uri + @algorithm.parameters = [ { + RDF::DC.description => "Dataset URI", + RDF::OT.paramScope => "optional", + RDF::DC.title => "dataset_uri" + },{ + RDF::DC.description => "Compound URI", + RDF::OT.paramScope => "optional", + RDF::DC.title => "compound_uri" + } ] + @algorithm.metadata = { + RDF.type => [RDF::OT.Algorithm, RDF::OTA.DescriptorCalculation], } - if params[:compound_uri] - @feature_dataset.parameters = [ { RDF::DC.title => "compound_uri", RDF::OT.paramValue => params[:compound_uri] }] - elsif params[:dataset_uri] - @feature_dataset.parameters = [ { RDF::DC.title => "dataset_uri", RDF::OT.paramValue => params[:dataset_uri] }] - else - bad_request_error "Please provide a dataset_uri or compound_uri parameter", @uri - end end end -=end + get '/descriptor/?' do - #OpenTox::Algorithm::Descriptor.list.collect{|d| uri d}.join "\n" - OpenTox::Algorithm::Descriptor.list.join "\n" - #OpenTox::Algorithm::Descriptor.list.inspect + render [ uri('/descriptor/physchem'), uri('/descriptor/smarts_match'), uri('/descriptor/smarts_count'), uri('/descriptor/lookup')].sort end - post '/descriptor/:method' do - puts params.inspect - bad_request_error "Please provide 'descriptors' parameters.", @uri unless params["descriptors"] - if params[:compound_uri] - result = OpenTox::Algorithm::Descriptor.send(params[:method].to_sym, @compounds, params["descriptors"]) - Hash[result.map {|compound, v| [compound.uri, v] }].to_json - elsif params[:dataset_uri] - puts "starting task" - task = OpenTox::Task.run("Calculating #{params[:method]} descriptors for dataset #{params[:dataset_uri]}.", @uri, @subjectid) do |task| - puts "start calculation" - result = OpenTox::Algorithm::Descriptor.send(params[:method].to_sym, @compounds, params["descriptors"]) - puts "create dataset" - puts result.inspect - dataset = OpenTox::Dataset.new nil, @subjectid - @compounds.each do |compound| - @features ||= result[compound].keys.collect{|name| - # TODO set other metadata - OpenTox::Feature.find_or_create({RDF::DC.title => name}, @subjectid) - } - @features.each do |feature| - value = result[compound][feature.title] - puts compound, feature, value if value - dataset.add_data_entry compound, feature, value if value - end - end - puts "put dataset" - dataset.put - puts "dataset stored" - dataset.uri - end - puts "Task" - puts task.uri - response['Content-Type'] = 'text/uri-list' - halt 202,task.uri - end + get '/descriptor/smarts_match/?' do + @algorithm.parameters += [ { + RDF::DC.description => "SMARTS strings", + RDF::OT.paramScope => "mandatory", + RDF::DC.title => "descriptors" + } ] + @algorithm.metadata[RDF::DC.title] = "SMARTS matcher" + render @algorithm end -=begin - post '/descriptor/smarts_match/?' do - bad_request_error "Please provide a compound_uri or dataset_uri parameter and a smarts parameter. The count parameter is optional and defaults to false." unless (params[:compound_uri] or params[:dataset_uri]) and params[:smarts] - params[:count] ? params[:count] = params[:count].to_boolean : params[:count] = false - if params[:compound_uri] - params[:compound_uri] = [ params[:compound_uri] ] unless params[:compound_uri].is_a? Array - response['Content-Type'] = "application/json" - OpenTox::Algorithm::Descriptor.smarts_match(params[:compound_uri].collect{|c| OpenTox::Compound.new c}, params[:smarts], params[:count]).to_json - elsif params[:dataset_uri] - task = OpenTox::Task.run("Calculating Smarts #{method} for dataset #{params[:dataset_uri]}.", @uri, @subjectid) do |task| - compounds = OpenTox::Dataset.new params[:dataset_uri] - matches = OpenTox::Descriptor::Smarts.fingerprint(compounds, params[:smarts], params[:count]) - end - response['Content-Type'] = 'text/uri-list' - halt 202,task.uri - end + get '/descriptor/smarts_count/?' do + @algorithm.parameters += [ { + RDF::DC.description => "Counts SMARTS matches", + RDF::OT.paramScope => "mandatory", + RDF::DC.title => "descriptors" + } ] + @algorithm.metadata[RDF::DC.title] = "SMARTS count" + render @algorithm end - post '/descriptor/smarts_count/?' do - bad_request_error "Please provide a compound_uri or dataset_uri parameter and a smarts parameter. The count parameter is optional and defaults to false." unless (params[:compound_uri] or params[:dataset_uri]) and params[:smarts] - params[:count] ? params[:count] = params[:count].to_boolean : params[:count] = false - if params[:compound_uri] - params[:compound_uri] = [ params[:compound_uri] ] unless params[:compound_uri].is_a? Array - response['Content-Type'] = "application/json" - OpenTox::Algorithm::Descriptor.smarts_count(params[:compound_uri].collect{|c| OpenTox::Compound.new c}, params[:smarts]).to_json - elsif params[:dataset_uri] - task = OpenTox::Task.run("Calculating Smarts #{method} for dataset #{params[:dataset_uri]}.", @uri, @subjectid) do |task| - compounds = OpenTox::Dataset.new params[:dataset_uri] - matches = OpenTox::Descriptor::Smarts.fingerprint(compounds, params[:smarts], params[:count]) - end - response['Content-Type'] = 'text/uri-list' - halt 202,task.uri - end + get '/descriptor/physchem/?' do + @algorithm.parameters += [ { + RDF::DC.description => "Physical-chemical descriptors (see #{File.join @uri, 'list'} for a list of supported parameters)", + RDF::OT.paramScope => "mandatory", + RDF::DC.title => "descriptors" + } ] + @algorithm.metadata[RDF::DC.title] = "Physical-chemical descriptors" + render @algorithm end -=end -=begin - # Get a list of descriptor calculation - # @return [text/uri-list] URIs - get '/descriptor/?' do - #uris = ["Openbabel","Cdk","Joelib"].collect do |lib| - uris = ["Openbabel"].collect do |lib| - klass = OpenTox::Descriptor.const_get lib - klass.all - end.flatten - render uris + get '/descriptor/physchem/list/?' do + response['Content-Type'] = 'text/plain' + OpenTox::Algorithm::Descriptor::DESCRIPTORS.collect{|k,v| "#{k}\t#{v}"}.join "\n" end - get '/descriptor/:lib/?' do - begin - klass = OpenTox::Descriptor.const_get params[:lib].capitalize - render klass.all - rescue - bad_request_error "Descriptor library '#{params[:lib]}' not found.", @uri - end + get '/descriptor/physchem/unique/?' do + response['Content-Type'] = 'text/plain' + OpenTox::Algorithm::Descriptor::UNIQUEDESCRIPTORS.collect{|d| "#{d}\t#{OpenTox::Algorithm::Descriptor::DESCRIPTORS[d]}"}.join "\n" end - # Get representation of descriptor calculation - # @return [String] Representation - get '/descriptor/:lib/:descriptor/?' do + get '/descriptor/lookup/?' do + @algorithm.parameters += [ { + RDF::DC.description => "Read feature values from a dataset", + RDF::OT.paramScope => "mandatory", + RDF::DC.title => "feature_dataset_uri" + } ] + @algorithm.metadata[RDF::DC.title] = "Dataset lookup" render @algorithm end - post '/descriptor/?' do - descriptors = OpenTox::Descriptor::Set.new params - if params[:compound_uri] - compound = OpenTox::Compound.new params[:compound_uri] - descriptors.calculate compound - elsif params[:dataset_uri] - task = OpenTox::Task.run("Calculating Smarts #{method} for dataset #{params[:dataset_uri]}.", @uri, @subjectid) do |task| - dataset = OpenTox::Dataset.new params[:dataset_uri] - descriptors.calculate dataset - end - response['Content-Type'] = 'text/uri-list' - halt 202,task.uri + post '/descriptor/:method' do + puts params.inspect + if params[:method] == "physchem" + params[:descriptors] = OpenTox::Algorithm::Descriptor::UNIQUEDESCRIPTORS if !params[:descriptors] or params[:descriptors] == [""] else - - end - end - #post '/descriptor/physchem/?' do - #post '/descriptor/lookup/?' do - - # use /descriptor with dataset_uri and descriptor_uri parameters for efficient calculation of multiple compounds/descriptors - post '/descriptor/:lib/:descriptor/?' do - bad_request_error "Please provide a compound_uri parameter", @uri unless params[:compound_uri] - params[:descriptor_uris] = [@uri] - result = @algorithm.calculate(params) - puts result.inspect - result.to_json - #compounds = [ Compound.new(params[:compound_uri], @subjectid) ] - #send params[:lib].to_sym, compounds, @descriptors - #@feature_dataset.put - #@feature_dataset.uri - end -=end -=begin - ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk" - JAVA_DIR = File.join(File.dirname(__FILE__),"java") - CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last - JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar") - LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar") - JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar") - - unless defined? DESCRIPTORS - - # initialize descriptors and features at startup to avoid duplication - descriptors = { :cdk => [], :openbabel => [], :joelib => [] } # use arrays to keep the sequence intact - - @@obmol = OpenBabel::OBMol.new - @@obconversion = OpenBabel::OBConversion.new - @@obconversion.set_in_format 'inchi' - - - # CDK - cdk_descriptors = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`) - cdk_descriptors.each do |descriptor| - title = descriptor[:java_class].split('.').last.sub(/Descriptor/,'') - descriptor[:title] = "Cdk " + title - descriptor[:uri] = File.join $algorithm[:uri], "descriptor/cdk" ,title - descriptor[:features] = [] - descriptor[:names].each do |name| - descriptor[:features] << OpenTox::Feature.find_or_create({ - RDF::DC.title => "#{descriptor[:title]} #{name}", - RDF.type => [RDF::OT.Feature, RDF::OT.NumericFeature], - RDF::DC.description => descriptor[:description] - }, @subjectid) - end - end - descriptors[:cdk] = cdk_descriptors - - # Joelib - joelib_descriptors = YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`) # strip Joelib messages at stdout - joelib_descriptors.each do |descriptor| - # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug) - next if descriptor[:java_class] == "joelib2.feature.types.MoleculeHashcode" or descriptor[:java_class] == "joelib2.feature.types.GlobalTopologicalChargeIndex" - title = descriptor[:java_class].split('.').last - descriptor[:uri] = File.join $algorithm[:uri], "descriptor/joelib",title - descriptor[:title] = "Joelib " + title - descriptor[:feature] = OpenTox::Feature.find_or_create({ - RDF::DC.title => descriptor[:title], - RDF.type => [RDF::OT.Feature, RDF::OT.NumericFeature], - #RDF::DC.description => descriptor[:title], # impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java - }, @subjectid) - end - descriptors[:joelib] = joelib_descriptors.select{|d| d[:title]} - - DESCRIPTORS = descriptors - - end - - helpers do - - def cdk compounds, descriptors - sdf_3d compounds - # use java system call (rjb blocks within tasks) - # use Tempfiles to avoid "Argument list too long" error - puts `java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{@sdf_file.path} #{descriptors.collect{|d| d[:title].split("\s").last}.join(" ")}` - YAML.load_file(@sdf_file.path+"cdk.yaml").each_with_index do |calculation,i| - $logger.error "Descriptor calculation failed for compound #{compounds[i].uri}." if calculation.empty? - calculation.each do |name,value| - feature = DESCRIPTORS[:cdk].collect{|d| d[:features]}.flatten.select{|f| f[RDF::DC.title].split("\s").last == name.to_s}.first - @feature_dataset.add_data_entry compounds[i], feature, fix_value(value) - end - end - end - - def joelib compounds, descriptors - # use java system call (rjb blocks within tasks) - # use Tempfiles to avoid "Argument list too long" error - sdf_3d compounds - puts `java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{@sdf_file.path} #{descriptors.collect{|d| d[:java_class]}.join(" ")}` - YAML.load_file(@sdf_file.path+"joelib.yaml").each_with_index do |calculation,i| - $logger.error "Descriptor calculation failed for compound #{compounds[i].uri}." if calculation.empty? - calculation.each do |java_class,value| - feature = DESCRIPTORS[:joelib].select{|d| d[:java_class] == java_class}.first[:feature] - @feature_dataset.add_data_entry compounds[i], feature, fix_value(value) - end - end + bad_request_error "Please provide 'descriptors' parameters.", @uri unless params[:descriptors] end - - def sdf_3d compounds - unless @sdf_file and File.exists? @sdf_file.path - @sdf_file = Tempfile.open("sdf") - @@obconversion.set_out_format 'sdf' - # create 3d sdf file (faster in Openbabel than in CDK) - compounds.each do |compound| - @@obconversion.read_string @@obmol, compound.inchi - sdf_2d = @@obconversion.write_string(@@obmol) - OpenBabel::OBOp.find_type("Gen3D").do(@@obmol) - sdf_3d = @@obconversion.write_string(@@obmol) - if sdf_3d.match(/.nan/) - warning = "3D generation failed for compound #{compound.uri}, trying to calculate descriptors from 2D structure." - $logger.warn warning - @feature_dataset[RDF::OT.Warnings] ? @feature_dataset[RDF::OT.Warnings] << warning : @feature_dataset[RDF::OT.Warnings] = warning - @sdf_file.puts sdf_2d - else - @sdf_file.puts sdf_3d + if params[:compound_uri] # return json + @compounds = [params[:compound_uri]].flatten.collect{|u| OpenTox::Compound.new u} + result = OpenTox::Algorithm::Descriptor.send(params[:method].to_sym, @compounds, params[:descriptors]) + Hash[result.map {|compound, v| [compound.uri, v] }].to_json + elsif params[:dataset_uri] # return dataset + task = OpenTox::Task.run("Calculating #{params[:method]} descriptors for dataset #{params[:dataset_uri]}.", @uri, @subjectid) do |task| + @compounds = OpenTox::Dataset.new(params[:dataset_uri], SUBJECTID).compounds + result = OpenTox::Algorithm::Descriptor.send(params[:method].to_sym, @compounds, params[:descriptors]) + dataset = OpenTox::Dataset.new nil, SUBJECTID + dataset.metadata = { + RDF::DC.title => "Physico-chemical descriptors", + RDF::DC.creator => @uri, + RDF::OT.hasSource => @uri, + } + dataset.parameters = [ + { RDF::DC.title => "dataset_uri", RDF::OT.paramValue => params[:dataset_uri] }, + { RDF::DC.title => "descriptors", RDF::OT.paramValue => params[:descriptors] }, + ] + params[:method] == "smarts_match" ? feature_type = RDF::OT.NominalFeature : feature_type = RDF::OT.NumericFeature + @compounds.each do |compound| + @features ||= result[compound].keys.collect{|name| + OpenTox::Feature.find_or_create({ + RDF::DC.title => name, + RDF.type => [RDF::OT.Feature, feature_type], + RDF::DC.description => OpenTox::Algorithm::Descriptor.description(name) + }, SUBJECTID) + } + @features.each do |feature| + value = result[compound][feature.title] + dataset.add_data_entry compound, feature, value if value end end - @sdf_file.close - end - end -<<<<<<< HEAD -======= - - def fix_value val - if val.numeric? - val = Float(val) - val = nil if val.nan? or val.infinite? - else - val = nil if val == "NaN" - end - val - end - end - - before '/descriptor/?*' do - if request.get? - @algorithm = OpenTox::Algorithm.new @uri, @subjectid - @algorithm.parameters = [ - { RDF::DC.description => "Dataset URI", - RDF::OT.paramScope => "optional", - RDF::DC.title => "dataset_uri" } , - { RDF::DC.description => "Compound URI", - RDF::OT.paramScope => "optional", - RDF::DC.title => "compound_uri" } - ] - @algorithm.metadata = { - RDF.type => [RDF::OTA.DescriptorCalculation], - } - elsif request.post? - @feature_dataset = Dataset.new nil, @subjectid - @feature_dataset.metadata = { - RDF::DC.title => "Physico-chemical descriptors", - RDF::DC.creator => @uri, - RDF::OT.hasSource => @uri, - } - if params[:compound_uri] - @feature_dataset.parameters = [ { RDF::DC.title => "compound_uri", RDF::OT.paramValue => params[:compound_uri] }] - elsif params[:dataset_uri] - @feature_dataset.parameters = [ { RDF::DC.title => "dataset_uri", RDF::OT.paramValue => params[:dataset_uri] }] - else - bad_request_error "Please provide a dataset_uri or compound_uri parameter", @uri - end - end ->>>>>>> ad386110267ecc3e0c5301769b4880a7e555a44e - end - - before '/descriptor/:lib/:descriptor/?' do - @descriptors = DESCRIPTORS[params[:lib].to_sym].select{|d| d[:title].split(" ").last == params[:descriptor]} - bad_request_error "Unknown descriptor #{@uri}. See #{uri('descriptor')} for a complete list of supported descriptors.", @uri if @descriptors.empty? - @descriptor = @descriptors.first - end - - after do # Tempfile cleanup - if @sdf_file and File.exists? @sdf_file.path - FileUtils.rm Dir["#{@sdf_file.path}*.yaml"] - @sdf_file.unlink - end - @sdf_file = nil - end - - # Get representation of descriptor calculation - # @return [String] Representation - get '/descriptor/:lib/:descriptor/?' do - render @algorithm - end - - post '/descriptor/?' do - task = OpenTox::Task.run "Calculating PC descriptors", @uri, @subjectid do |task| - if params[:descriptor_uris] - descriptors = {} - params[:descriptor_uris].each do |descriptor_uri| - lib = descriptor_uri.split('/')[-2] - descriptors[lib.to_sym] ||= [] - descriptors[lib.to_sym] += DESCRIPTORS[lib.to_sym].select{|d| d[:uri] == descriptor_uri} - end - else - descriptors = DESCRIPTORS - end - if params[:compound_uri] - compounds = [ Compound.new(params[:compound_uri], @subjectid) ] - elsif params[:dataset_uri] - compounds = Dataset.new(params[:dataset_uri], @subjectid).compounds + dataset.put + dataset.uri end - [:openbabel, :cdk, :joelib].each{ |lib| send lib, compounds, descriptors[lib] if descriptors[lib] } - @feature_dataset.put - @feature_dataset.uri + response['Content-Type'] = 'text/uri-list' + halt 202,task.uri + else + bad_request_error "Please provide a dataset_uri or compound_uri parameter", @uri end - response['Content-Type'] = 'text/uri-list' - halt 202, task.uri end -=end end diff --git a/java/CdkDescriptors.class b/java/CdkDescriptors.class Binary files differindex c2dc069..34b973f 100644 --- a/java/CdkDescriptors.class +++ b/java/CdkDescriptors.class diff --git a/java/CdkDescriptors.java b/java/CdkDescriptors.java index 50f6379..644e3d8 100644 --- a/java/CdkDescriptors.java +++ b/java/CdkDescriptors.java @@ -11,19 +11,15 @@ class CdkDescriptors { // parse command line arguments > 1 (descriptors) DescriptorEngine engine; - if (args.length > 1) { - List<String> classNames = new ArrayList<String>(); - for (int i =1; i < args.length; i++) { - classNames.add("org.openscience.cdk.qsar.descriptors.molecular." + args[i] + "Descriptor"); - } - engine = new DescriptorEngine(classNames); - List<IDescriptor> instances = engine.instantiateDescriptors(classNames); - List<DescriptorSpecification> specs = engine.initializeSpecifications(instances); - engine.setDescriptorInstances(instances); - engine.setDescriptorSpecifications(specs); - } else { - engine = new DescriptorEngine(DescriptorEngine.MOLECULAR); + List<String> classNames = new ArrayList<String>(); + for (int i =1; i < args.length; i++) { + classNames.add("org.openscience.cdk.qsar.descriptors.molecular." + args[i] + "Descriptor"); } + engine = new DescriptorEngine(classNames); + List<IDescriptor> instances = engine.instantiateDescriptors(classNames); + List<DescriptorSpecification> specs = engine.initializeSpecifications(instances); + engine.setDescriptorInstances(instances); + engine.setDescriptorSpecifications(specs); try { BufferedReader br = new BufferedReader(new FileReader(args[0])); @@ -34,26 +30,22 @@ class CdkDescriptors { try { IMolecule molecule = (IMolecule)reader.next(); engine.process(molecule); - Iterator it = molecule.getProperties().values().iterator(); + Map<Object,Object> properties = molecule.getProperties(); Boolean first = true; - while (it.hasNext()) { + for (Map.Entry<Object, Object> entry : properties.entrySet()) { try { - DescriptorValue value = (DescriptorValue)it.next(); - int size = value.getValue().length(); - if (size == 1) { - if (first) { yaml.print("- "); } - else { yaml.print(" "); } - yaml.println(":"+value.getNames()[0].toString() + ": " + value.getValue()); - first = false; - } - else { + if ((entry.getKey() instanceof DescriptorSpecification) && (entry.getValue() instanceof DescriptorValue)) { + DescriptorSpecification property = (DescriptorSpecification)entry.getKey(); + DescriptorValue value = (DescriptorValue)entry.getValue(); String[] values = value.getValue().toString().split(","); - for (int i = 0; i < size; i++) { - if (first) { yaml.print("- "); } + for (int i = 0; i < values.length; i++) { + if (first) { yaml.print("- "); first = false; } else { yaml.print(" "); } - yaml.println(":"+value.getNames()[i].toString() + ": " + values[i]); - first = false; + String cdk_class = property.getImplementationTitle(); + String name = cdk_class.substring(cdk_class.lastIndexOf(".")+1).replace("Descriptor",""); + yaml.println("Cdk." + name + "." + value.getNames()[i] + ": " + values[i]); } + } } catch (ClassCastException e) { } // sdf properties are stored as molecules properties (strings), ignore them diff --git a/java/JoelibDescriptors.class b/java/JoelibDescriptors.class Binary files differindex 7f3eced..d88ac63 100644 --- a/java/JoelibDescriptors.class +++ b/java/JoelibDescriptors.class diff --git a/java/JoelibDescriptors.java b/java/JoelibDescriptors.java index 64c099e..e90e35f 100644 --- a/java/JoelibDescriptors.java +++ b/java/JoelibDescriptors.java @@ -16,14 +16,8 @@ class JoelibDescriptors { public static void main(String[] args) { String[] features = null; - // set features to all descriptors - if (args.length == 1) { - FeatureHelper helper = FeatureHelper.instance(); - features = (String[]) helper.getNativeFeatures().toArray(new String[0]); - } else { - features = new String[args.length-1]; - System.arraycopy(args,1,features,0,args.length-1); - } + features = new String[args.length-1]; + System.arraycopy(args,1,features,0,args.length-1); FeatureFactory factory = FeatureFactory.instance(); MoleculeFileIO loader = null; @@ -46,7 +40,7 @@ class JoelibDescriptors { FeatureResult result = feature.calculate(mol); if (i == 0) { yaml.print("- "); } else { yaml.print(" "); } - yaml.print( features[i]+": " ); + yaml.print( "Joelib."+features[i]+": " ); yaml.println( result.toString() ); } diff --git a/lib/descriptor.rb b/lib/descriptor.rb index 8c8129c..8bc75ac 100644 --- a/lib/descriptor.rb +++ b/lib/descriptor.rb @@ -12,32 +12,39 @@ module OpenTox LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar") JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar") -=begin - def initialize uri, subjectid - super uri, subjectid - @parameters = [ - { RDF::DC.description => "Dataset URI", - RDF::OT.paramScope => "optional", - RDF::DC.title => "dataset_uri" } , - { RDF::DC.description => "Compound URI", - RDF::OT.paramScope => "optional", - RDF::DC.title => "compound_uri" } - ] - tokens = uri.split %r{/} - @metadata = { - RDF::DC.title => "#{tokens[-2].capitalize} #{tokens[-1]}", - RDF.type => [RDF::OT.Algorithm, RDF::OTA.DescriptorCalculation], - } - end -=end - - def self.list - list = OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect{|line| "/openbabel/#{line.split(/\s+/).first}" } - list += YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`).collect{|d| "cdk/#{d[:java_class].split('.').last.sub(/Descriptor/,'')}" } - joelib = YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`) # strip Joelib messages at stdout - # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug) - list += joelib.collect{|d| "joelib/#{d[:java_class].split('.').last}" unless d[:java_class] == "joelib2.feature.types.MoleculeHashcode" or d[:java_class] == "joelib2.feature.types.GlobalTopologicalChargeIndex"}.compact - list.collect{|item| File.join "descriptor",item} + obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title"] + OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| + name,description = d.split(/\s+/,2) + ["Openbabel."+name,description] unless obexclude.include? name + end.compact.sort{|a,b| a[0] <=> b[0]}] + + CDKDESCRIPTORS = Hash[YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`).collect { |d| ["Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}] + + # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug) + joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"] + # strip Joelib messages from stdout + JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d| + name = d[:java_class].sub(/^joelib2.feature.types./,'') + # impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java + ["Joelib."+name, "no description available"] unless joelibexclude.include? name + end.compact.sort{|a,b| a[0] <=> b[0]}] + + DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS)) + require_relative "unique_descriptors.rb" + + def self.description descriptor + lib = descriptor.split('.').first + case lib + when "Openbabel" + OBDESCRIPTORS[descriptor] + when "Cdk" + name = descriptor.split('.')[0..-2].join('.') + CDKDESCRIPTORS[name] + when "Joelib" + JOELIBDESCRIPTORS[descriptor] + when "lookup" + "Read feature values from a dataset" + end end def self.smarts_match compounds, smarts, count=false @@ -71,16 +78,17 @@ module OpenTox def self.physchem compounds, descriptors des = {} descriptors.each do |d| - lib, descriptor = d.split(".") - des[lib.to_sym] ||= [] - des[lib.to_sym] << descriptor + lib, descriptor = d.split(".",2) + lib = lib.downcase.to_sym + des[lib] ||= [] + des[lib] << descriptor end result = {} des.each do |lib,d| send(lib, compounds, d).each do |compound,values| result[compound] ||= {} result[compound].merge! values - end + end end result end @@ -95,7 +103,7 @@ module OpenTox obconversion.read_string obmol, compound.inchi fingerprint[compound] = {} obdescriptors.each_with_index do |descriptor,i| - fingerprint[compound][descriptors[i]] = fix_value(descriptor.predict(obmol)) + fingerprint[compound]["Openbabel."+descriptors[i]] = fix_value(descriptor.predict(obmol)) end end fingerprint @@ -109,7 +117,7 @@ module OpenTox fingerprint = {} YAML.load_file(sdf+"cdk.yaml").each_with_index do |calculation,i| $logger.error "Descriptor calculation failed for compound #{compounds[i].uri}." if calculation.empty? - descriptors.each_with_index do |descriptor,j| + descriptors.each do |descriptor| fingerprint[compounds[i]] = calculation end end @@ -125,7 +133,7 @@ module OpenTox fingerprint = {} YAML.load_file(sdf+"joelib.yaml").each_with_index do |calculation,i| $logger.error "Descriptor calculation failed for compound #{compounds[i].uri}." if calculation.empty? - descriptors.each_with_index do |descriptor,j| + descriptors.each do |descriptor| fingerprint[compounds[i]] = calculation end end @@ -186,44 +194,3 @@ module OpenTox end end end -=begin - class Set - - def initialize params - bad_request_error "Please provide a compound_uri or dataset_uri parameter." unless params[:compound_uri] or params[:dataset_uri] - @dataset = OpenTox::Dataset.new params[:dataset_uri] - @compound = OpenTox::Compound.new params[:compound_uri] - @descriptors = [] - - end - - def calculate - end - - end - - class Openbabel - include Descriptor - - def initialize uri, subjectid=nil - descriptor = OpenBabel::OBDescriptor.find_type(uri.split("/").last) - bad_request_error "Unknown descriptor #{uri}. See #{File.join $algorithm[:uri], "descriptor"} for a list of supported descriptors.", uri unless descriptor - super uri, subjectid - @metadata[RDF::DC.description] = descriptor.description.split("\n").first - @obmol = OpenBabel::OBMol.new - @obconversion = OpenBabel::OBConversion.new - @obconversion.set_in_format 'inchi' - end - - def self.all - OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| - title = d.split(/\s+/).first - unless title =~ /cansmi|formula|InChI|smarts|title/ or title == "s" - File.join $algorithm[:uri], "descriptor/openbabel" ,title - end - end.compact.sort{|a,b| a.upcase <=> b.upcase} - end - - - end -=end diff --git a/lib/lazar.rb b/lib/lazar.rb index d2eba5c..f88c695 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -184,8 +184,6 @@ module OpenTox if @compound_uri # add neighbors only for compound predictions @neighbors.each do |neighbor| - puts "Neighbor" - puts neighbor.inspect n = neighbor[:compound] @prediction_feature.feature_type == "classification" ? a = @prediction_feature.value_map[neighbor[:activity]] : a = neighbor[:activity] @prediction_dataset.add_data_entry n, @prediction_feature, a @@ -195,7 +193,6 @@ module OpenTox end end # iteration over compounds - puts prediction_dataset.to_turtle @prediction_dataset.put @prediction_dataset diff --git a/lib/transform.rb b/lib/transform.rb index 7b92df5..cb4fc87 100644 --- a/lib/transform.rb +++ b/lib/transform.rb @@ -401,8 +401,6 @@ module OpenTox # Same for compound fingerprints. def get_matrices @compounds = @model.training_compounds - puts @compounds.inspect - #@fingerprints = @model.fingerprints @activities = @model.training_activities @n_prop = @model.training_fingerprints @q_prop = @model.query_fingerprint diff --git a/lib/unique_descriptors.rb b/lib/unique_descriptors.rb new file mode 100644 index 0000000..676f34a --- /dev/null +++ b/lib/unique_descriptors.rb @@ -0,0 +1,120 @@ +# set of non redundant descriptors, faster algorithms are preferred +# TODO: +# select logP algorithm +# select l5 algorithm +# use smarts matcher for atom counts +# check correlations +UNIQUEDESCRIPTORS = [ + "Openbabel.abonds", #Number of aromatic bonds + "Openbabel.atoms", #Number of atoms + "Openbabel.bonds", #Number of bonds + "Openbabel.dbonds", #Number of double bonds + "Openbabel.HBA1", #Number of Hydrogen Bond Acceptors 1 (JoelLib) + "Openbabel.HBA2", #Number of Hydrogen Bond Acceptors 2 (JoelLib) + "Openbabel.HBD", #Number of Hydrogen Bond Donors (JoelLib) + "Openbabel.L5", #Lipinski Rule of Five + "Openbabel.logP", #octanol/water partition coefficient + "Openbabel.MP", #Melting point + "Openbabel.MR", #molar refractivity + "Openbabel.MW", #Molecular Weight filter + "Openbabel.nF", #Number of Fluorine Atoms + "Openbabel.sbonds", #Number of single bonds + "Openbabel.tbonds", #Number of triple bonds + "Openbabel.TPSA", #topological polar surface area + "Cdk.ALOGP", #Calculates atom additive logP and molar refractivity values as described by Ghose and Crippen and + "Cdk.APol", #Descriptor that calculates the sum of the atomic polarizabilities (including implicit hydrogens). + "Cdk.AcidicGroupCount", #Returns the number of acidic groups. + "Cdk.AminoAcidCount", #Returns the number of amino acids found in the system + #"Cdk.AromaticAtomsCount", #Descriptor based on the number of aromatic atoms of a molecule. + #"Cdk.AromaticBondsCount", #Descriptor based on the number of aromatic bonds of a molecule. + #"Cdk.AtomCount", #Descriptor based on the number of atoms of a certain element type. + "Cdk.AutocorrelationCharge", #The Moreau-Broto autocorrelation descriptors using partial charges + "Cdk.AutocorrelationMass", #The Moreau-Broto autocorrelation descriptors using atomic weight + "Cdk.AutocorrelationPolarizability", #The Moreau-Broto autocorrelation descriptors using polarizability + "Cdk.BCUT", #Eigenvalue based descriptor noted for its utility in chemical diversity described by Pearlman et al. . + "Cdk.BPol", #Descriptor that calculates the sum of the absolute value of the difference between atomic polarizabilities of all bonded atoms in the molecule (including implicit hydrogens). + "Cdk.BasicGroupCount", #Returns the number of basic groups. + #"Cdk.BondCount", #Descriptor based on the number of bonds of a certain bond order. + "Cdk.CPSA", #A variety of descriptors combining surface area and partial charge information + "Cdk.CarbonTypes", #Characterizes the carbon connectivity in terms of hybridization + "Cdk.ChiChain", #Evaluates the Kier & Hall Chi chain indices of orders 3,4,5 and 6 + "Cdk.ChiCluster", #Evaluates the Kier & Hall Chi cluster indices of orders 3,4,5,6 and 7 + "Cdk.ChiPathCluster", #Evaluates the Kier & Hall Chi path cluster indices of orders 4,5 and 6 + "Cdk.ChiPath", #Evaluates the Kier & Hall Chi path indices of orders 0,1,2,3,4,5,6 and 7 + "Cdk.EccentricConnectivityIndex", #A topological descriptor combining distance and adjacency information. + "Cdk.FMF", #Descriptor characterizing molecular complexity in terms of its Murcko framework + "Cdk.FragmentComplexity", #Class that returns the complexity of a system. The complexity is defined as @cdk.cite{Nilakantan06} + "Cdk.GravitationalIndex", #Descriptor characterizing the mass distribution of the molecule. + #"Cdk.HBondAcceptorCount", #Descriptor that calculates the number of hydrogen bond acceptors. + #"Cdk.HBondDonorCount", #Descriptor that calculates the number of hydrogen bond donors. + "Cdk.HybridizationRatio", #Characterizes molecular complexity in terms of carbon hybridization states. + "Cdk.IPMolecularLearning", #Descriptor that evaluates the ionization potential. + "Cdk.KappaShapeIndices", #Descriptor that calculates Kier and Hall kappa molecular shape indices. + "Cdk.KierHallSmarts", #Counts the number of occurrences of the E-state fragments + "Cdk.LargestChain", #Returns the number of atoms in the largest chain + "Cdk.LargestPiSystem", #Returns the number of atoms in the largest pi chain + "Cdk.LengthOverBreadth", #Calculates the ratio of length to breadth. + "Cdk.LongestAliphaticChain", #Returns the number of atoms in the longest aliphatic chain + "Cdk.MDE", #Evaluate molecular distance edge descriptors for C, N and O + "Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms . + "Cdk.MomentOfInertia", #Descriptor that calculates the principal moments of inertia and ratios of the principal moments. Als calculates the radius of gyration. + "Cdk.PetitjeanNumber", #Descriptor that calculates the Petitjean Number of a molecule. + "Cdk.PetitjeanShapeIndex", #The topological and geometric shape indices described Petitjean and Bath et al. respectively. Both measure the anisotropy in a molecule. + "Cdk.RotatableBondsCount", #Descriptor that calculates the number of nonrotatable bonds on a molecule. + #"Cdk.RuleOfFive", #This Class contains a method that returns the number failures of the Lipinski's Rule Of Five. + #"Cdk.TPSA", #Calculation of topological polar surface area based on fragment contributions . + "Cdk.VABC", #Describes the volume of a molecule. + "Cdk.VAdjMa", #Descriptor that calculates the vertex adjacency information of a molecule. + "Cdk.WHIM", #Holistic descriptors described by Todeschini et al . + #"Cdk.Weight", #Descriptor based on the weight of atoms of a certain element type. If no element is specified, the returned value is the Molecular Weight + "Cdk.WeightedPath", #The weighted path (molecular ID) descriptors described by Randic. They characterize molecular branching. + "Cdk.WienerNumbers", #This class calculates Wiener path number and Wiener polarity number. + "Cdk.XLogP", #Prediction of logP based on the atom-type method called XLogP. + "Cdk.ZagrebIndex", #The sum of the squared atom degrees of all heavy atoms. + "Joelib.count.NumberOfS", #no description available + "Joelib.count.NumberOfP", #no description available + "Joelib.count.NumberOfO", #no description available + "Joelib.count.NumberOfN", #no description available + #"Joelib.count.AromaticBonds", #no description available + "Joelib.count.NumberOfI", #no description available + "Joelib.count.NumberOfF", #no description available + "Joelib.count.NumberOfC", #no description available + "Joelib.count.NumberOfB", #no description available + "Joelib.count.HydrophobicGroups", #no description available + #"Joelib.KierShape3", #no description available + #"Joelib.KierShape2", #no description available + #"Joelib.KierShape1", #no description available + #"Joelib.count.AcidicGroups", #no description available + "Joelib.count.AliphaticOHGroups", #no description available + #"Joelib.count.NumberOfAtoms", #no description available + "Joelib.TopologicalRadius", #no description available + "Joelib.GeometricalShapeCoefficient", #no description available + #"Joelib.MolecularWeight", #no description available + "Joelib.FractionRotatableBonds", #no description available + #"Joelib.count.HBD2", #no description available + #"Joelib.count.HBD1", #no description available + "Joelib.LogP", #no description available + "Joelib.GraphShapeCoefficient", #no description available + "Joelib.count.BasicGroups", #no description available + #"Joelib.count.RotatableBonds", #no description available + "Joelib.count.HeavyBonds", #no description available + "Joelib.PolarSurfaceArea", #no description available + #"Joelib.ZagrebIndex1", #no description available + "Joelib.GeometricalRadius", #no description available + "Joelib.count.SO2Groups", #no description available + "Joelib.count.AromaticOHGroups", #no description available + "Joelib.GeometricalDiameter", #no description available + #"Joelib.MolarRefractivity", #no description available + "Joelib.count.NumberOfCl", #no description available + "Joelib.count.OSOGroups", #no description available + "Joelib.count.NumberOfBr", #no description available + "Joelib.count.NO2Groups", #no description available + "Joelib.count.HeteroCycles", #no description available + #"Joelib.count.HBA2", #no description available + #"Joelib.count.HBA1", #no description available + #"Joelib.count.NumberOfBonds", #no description available + "Joelib.count.SOGroups", #no description available + "Joelib.TopologicalDiameter", #no description available + "Joelib.count.NumberOfHal", #no description available + +].sort |