summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2013-07-04 16:38:34 +0200
committer Christoph Helma <helma@in-silico.ch> 2013-07-04 16:38:34 +0200
commit 23ac5ebadb9eabc666cbc322ab03dbc61937d5dd (patch)
tree dc1a45419c403e011645124249010750df8cfef5
parent 404c644ee52bac97ff737fcb2057df3f1ec18b76 (diff)
descriptor calculation fixed
-rw-r--r-- application.rb 8
-rw-r--r-- descriptor.rb 451
-rw-r--r-- java/CdkDescriptors.class bin 3401 -> 3781 bytes
-rw-r--r-- java/CdkDescriptors.java 46
-rw-r--r-- java/JoelibDescriptors.class bin 2831 -> 2578 bytes
-rw-r--r-- java/JoelibDescriptors.java 12
-rw-r--r-- lib/descriptor.rb 115
-rw-r--r-- lib/lazar.rb 3
-rw-r--r-- lib/transform.rb 2
-rw-r--r-- lib/unique_descriptors.rb 120
10 files changed, 279 insertions, 478 deletions
diff --git a/application.rb b/application.rb
index 547da08..9f1f12f 100644
--- a/application.rb
+++ b/application.rb
@@ -33,15 +33,11 @@ module OpenTox
end
get '/?' do
- list = [ to('/lazar', :full),
+ render [ to('/lazar', :full),
to('/fminer/bbrc', :full),
- #to('/fminer/bbrc/sample', :full),
to('/fminer/last', :full),
- #to('/fminer/bbrc/match', :full),
- #to('/fminer/last/match', :full),
- to('/feature-selection/recursive-feature-elimination', :full),
+ #to('/feature-selection/recursive-feature-elimination', :full),
to('/descriptor') ].join("\n") + "\n"
- render list
end
end
end
diff --git a/descriptor.rb b/descriptor.rb
index 13a6fa3..ce3ec54 100644
--- a/descriptor.rb
+++ b/descriptor.rb
@@ -7,389 +7,126 @@ module OpenTox
class Application < Service
- before '/descriptor/:method' do
- if params[:compound_uri]
- @compounds = [params[:compound_uri]].flatten.collect{|u| OpenTox::Compound.new u}
- elsif params[:dataset_uri]
- @compounds = OpenTox::Dataset.new(params[:dataset_uri], @subjectid).compounds
- else
- bad_request_error "Please provide a dataset_uri or compound_uri parameter", @uri
- end
-=begin
- new_params = {}
- delete = []
- params.each do |k,v|
- if k.match(/_uri$/)
- klass = k.sub(/_uri$/,'')
- v = [v] if v.is_a? String
- new_params[klass] = v.collect{|u| OpenTox.const_get(klass.capitalize).new(u)}
- delete << k
- end
- end
- delete.each{|k| params.delete k}
- params.merge! new_params
-=end
- end
-=begin
- before '/descriptor/:lib/:descriptor/?' do
- #if request.get?
- #lib = @uri.split("/")[-2].capitalize
- @klass = OpenTox::Descriptor.const_get params[:lib].capitalize
- #@algorithm = klass.new @uri, @subjectid unless params[:lib] == "smarts"
- @method = params[:descriptor].to_sym
- elsif request.post?
- @feature_dataset = Dataset.new nil, @subjectid
- @feature_dataset.metadata = {
- RDF::DC.title => "Physico-chemical descriptors",
- RDF::DC.creator => @uri,
- RDF::OT.hasSource => @uri,
+ before '/descriptor/:method/?' do
+ if request.get?
+ @algorithm = OpenTox::Algorithm::Descriptor.new @uri
+ @algorithm.parameters = [ {
+ RDF::DC.description => "Dataset URI",
+ RDF::OT.paramScope => "optional",
+ RDF::DC.title => "dataset_uri"
+ },{
+ RDF::DC.description => "Compound URI",
+ RDF::OT.paramScope => "optional",
+ RDF::DC.title => "compound_uri"
+ } ]
+ @algorithm.metadata = {
+ RDF.type => [RDF::OT.Algorithm, RDF::OTA.DescriptorCalculation],
}
- if params[:compound_uri]
- @feature_dataset.parameters = [ { RDF::DC.title => "compound_uri", RDF::OT.paramValue => params[:compound_uri] }]
- elsif params[:dataset_uri]
- @feature_dataset.parameters = [ { RDF::DC.title => "dataset_uri", RDF::OT.paramValue => params[:dataset_uri] }]
- else
- bad_request_error "Please provide a dataset_uri or compound_uri parameter", @uri
- end
end
end
-=end
+
get '/descriptor/?' do
- #OpenTox::Algorithm::Descriptor.list.collect{|d| uri d}.join "\n"
- OpenTox::Algorithm::Descriptor.list.join "\n"
- #OpenTox::Algorithm::Descriptor.list.inspect
+ render [ uri('/descriptor/physchem'), uri('/descriptor/smarts_match'), uri('/descriptor/smarts_count'), uri('/descriptor/lookup')].sort
end
- post '/descriptor/:method' do
- puts params.inspect
- bad_request_error "Please provide 'descriptors' parameters.", @uri unless params["descriptors"]
- if params[:compound_uri]
- result = OpenTox::Algorithm::Descriptor.send(params[:method].to_sym, @compounds, params["descriptors"])
- Hash[result.map {|compound, v| [compound.uri, v] }].to_json
- elsif params[:dataset_uri]
- puts "starting task"
- task = OpenTox::Task.run("Calculating #{params[:method]} descriptors for dataset #{params[:dataset_uri]}.", @uri, @subjectid) do |task|
- puts "start calculation"
- result = OpenTox::Algorithm::Descriptor.send(params[:method].to_sym, @compounds, params["descriptors"])
- puts "create dataset"
- puts result.inspect
- dataset = OpenTox::Dataset.new nil, @subjectid
- @compounds.each do |compound|
- @features ||= result[compound].keys.collect{|name|
- # TODO set other metadata
- OpenTox::Feature.find_or_create({RDF::DC.title => name}, @subjectid)
- }
- @features.each do |feature|
- value = result[compound][feature.title]
- puts compound, feature, value if value
- dataset.add_data_entry compound, feature, value if value
- end
- end
- puts "put dataset"
- dataset.put
- puts "dataset stored"
- dataset.uri
- end
- puts "Task"
- puts task.uri
- response['Content-Type'] = 'text/uri-list'
- halt 202,task.uri
- end
+ get '/descriptor/smarts_match/?' do
+ @algorithm.parameters += [ {
+ RDF::DC.description => "SMARTS strings",
+ RDF::OT.paramScope => "mandatory",
+ RDF::DC.title => "descriptors"
+ } ]
+ @algorithm.metadata[RDF::DC.title] = "SMARTS matcher"
+ render @algorithm
end
-=begin
- post '/descriptor/smarts_match/?' do
- bad_request_error "Please provide a compound_uri or dataset_uri parameter and a smarts parameter. The count parameter is optional and defaults to false." unless (params[:compound_uri] or params[:dataset_uri]) and params[:smarts]
- params[:count] ? params[:count] = params[:count].to_boolean : params[:count] = false
- if params[:compound_uri]
- params[:compound_uri] = [ params[:compound_uri] ] unless params[:compound_uri].is_a? Array
- response['Content-Type'] = "application/json"
- OpenTox::Algorithm::Descriptor.smarts_match(params[:compound_uri].collect{|c| OpenTox::Compound.new c}, params[:smarts], params[:count]).to_json
- elsif params[:dataset_uri]
- task = OpenTox::Task.run("Calculating Smarts #{method} for dataset #{params[:dataset_uri]}.", @uri, @subjectid) do |task|
- compounds = OpenTox::Dataset.new params[:dataset_uri]
- matches = OpenTox::Descriptor::Smarts.fingerprint(compounds, params[:smarts], params[:count])
- end
- response['Content-Type'] = 'text/uri-list'
- halt 202,task.uri
- end
+ get '/descriptor/smarts_count/?' do
+ @algorithm.parameters += [ {
+ RDF::DC.description => "Counts SMARTS matches",
+ RDF::OT.paramScope => "mandatory",
+ RDF::DC.title => "descriptors"
+ } ]
+ @algorithm.metadata[RDF::DC.title] = "SMARTS count"
+ render @algorithm
end
- post '/descriptor/smarts_count/?' do
- bad_request_error "Please provide a compound_uri or dataset_uri parameter and a smarts parameter. The count parameter is optional and defaults to false." unless (params[:compound_uri] or params[:dataset_uri]) and params[:smarts]
- params[:count] ? params[:count] = params[:count].to_boolean : params[:count] = false
- if params[:compound_uri]
- params[:compound_uri] = [ params[:compound_uri] ] unless params[:compound_uri].is_a? Array
- response['Content-Type'] = "application/json"
- OpenTox::Algorithm::Descriptor.smarts_count(params[:compound_uri].collect{|c| OpenTox::Compound.new c}, params[:smarts]).to_json
- elsif params[:dataset_uri]
- task = OpenTox::Task.run("Calculating Smarts #{method} for dataset #{params[:dataset_uri]}.", @uri, @subjectid) do |task|
- compounds = OpenTox::Dataset.new params[:dataset_uri]
- matches = OpenTox::Descriptor::Smarts.fingerprint(compounds, params[:smarts], params[:count])
- end
- response['Content-Type'] = 'text/uri-list'
- halt 202,task.uri
- end
+ get '/descriptor/physchem/?' do
+ @algorithm.parameters += [ {
+ RDF::DC.description => "Physical-chemical descriptors (see #{File.join @uri, 'list'} for a list of supported parameters)",
+ RDF::OT.paramScope => "mandatory",
+ RDF::DC.title => "descriptors"
+ } ]
+ @algorithm.metadata[RDF::DC.title] = "Physical-chemical descriptors"
+ render @algorithm
end
-=end
-=begin
- # Get a list of descriptor calculation
- # @return [text/uri-list] URIs
- get '/descriptor/?' do
- #uris = ["Openbabel","Cdk","Joelib"].collect do |lib|
- uris = ["Openbabel"].collect do |lib|
- klass = OpenTox::Descriptor.const_get lib
- klass.all
- end.flatten
- render uris
+ get '/descriptor/physchem/list/?' do
+ response['Content-Type'] = 'text/plain'
+ OpenTox::Algorithm::Descriptor::DESCRIPTORS.collect{|k,v| "#{k}\t#{v}"}.join "\n"
end
- get '/descriptor/:lib/?' do
- begin
- klass = OpenTox::Descriptor.const_get params[:lib].capitalize
- render klass.all
- rescue
- bad_request_error "Descriptor library '#{params[:lib]}' not found.", @uri
- end
+ get '/descriptor/physchem/unique/?' do
+ response['Content-Type'] = 'text/plain'
+ OpenTox::Algorithm::Descriptor::UNIQUEDESCRIPTORS.collect{|d| "#{d}\t#{OpenTox::Algorithm::Descriptor::DESCRIPTORS[d]}"}.join "\n"
end
- # Get representation of descriptor calculation
- # @return [String] Representation
- get '/descriptor/:lib/:descriptor/?' do
+ get '/descriptor/lookup/?' do
+ @algorithm.parameters += [ {
+ RDF::DC.description => "Read feature values from a dataset",
+ RDF::OT.paramScope => "mandatory",
+ RDF::DC.title => "feature_dataset_uri"
+ } ]
+ @algorithm.metadata[RDF::DC.title] = "Dataset lookup"
render @algorithm
end
- post '/descriptor/?' do
- descriptors = OpenTox::Descriptor::Set.new params
- if params[:compound_uri]
- compound = OpenTox::Compound.new params[:compound_uri]
- descriptors.calculate compound
- elsif params[:dataset_uri]
- task = OpenTox::Task.run("Calculating Smarts #{method} for dataset #{params[:dataset_uri]}.", @uri, @subjectid) do |task|
- dataset = OpenTox::Dataset.new params[:dataset_uri]
- descriptors.calculate dataset
- end
- response['Content-Type'] = 'text/uri-list'
- halt 202,task.uri
+ post '/descriptor/:method' do
+ puts params.inspect
+ if params[:method] == "physchem"
+ params[:descriptors] = OpenTox::Algorithm::Descriptor::UNIQUEDESCRIPTORS if !params[:descriptors] or params[:descriptors] == [""]
else
-
- end
- end
- #post '/descriptor/physchem/?' do
- #post '/descriptor/lookup/?' do
-
- # use /descriptor with dataset_uri and descriptor_uri parameters for efficient calculation of multiple compounds/descriptors
- post '/descriptor/:lib/:descriptor/?' do
- bad_request_error "Please provide a compound_uri parameter", @uri unless params[:compound_uri]
- params[:descriptor_uris] = [@uri]
- result = @algorithm.calculate(params)
- puts result.inspect
- result.to_json
- #compounds = [ Compound.new(params[:compound_uri], @subjectid) ]
- #send params[:lib].to_sym, compounds, @descriptors
- #@feature_dataset.put
- #@feature_dataset.uri
- end
-=end
-=begin
- ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk"
- JAVA_DIR = File.join(File.dirname(__FILE__),"java")
- CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last
- JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar")
- LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar")
- JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar")
-
- unless defined? DESCRIPTORS
-
- # initialize descriptors and features at startup to avoid duplication
- descriptors = { :cdk => [], :openbabel => [], :joelib => [] } # use arrays to keep the sequence intact
-
- @@obmol = OpenBabel::OBMol.new
- @@obconversion = OpenBabel::OBConversion.new
- @@obconversion.set_in_format 'inchi'
-
-
- # CDK
- cdk_descriptors = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`)
- cdk_descriptors.each do |descriptor|
- title = descriptor[:java_class].split('.').last.sub(/Descriptor/,'')
- descriptor[:title] = "Cdk " + title
- descriptor[:uri] = File.join $algorithm[:uri], "descriptor/cdk" ,title
- descriptor[:features] = []
- descriptor[:names].each do |name|
- descriptor[:features] << OpenTox::Feature.find_or_create({
- RDF::DC.title => "#{descriptor[:title]} #{name}",
- RDF.type => [RDF::OT.Feature, RDF::OT.NumericFeature],
- RDF::DC.description => descriptor[:description]
- }, @subjectid)
- end
- end
- descriptors[:cdk] = cdk_descriptors
-
- # Joelib
- joelib_descriptors = YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`) # strip Joelib messages at stdout
- joelib_descriptors.each do |descriptor|
- # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
- next if descriptor[:java_class] == "joelib2.feature.types.MoleculeHashcode" or descriptor[:java_class] == "joelib2.feature.types.GlobalTopologicalChargeIndex"
- title = descriptor[:java_class].split('.').last
- descriptor[:uri] = File.join $algorithm[:uri], "descriptor/joelib",title
- descriptor[:title] = "Joelib " + title
- descriptor[:feature] = OpenTox::Feature.find_or_create({
- RDF::DC.title => descriptor[:title],
- RDF.type => [RDF::OT.Feature, RDF::OT.NumericFeature],
- #RDF::DC.description => descriptor[:title], # impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java
- }, @subjectid)
- end
- descriptors[:joelib] = joelib_descriptors.select{|d| d[:title]}
-
- DESCRIPTORS = descriptors
-
- end
-
- helpers do
-
- def cdk compounds, descriptors
- sdf_3d compounds
- # use java system call (rjb blocks within tasks)
- # use Tempfiles to avoid "Argument list too long" error
- puts `java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{@sdf_file.path} #{descriptors.collect{|d| d[:title].split("\s").last}.join(" ")}`
- YAML.load_file(@sdf_file.path+"cdk.yaml").each_with_index do |calculation,i|
- $logger.error "Descriptor calculation failed for compound #{compounds[i].uri}." if calculation.empty?
- calculation.each do |name,value|
- feature = DESCRIPTORS[:cdk].collect{|d| d[:features]}.flatten.select{|f| f[RDF::DC.title].split("\s").last == name.to_s}.first
- @feature_dataset.add_data_entry compounds[i], feature, fix_value(value)
- end
- end
- end
-
- def joelib compounds, descriptors
- # use java system call (rjb blocks within tasks)
- # use Tempfiles to avoid "Argument list too long" error
- sdf_3d compounds
- puts `java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{@sdf_file.path} #{descriptors.collect{|d| d[:java_class]}.join(" ")}`
- YAML.load_file(@sdf_file.path+"joelib.yaml").each_with_index do |calculation,i|
- $logger.error "Descriptor calculation failed for compound #{compounds[i].uri}." if calculation.empty?
- calculation.each do |java_class,value|
- feature = DESCRIPTORS[:joelib].select{|d| d[:java_class] == java_class}.first[:feature]
- @feature_dataset.add_data_entry compounds[i], feature, fix_value(value)
- end
- end
+ bad_request_error "Please provide 'descriptors' parameters.", @uri unless params[:descriptors]
end
-
- def sdf_3d compounds
- unless @sdf_file and File.exists? @sdf_file.path
- @sdf_file = Tempfile.open("sdf")
- @@obconversion.set_out_format 'sdf'
- # create 3d sdf file (faster in Openbabel than in CDK)
- compounds.each do |compound|
- @@obconversion.read_string @@obmol, compound.inchi
- sdf_2d = @@obconversion.write_string(@@obmol)
- OpenBabel::OBOp.find_type("Gen3D").do(@@obmol)
- sdf_3d = @@obconversion.write_string(@@obmol)
- if sdf_3d.match(/.nan/)
- warning = "3D generation failed for compound #{compound.uri}, trying to calculate descriptors from 2D structure."
- $logger.warn warning
- @feature_dataset[RDF::OT.Warnings] ? @feature_dataset[RDF::OT.Warnings] << warning : @feature_dataset[RDF::OT.Warnings] = warning
- @sdf_file.puts sdf_2d
- else
- @sdf_file.puts sdf_3d
+ if params[:compound_uri] # return json
+ @compounds = [params[:compound_uri]].flatten.collect{|u| OpenTox::Compound.new u}
+ result = OpenTox::Algorithm::Descriptor.send(params[:method].to_sym, @compounds, params[:descriptors])
+ Hash[result.map {|compound, v| [compound.uri, v] }].to_json
+ elsif params[:dataset_uri] # return dataset
+ task = OpenTox::Task.run("Calculating #{params[:method]} descriptors for dataset #{params[:dataset_uri]}.", @uri, @subjectid) do |task|
+ @compounds = OpenTox::Dataset.new(params[:dataset_uri], SUBJECTID).compounds
+ result = OpenTox::Algorithm::Descriptor.send(params[:method].to_sym, @compounds, params[:descriptors])
+ dataset = OpenTox::Dataset.new nil, SUBJECTID
+ dataset.metadata = {
+ RDF::DC.title => "Physico-chemical descriptors",
+ RDF::DC.creator => @uri,
+ RDF::OT.hasSource => @uri,
+ }
+ dataset.parameters = [
+ { RDF::DC.title => "dataset_uri", RDF::OT.paramValue => params[:dataset_uri] },
+ { RDF::DC.title => "descriptors", RDF::OT.paramValue => params[:descriptors] },
+ ]
+ params[:method] == "smarts_match" ? feature_type = RDF::OT.NominalFeature : feature_type = RDF::OT.NumericFeature
+ @compounds.each do |compound|
+ @features ||= result[compound].keys.collect{|name|
+ OpenTox::Feature.find_or_create({
+ RDF::DC.title => name,
+ RDF.type => [RDF::OT.Feature, feature_type],
+ RDF::DC.description => OpenTox::Algorithm::Descriptor.description(name)
+ }, SUBJECTID)
+ }
+ @features.each do |feature|
+ value = result[compound][feature.title]
+ dataset.add_data_entry compound, feature, value if value
end
end
- @sdf_file.close
- end
- end
-<<<<<<< HEAD
-=======
-
- def fix_value val
- if val.numeric?
- val = Float(val)
- val = nil if val.nan? or val.infinite?
- else
- val = nil if val == "NaN"
- end
- val
- end
- end
-
- before '/descriptor/?*' do
- if request.get?
- @algorithm = OpenTox::Algorithm.new @uri, @subjectid
- @algorithm.parameters = [
- { RDF::DC.description => "Dataset URI",
- RDF::OT.paramScope => "optional",
- RDF::DC.title => "dataset_uri" } ,
- { RDF::DC.description => "Compound URI",
- RDF::OT.paramScope => "optional",
- RDF::DC.title => "compound_uri" }
- ]
- @algorithm.metadata = {
- RDF.type => [RDF::OTA.DescriptorCalculation],
- }
- elsif request.post?
- @feature_dataset = Dataset.new nil, @subjectid
- @feature_dataset.metadata = {
- RDF::DC.title => "Physico-chemical descriptors",
- RDF::DC.creator => @uri,
- RDF::OT.hasSource => @uri,
- }
- if params[:compound_uri]
- @feature_dataset.parameters = [ { RDF::DC.title => "compound_uri", RDF::OT.paramValue => params[:compound_uri] }]
- elsif params[:dataset_uri]
- @feature_dataset.parameters = [ { RDF::DC.title => "dataset_uri", RDF::OT.paramValue => params[:dataset_uri] }]
- else
- bad_request_error "Please provide a dataset_uri or compound_uri parameter", @uri
- end
- end
->>>>>>> ad386110267ecc3e0c5301769b4880a7e555a44e
- end
-
- before '/descriptor/:lib/:descriptor/?' do
- @descriptors = DESCRIPTORS[params[:lib].to_sym].select{|d| d[:title].split(" ").last == params[:descriptor]}
- bad_request_error "Unknown descriptor #{@uri}. See #{uri('descriptor')} for a complete list of supported descriptors.", @uri if @descriptors.empty?
- @descriptor = @descriptors.first
- end
-
- after do # Tempfile cleanup
- if @sdf_file and File.exists? @sdf_file.path
- FileUtils.rm Dir["#{@sdf_file.path}*.yaml"]
- @sdf_file.unlink
- end
- @sdf_file = nil
- end
-
- # Get representation of descriptor calculation
- # @return [String] Representation
- get '/descriptor/:lib/:descriptor/?' do
- render @algorithm
- end
-
- post '/descriptor/?' do
- task = OpenTox::Task.run "Calculating PC descriptors", @uri, @subjectid do |task|
- if params[:descriptor_uris]
- descriptors = {}
- params[:descriptor_uris].each do |descriptor_uri|
- lib = descriptor_uri.split('/')[-2]
- descriptors[lib.to_sym] ||= []
- descriptors[lib.to_sym] += DESCRIPTORS[lib.to_sym].select{|d| d[:uri] == descriptor_uri}
- end
- else
- descriptors = DESCRIPTORS
- end
- if params[:compound_uri]
- compounds = [ Compound.new(params[:compound_uri], @subjectid) ]
- elsif params[:dataset_uri]
- compounds = Dataset.new(params[:dataset_uri], @subjectid).compounds
+ dataset.put
+ dataset.uri
end
- [:openbabel, :cdk, :joelib].each{ |lib| send lib, compounds, descriptors[lib] if descriptors[lib] }
- @feature_dataset.put
- @feature_dataset.uri
+ response['Content-Type'] = 'text/uri-list'
+ halt 202,task.uri
+ else
+ bad_request_error "Please provide a dataset_uri or compound_uri parameter", @uri
end
- response['Content-Type'] = 'text/uri-list'
- halt 202, task.uri
end
-=end
end
diff --git a/java/CdkDescriptors.class b/java/CdkDescriptors.class
index c2dc069..34b973f 100644
--- a/java/CdkDescriptors.class
+++ b/java/CdkDescriptors.class
Binary files differ
diff --git a/java/CdkDescriptors.java b/java/CdkDescriptors.java
index 50f6379..644e3d8 100644
--- a/java/CdkDescriptors.java
+++ b/java/CdkDescriptors.java
@@ -11,19 +11,15 @@ class CdkDescriptors {
// parse command line arguments > 1 (descriptors)
DescriptorEngine engine;
- if (args.length > 1) {
- List<String> classNames = new ArrayList<String>();
- for (int i =1; i < args.length; i++) {
- classNames.add("org.openscience.cdk.qsar.descriptors.molecular." + args[i] + "Descriptor");
- }
- engine = new DescriptorEngine(classNames);
- List<IDescriptor> instances = engine.instantiateDescriptors(classNames);
- List<DescriptorSpecification> specs = engine.initializeSpecifications(instances);
- engine.setDescriptorInstances(instances);
- engine.setDescriptorSpecifications(specs);
- } else {
- engine = new DescriptorEngine(DescriptorEngine.MOLECULAR);
+ List<String> classNames = new ArrayList<String>();
+ for (int i =1; i < args.length; i++) {
+ classNames.add("org.openscience.cdk.qsar.descriptors.molecular." + args[i] + "Descriptor");
}
+ engine = new DescriptorEngine(classNames);
+ List<IDescriptor> instances = engine.instantiateDescriptors(classNames);
+ List<DescriptorSpecification> specs = engine.initializeSpecifications(instances);
+ engine.setDescriptorInstances(instances);
+ engine.setDescriptorSpecifications(specs);
try {
BufferedReader br = new BufferedReader(new FileReader(args[0]));
@@ -34,26 +30,22 @@ class CdkDescriptors {
try {
IMolecule molecule = (IMolecule)reader.next();
engine.process(molecule);
- Iterator it = molecule.getProperties().values().iterator();
+ Map<Object,Object> properties = molecule.getProperties();
Boolean first = true;
- while (it.hasNext()) {
+ for (Map.Entry<Object, Object> entry : properties.entrySet()) {
try {
- DescriptorValue value = (DescriptorValue)it.next();
- int size = value.getValue().length();
- if (size == 1) {
- if (first) { yaml.print("- "); }
- else { yaml.print(" "); }
- yaml.println(":"+value.getNames()[0].toString() + ": " + value.getValue());
- first = false;
- }
- else {
+ if ((entry.getKey() instanceof DescriptorSpecification) && (entry.getValue() instanceof DescriptorValue)) {
+ DescriptorSpecification property = (DescriptorSpecification)entry.getKey();
+ DescriptorValue value = (DescriptorValue)entry.getValue();
String[] values = value.getValue().toString().split(",");
- for (int i = 0; i < size; i++) {
- if (first) { yaml.print("- "); }
+ for (int i = 0; i < values.length; i++) {
+ if (first) { yaml.print("- "); first = false; }
else { yaml.print(" "); }
- yaml.println(":"+value.getNames()[i].toString() + ": " + values[i]);
- first = false;
+ String cdk_class = property.getImplementationTitle();
+ String name = cdk_class.substring(cdk_class.lastIndexOf(".")+1).replace("Descriptor","");
+ yaml.println("Cdk." + name + "." + value.getNames()[i] + ": " + values[i]);
}
+
}
}
catch (ClassCastException e) { } // sdf properties are stored as molecules properties (strings), ignore them
diff --git a/java/JoelibDescriptors.class b/java/JoelibDescriptors.class
index 7f3eced..d88ac63 100644
--- a/java/JoelibDescriptors.class
+++ b/java/JoelibDescriptors.class
Binary files differ
diff --git a/java/JoelibDescriptors.java b/java/JoelibDescriptors.java
index 64c099e..e90e35f 100644
--- a/java/JoelibDescriptors.java
+++ b/java/JoelibDescriptors.java
@@ -16,14 +16,8 @@ class JoelibDescriptors {
public static void main(String[] args) {
String[] features = null;
- // set features to all descriptors
- if (args.length == 1) {
- FeatureHelper helper = FeatureHelper.instance();
- features = (String[]) helper.getNativeFeatures().toArray(new String[0]);
- } else {
- features = new String[args.length-1];
- System.arraycopy(args,1,features,0,args.length-1);
- }
+ features = new String[args.length-1];
+ System.arraycopy(args,1,features,0,args.length-1);
FeatureFactory factory = FeatureFactory.instance();
MoleculeFileIO loader = null;
@@ -46,7 +40,7 @@ class JoelibDescriptors {
FeatureResult result = feature.calculate(mol);
if (i == 0) { yaml.print("- "); }
else { yaml.print(" "); }
- yaml.print( features[i]+": " );
+ yaml.print( "Joelib."+features[i]+": " );
yaml.println( result.toString() );
}
diff --git a/lib/descriptor.rb b/lib/descriptor.rb
index 8c8129c..8bc75ac 100644
--- a/lib/descriptor.rb
+++ b/lib/descriptor.rb
@@ -12,32 +12,39 @@ module OpenTox
LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar")
JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar")
-=begin
- def initialize uri, subjectid
- super uri, subjectid
- @parameters = [
- { RDF::DC.description => "Dataset URI",
- RDF::OT.paramScope => "optional",
- RDF::DC.title => "dataset_uri" } ,
- { RDF::DC.description => "Compound URI",
- RDF::OT.paramScope => "optional",
- RDF::DC.title => "compound_uri" }
- ]
- tokens = uri.split %r{/}
- @metadata = {
- RDF::DC.title => "#{tokens[-2].capitalize} #{tokens[-1]}",
- RDF.type => [RDF::OT.Algorithm, RDF::OTA.DescriptorCalculation],
- }
- end
-=end
-
- def self.list
- list = OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect{|line| "/openbabel/#{line.split(/\s+/).first}" }
- list += YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`).collect{|d| "cdk/#{d[:java_class].split('.').last.sub(/Descriptor/,'')}" }
- joelib = YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`) # strip Joelib messages at stdout
- # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
- list += joelib.collect{|d| "joelib/#{d[:java_class].split('.').last}" unless d[:java_class] == "joelib2.feature.types.MoleculeHashcode" or d[:java_class] == "joelib2.feature.types.GlobalTopologicalChargeIndex"}.compact
- list.collect{|item| File.join "descriptor",item}
+ obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title"]
+ OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
+ name,description = d.split(/\s+/,2)
+ ["Openbabel."+name,description] unless obexclude.include? name
+ end.compact.sort{|a,b| a[0] <=> b[0]}]
+
+ CDKDESCRIPTORS = Hash[YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`).collect { |d| ["Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}]
+
+ # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
+ joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"]
+ # strip Joelib messages from stdout
+ JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
+ name = d[:java_class].sub(/^joelib2.feature.types./,'')
+ # impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java
+ ["Joelib."+name, "no description available"] unless joelibexclude.include? name
+ end.compact.sort{|a,b| a[0] <=> b[0]}]
+
+ DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS))
+ require_relative "unique_descriptors.rb"
+
+ def self.description descriptor
+ lib = descriptor.split('.').first
+ case lib
+ when "Openbabel"
+ OBDESCRIPTORS[descriptor]
+ when "Cdk"
+ name = descriptor.split('.')[0..-2].join('.')
+ CDKDESCRIPTORS[name]
+ when "Joelib"
+ JOELIBDESCRIPTORS[descriptor]
+ when "lookup"
+ "Read feature values from a dataset"
+ end
end
def self.smarts_match compounds, smarts, count=false
@@ -71,16 +78,17 @@ module OpenTox
def self.physchem compounds, descriptors
des = {}
descriptors.each do |d|
- lib, descriptor = d.split(".")
- des[lib.to_sym] ||= []
- des[lib.to_sym] << descriptor
+ lib, descriptor = d.split(".",2)
+ lib = lib.downcase.to_sym
+ des[lib] ||= []
+ des[lib] << descriptor
end
result = {}
des.each do |lib,d|
send(lib, compounds, d).each do |compound,values|
result[compound] ||= {}
result[compound].merge! values
- end
+ end
end
result
end
@@ -95,7 +103,7 @@ module OpenTox
obconversion.read_string obmol, compound.inchi
fingerprint[compound] = {}
obdescriptors.each_with_index do |descriptor,i|
- fingerprint[compound][descriptors[i]] = fix_value(descriptor.predict(obmol))
+ fingerprint[compound]["Openbabel."+descriptors[i]] = fix_value(descriptor.predict(obmol))
end
end
fingerprint
@@ -109,7 +117,7 @@ module OpenTox
fingerprint = {}
YAML.load_file(sdf+"cdk.yaml").each_with_index do |calculation,i|
$logger.error "Descriptor calculation failed for compound #{compounds[i].uri}." if calculation.empty?
- descriptors.each_with_index do |descriptor,j|
+ descriptors.each do |descriptor|
fingerprint[compounds[i]] = calculation
end
end
@@ -125,7 +133,7 @@ module OpenTox
fingerprint = {}
YAML.load_file(sdf+"joelib.yaml").each_with_index do |calculation,i|
$logger.error "Descriptor calculation failed for compound #{compounds[i].uri}." if calculation.empty?
- descriptors.each_with_index do |descriptor,j|
+ descriptors.each do |descriptor|
fingerprint[compounds[i]] = calculation
end
end
@@ -186,44 +194,3 @@ module OpenTox
end
end
end
-=begin
- class Set
-
- def initialize params
- bad_request_error "Please provide a compound_uri or dataset_uri parameter." unless params[:compound_uri] or params[:dataset_uri]
- @dataset = OpenTox::Dataset.new params[:dataset_uri]
- @compound = OpenTox::Compound.new params[:compound_uri]
- @descriptors = []
-
- end
-
- def calculate
- end
-
- end
-
- class Openbabel
- include Descriptor
-
- def initialize uri, subjectid=nil
- descriptor = OpenBabel::OBDescriptor.find_type(uri.split("/").last)
- bad_request_error "Unknown descriptor #{uri}. See #{File.join $algorithm[:uri], "descriptor"} for a list of supported descriptors.", uri unless descriptor
- super uri, subjectid
- @metadata[RDF::DC.description] = descriptor.description.split("\n").first
- @obmol = OpenBabel::OBMol.new
- @obconversion = OpenBabel::OBConversion.new
- @obconversion.set_in_format 'inchi'
- end
-
- def self.all
- OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
- title = d.split(/\s+/).first
- unless title =~ /cansmi|formula|InChI|smarts|title/ or title == "s"
- File.join $algorithm[:uri], "descriptor/openbabel" ,title
- end
- end.compact.sort{|a,b| a.upcase <=> b.upcase}
- end
-
-
- end
-=end
diff --git a/lib/lazar.rb b/lib/lazar.rb
index d2eba5c..f88c695 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -184,8 +184,6 @@ module OpenTox
if @compound_uri # add neighbors only for compound predictions
@neighbors.each do |neighbor|
- puts "Neighbor"
- puts neighbor.inspect
n = neighbor[:compound]
@prediction_feature.feature_type == "classification" ? a = @prediction_feature.value_map[neighbor[:activity]] : a = neighbor[:activity]
@prediction_dataset.add_data_entry n, @prediction_feature, a
@@ -195,7 +193,6 @@ module OpenTox
end
end # iteration over compounds
- puts prediction_dataset.to_turtle
@prediction_dataset.put
@prediction_dataset
diff --git a/lib/transform.rb b/lib/transform.rb
index 7b92df5..cb4fc87 100644
--- a/lib/transform.rb
+++ b/lib/transform.rb
@@ -401,8 +401,6 @@ module OpenTox
# Same for compound fingerprints.
def get_matrices
@compounds = @model.training_compounds
- puts @compounds.inspect
- #@fingerprints = @model.fingerprints
@activities = @model.training_activities
@n_prop = @model.training_fingerprints
@q_prop = @model.query_fingerprint
diff --git a/lib/unique_descriptors.rb b/lib/unique_descriptors.rb
new file mode 100644
index 0000000..676f34a
--- /dev/null
+++ b/lib/unique_descriptors.rb
@@ -0,0 +1,120 @@
+# set of non-redundant descriptors; faster algorithms are preferred
+# TODO:
+# select logP algorithm
+# select l5 algorithm
+# use smarts matcher for atom counts
+# check correlations
+UNIQUEDESCRIPTORS = [
+ "Openbabel.abonds", #Number of aromatic bonds
+ "Openbabel.atoms", #Number of atoms
+ "Openbabel.bonds", #Number of bonds
+ "Openbabel.dbonds", #Number of double bonds
+ "Openbabel.HBA1", #Number of Hydrogen Bond Acceptors 1 (JoelLib)
+ "Openbabel.HBA2", #Number of Hydrogen Bond Acceptors 2 (JoelLib)
+ "Openbabel.HBD", #Number of Hydrogen Bond Donors (JoelLib)
+ "Openbabel.L5", #Lipinski Rule of Five
+ "Openbabel.logP", #octanol/water partition coefficient
+ "Openbabel.MP", #Melting point
+ "Openbabel.MR", #molar refractivity
+ "Openbabel.MW", #Molecular Weight filter
+ "Openbabel.nF", #Number of Fluorine Atoms
+ "Openbabel.sbonds", #Number of single bonds
+ "Openbabel.tbonds", #Number of triple bonds
+ "Openbabel.TPSA", #topological polar surface area
+ "Cdk.ALOGP", #Calculates atom additive logP and molar refractivity values as described by Ghose and Crippen.
+ "Cdk.APol", #Descriptor that calculates the sum of the atomic polarizabilities (including implicit hydrogens).
+ "Cdk.AcidicGroupCount", #Returns the number of acidic groups.
+ "Cdk.AminoAcidCount", #Returns the number of amino acids found in the system
+ #"Cdk.AromaticAtomsCount", #Descriptor based on the number of aromatic atoms of a molecule.
+ #"Cdk.AromaticBondsCount", #Descriptor based on the number of aromatic bonds of a molecule.
+ #"Cdk.AtomCount", #Descriptor based on the number of atoms of a certain element type.
+ "Cdk.AutocorrelationCharge", #The Moreau-Broto autocorrelation descriptors using partial charges
+ "Cdk.AutocorrelationMass", #The Moreau-Broto autocorrelation descriptors using atomic weight
+ "Cdk.AutocorrelationPolarizability", #The Moreau-Broto autocorrelation descriptors using polarizability
+ "Cdk.BCUT", #Eigenvalue based descriptor noted for its utility in chemical diversity described by Pearlman et al.
+ "Cdk.BPol", #Descriptor that calculates the sum of the absolute value of the difference between atomic polarizabilities of all bonded atoms in the molecule (including implicit hydrogens).
+ "Cdk.BasicGroupCount", #Returns the number of basic groups.
+ #"Cdk.BondCount", #Descriptor based on the number of bonds of a certain bond order.
+ "Cdk.CPSA", #A variety of descriptors combining surface area and partial charge information
+ "Cdk.CarbonTypes", #Characterizes the carbon connectivity in terms of hybridization
+ "Cdk.ChiChain", #Evaluates the Kier & Hall Chi chain indices of orders 3,4,5 and 6
+ "Cdk.ChiCluster", #Evaluates the Kier & Hall Chi cluster indices of orders 3,4,5,6 and 7
+ "Cdk.ChiPathCluster", #Evaluates the Kier & Hall Chi path cluster indices of orders 4,5 and 6
+ "Cdk.ChiPath", #Evaluates the Kier & Hall Chi path indices of orders 0,1,2,3,4,5,6 and 7
+ "Cdk.EccentricConnectivityIndex", #A topological descriptor combining distance and adjacency information.
+ "Cdk.FMF", #Descriptor characterizing molecular complexity in terms of its Murcko framework
+ "Cdk.FragmentComplexity", #Class that returns the complexity of a system. The complexity is defined as in the reference CDK cites as Nilakantan06.
+ "Cdk.GravitationalIndex", #Descriptor characterizing the mass distribution of the molecule.
+ #"Cdk.HBondAcceptorCount", #Descriptor that calculates the number of hydrogen bond acceptors.
+ #"Cdk.HBondDonorCount", #Descriptor that calculates the number of hydrogen bond donors.
+ "Cdk.HybridizationRatio", #Characterizes molecular complexity in terms of carbon hybridization states.
+ "Cdk.IPMolecularLearning", #Descriptor that evaluates the ionization potential.
+ "Cdk.KappaShapeIndices", #Descriptor that calculates Kier and Hall kappa molecular shape indices.
+ "Cdk.KierHallSmarts", #Counts the number of occurrences of the E-state fragments
+ "Cdk.LargestChain", #Returns the number of atoms in the largest chain
+ "Cdk.LargestPiSystem", #Returns the number of atoms in the largest pi chain
+ "Cdk.LengthOverBreadth", #Calculates the ratio of length to breadth.
+ "Cdk.LongestAliphaticChain", #Returns the number of atoms in the longest aliphatic chain
+ "Cdk.MDE", #Evaluate molecular distance edge descriptors for C, N and O
+ "Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and heteroatoms.
+ "Cdk.MomentOfInertia", #Descriptor that calculates the principal moments of inertia and ratios of the principal moments. Also calculates the radius of gyration.
+ "Cdk.PetitjeanNumber", #Descriptor that calculates the Petitjean Number of a molecule.
+ "Cdk.PetitjeanShapeIndex", #The topological and geometric shape indices described by Petitjean and Bath et al., respectively. Both measure the anisotropy in a molecule.
+ "Cdk.RotatableBondsCount", #Descriptor that calculates the number of rotatable bonds in a molecule.
+ #"Cdk.RuleOfFive", #This class contains a method that returns the number of failures of Lipinski's Rule of Five.
+ #"Cdk.TPSA", #Calculation of topological polar surface area based on fragment contributions.
+ "Cdk.VABC", #Describes the volume of a molecule.
+ "Cdk.VAdjMa", #Descriptor that calculates the vertex adjacency information of a molecule.
+ "Cdk.WHIM", #Holistic descriptors described by Todeschini et al.
+ #"Cdk.Weight", #Descriptor based on the weight of atoms of a certain element type. If no element is specified, the returned value is the Molecular Weight
+ "Cdk.WeightedPath", #The weighted path (molecular ID) descriptors described by Randic. They characterize molecular branching.
+ "Cdk.WienerNumbers", #This class calculates Wiener path number and Wiener polarity number.
+ "Cdk.XLogP", #Prediction of logP based on the atom-type method called XLogP.
+ "Cdk.ZagrebIndex", #The sum of the squared atom degrees of all heavy atoms.
+ "Joelib.count.NumberOfS", #no description available
+ "Joelib.count.NumberOfP", #no description available
+ "Joelib.count.NumberOfO", #no description available
+ "Joelib.count.NumberOfN", #no description available
+ #"Joelib.count.AromaticBonds", #no description available
+ "Joelib.count.NumberOfI", #no description available
+ "Joelib.count.NumberOfF", #no description available
+ "Joelib.count.NumberOfC", #no description available
+ "Joelib.count.NumberOfB", #no description available
+ "Joelib.count.HydrophobicGroups", #no description available
+ #"Joelib.KierShape3", #no description available
+ #"Joelib.KierShape2", #no description available
+ #"Joelib.KierShape1", #no description available
+ #"Joelib.count.AcidicGroups", #no description available
+ "Joelib.count.AliphaticOHGroups", #no description available
+ #"Joelib.count.NumberOfAtoms", #no description available
+ "Joelib.TopologicalRadius", #no description available
+ "Joelib.GeometricalShapeCoefficient", #no description available
+ #"Joelib.MolecularWeight", #no description available
+ "Joelib.FractionRotatableBonds", #no description available
+ #"Joelib.count.HBD2", #no description available
+ #"Joelib.count.HBD1", #no description available
+ "Joelib.LogP", #no description available
+ "Joelib.GraphShapeCoefficient", #no description available
+ "Joelib.count.BasicGroups", #no description available
+ #"Joelib.count.RotatableBonds", #no description available
+ "Joelib.count.HeavyBonds", #no description available
+ "Joelib.PolarSurfaceArea", #no description available
+ #"Joelib.ZagrebIndex1", #no description available
+ "Joelib.GeometricalRadius", #no description available
+ "Joelib.count.SO2Groups", #no description available
+ "Joelib.count.AromaticOHGroups", #no description available
+ "Joelib.GeometricalDiameter", #no description available
+ #"Joelib.MolarRefractivity", #no description available
+ "Joelib.count.NumberOfCl", #no description available
+ "Joelib.count.OSOGroups", #no description available
+ "Joelib.count.NumberOfBr", #no description available
+ "Joelib.count.NO2Groups", #no description available
+ "Joelib.count.HeteroCycles", #no description available
+ #"Joelib.count.HBA2", #no description available
+ #"Joelib.count.HBA1", #no description available
+ #"Joelib.count.NumberOfBonds", #no description available
+ "Joelib.count.SOGroups", #no description available
+ "Joelib.TopologicalDiameter", #no description available
+ "Joelib.count.NumberOfHal", #no description available
+
+].sort