diff options
author | Christoph Helma <helma@in-silico.ch> | 2013-07-02 19:20:23 +0200 |
---|---|---|
committer | Christoph Helma <helma@in-silico.ch> | 2013-07-02 19:20:23 +0200 |
commit | 404c644ee52bac97ff737fcb2057df3f1ec18b76 (patch) | |
tree | dfcb83af10fc48015818a4e0ed3f21232e548f4f | |
parent | c0f75827c36405c18a9108fa98106de2706eae8d (diff) |
algorithm service restructured. descriptor calculation and initial quantitative models working.
-rw-r--r-- | application.rb | 8 | ||||
-rw-r--r-- | descriptor.rb | 147 | ||||
-rw-r--r-- | fminer.rb | 16 | ||||
-rw-r--r-- | java/CdkDescriptors.class | bin | 3576 -> 3401 bytes | |||
-rw-r--r-- | java/JoelibDescriptors.class | bin | 2774 -> 2831 bytes | |||
-rw-r--r-- | java/JoelibDescriptors.java | 3 | ||||
-rw-r--r-- | lazar.rb | 18 | ||||
-rw-r--r-- | lib/descriptor.rb | 289 | ||||
-rw-r--r-- | lib/feature_values.rb | 81 | ||||
-rw-r--r-- | lib/fminer.rb | 9 | ||||
-rw-r--r-- | lib/lazar.rb | 314 | ||||
-rw-r--r-- | lib/neighbors.rb | 90 | ||||
-rw-r--r-- | lib/similarity.rb | 2 | ||||
-rw-r--r-- | lib/transform.rb | 33 |
14 files changed, 557 insertions, 453 deletions
diff --git a/application.rb b/application.rb index 6eb49e4..547da08 100644 --- a/application.rb +++ b/application.rb @@ -11,14 +11,6 @@ require_relative 'last-utils/lu.rb' # Library Code $logger.debug "Algorithm booting: #{$algorithm.collect{ |k,v| "#{k}: '#{v}'"} }" Dir['./lib/*.rb'].each { |f| require f; also_reload f } # Libs -=begin -Dir['./*.rb'].each do |f| - unless f == "unicorn.rb" - require_relative f - also_reload f # Webapps - end -end -=end [ "descriptor.rb", diff --git a/descriptor.rb b/descriptor.rb index 23f4174..13a6fa3 100644 --- a/descriptor.rb +++ b/descriptor.rb @@ -7,12 +7,36 @@ module OpenTox class Application < Service + before '/descriptor/:method' do + if params[:compound_uri] + @compounds = [params[:compound_uri]].flatten.collect{|u| OpenTox::Compound.new u} + elsif params[:dataset_uri] + @compounds = OpenTox::Dataset.new(params[:dataset_uri], @subjectid).compounds + else + bad_request_error "Please provide a dataset_uri or compound_uri parameter", @uri + end +=begin + new_params = {} + delete = [] + params.each do |k,v| + if k.match(/_uri$/) + klass = k.sub(/_uri$/,'') + v = [v] if v.is_a? String + new_params[klass] = v.collect{|u| OpenTox.const_get(klass.capitalize).new(u)} + delete << k + end + end + delete.each{|k| params.delete k} + params.merge! new_params +=end + end +=begin before '/descriptor/:lib/:descriptor/?' do #if request.get? - lib = @uri.split("/")[-2].capitalize - klass = OpenTox::Descriptor.const_get params[:lib].capitalize - @algorithm = klass.new @uri, @subjectid unless params[:lib] == "smarts" -=begin + #lib = @uri.split("/")[-2].capitalize + @klass = OpenTox::Descriptor.const_get params[:lib].capitalize + #@algorithm = klass.new @uri, @subjectid unless params[:lib] == "smarts" + @method = params[:descriptor].to_sym elsif request.post? @feature_dataset = Dataset.new nil, @subjectid @feature_dataset.metadata = { @@ -28,9 +52,88 @@ module OpenTox bad_request_error "Please provide a dataset_uri or compound_uri parameter", @uri end end + end =end + get '/descriptor/?' do + #OpenTox::Algorithm::Descriptor.list.collect{|d| uri d}.join "\n" + OpenTox::Algorithm::Descriptor.list.join "\n" + #OpenTox::Algorithm::Descriptor.list.inspect end + post '/descriptor/:method' do + puts params.inspect + bad_request_error "Please provide 'descriptors' parameters.", @uri unless params["descriptors"] + if params[:compound_uri] + result = OpenTox::Algorithm::Descriptor.send(params[:method].to_sym, @compounds, params["descriptors"]) + Hash[result.map {|compound, v| [compound.uri, v] }].to_json + elsif params[:dataset_uri] + puts "starting task" + task = OpenTox::Task.run("Calculating #{params[:method]} descriptors for dataset #{params[:dataset_uri]}.", @uri, @subjectid) do |task| + puts "start calculation" + result = OpenTox::Algorithm::Descriptor.send(params[:method].to_sym, @compounds, params["descriptors"]) + puts "create dataset" + puts result.inspect + dataset = OpenTox::Dataset.new nil, @subjectid + @compounds.each do |compound| + @features ||= result[compound].keys.collect{|name| + # TODO set other metadata + OpenTox::Feature.find_or_create({RDF::DC.title => name}, @subjectid) + } + @features.each do |feature| + value = result[compound][feature.title] + puts compound, feature, value if value + dataset.add_data_entry compound, feature, value if value + end + end + puts "put dataset" + dataset.put + puts "dataset stored" + dataset.uri + end + puts "Task" + puts task.uri + response['Content-Type'] = 'text/uri-list' + halt 202,task.uri + end + end + +=begin + post '/descriptor/smarts_match/?' do + bad_request_error "Please provide a compound_uri or dataset_uri parameter and a smarts parameter. The count parameter is optional and defaults to false." unless (params[:compound_uri] or params[:dataset_uri]) and params[:smarts] + params[:count] ? params[:count] = params[:count].to_boolean : params[:count] = false + if params[:compound_uri] + params[:compound_uri] = [ params[:compound_uri] ] unless params[:compound_uri].is_a? Array + response['Content-Type'] = "application/json" + OpenTox::Algorithm::Descriptor.smarts_match(params[:compound_uri].collect{|c| OpenTox::Compound.new c}, params[:smarts], params[:count]).to_json + elsif params[:dataset_uri] + task = OpenTox::Task.run("Calculating Smarts #{method} for dataset #{params[:dataset_uri]}.", @uri, @subjectid) do |task| + compounds = OpenTox::Dataset.new params[:dataset_uri] + matches = OpenTox::Descriptor::Smarts.fingerprint(compounds, params[:smarts], params[:count]) + end + response['Content-Type'] = 'text/uri-list' + halt 202,task.uri + end + end + + post '/descriptor/smarts_count/?' do + bad_request_error "Please provide a compound_uri or dataset_uri parameter and a smarts parameter. The count parameter is optional and defaults to false." unless (params[:compound_uri] or params[:dataset_uri]) and params[:smarts] + params[:count] ? params[:count] = params[:count].to_boolean : params[:count] = false + if params[:compound_uri] + params[:compound_uri] = [ params[:compound_uri] ] unless params[:compound_uri].is_a? Array + response['Content-Type'] = "application/json" + OpenTox::Algorithm::Descriptor.smarts_count(params[:compound_uri].collect{|c| OpenTox::Compound.new c}, params[:smarts]).to_json + elsif params[:dataset_uri] + task = OpenTox::Task.run("Calculating Smarts #{method} for dataset #{params[:dataset_uri]}.", @uri, @subjectid) do |task| + compounds = OpenTox::Dataset.new params[:dataset_uri] + matches = OpenTox::Descriptor::Smarts.fingerprint(compounds, params[:smarts], params[:count]) + end + response['Content-Type'] = 'text/uri-list' + halt 202,task.uri + end + end +=end + +=begin # Get a list of descriptor calculation # @return [text/uri-list] URIs get '/descriptor/?' do @@ -43,8 +146,12 @@ module OpenTox end get '/descriptor/:lib/?' do - klass = OpenTox::Descriptor.const_get params[:lib].capitalize - render klass.all + begin + klass = OpenTox::Descriptor.const_get params[:lib].capitalize + render klass.all + rescue + bad_request_error "Descriptor library '#{params[:lib]}' not found.", @uri + end end # Get representation of descriptor calculation @@ -53,30 +160,38 @@ module OpenTox render @algorithm end - post '/descriptor/smarts/:method/?' do - method = params[:method].to_sym - bad_request_error "Please provide a compound_uri or dataset_uri parameter and a smarts parameter. The count parameter is optional and defaults to false." unless (params[:compound_uri] or params[:dataset_uri]) and params[:smarts] - params[:count] ? params[:count] = params[:count].to_boolean : params[:count] = false + post '/descriptor/?' do + descriptors = OpenTox::Descriptor::Set.new params if params[:compound_uri] - compounds = OpenTox::Compound.new params[:compound_uri] - response['Content-Type'] = "application/json" - OpenTox::Descriptor::Smarts.send(method, compounds, params[:smarts], params[:count]).to_json + compound = OpenTox::Compound.new params[:compound_uri] + descriptors.calculate compound elsif params[:dataset_uri] - compounds = OpenTox::Dataset.new params[:dataset_uri] - # TODO: create and return dataset + task = OpenTox::Task.run("Calculating Smarts #{method} for dataset #{params[:dataset_uri]}.", @uri, @subjectid) do |task| + dataset = OpenTox::Dataset.new params[:dataset_uri] + descriptors.calculate dataset + end + response['Content-Type'] = 'text/uri-list' + halt 202,task.uri + else + end end + #post '/descriptor/physchem/?' do + #post '/descriptor/lookup/?' do # use /descriptor with dataset_uri and descriptor_uri parameters for efficient calculation of multiple compounds/descriptors post '/descriptor/:lib/:descriptor/?' do bad_request_error "Please provide a compound_uri parameter", @uri unless params[:compound_uri] params[:descriptor_uris] = [@uri] - @algorithm.calculate params + result = @algorithm.calculate(params) + puts result.inspect + result.to_json #compounds = [ Compound.new(params[:compound_uri], @subjectid) ] #send params[:lib].to_sym, compounds, @descriptors #@feature_dataset.put #@feature_dataset.uri end +=end =begin ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk" JAVA_DIR = File.join(File.dirname(__FILE__),"java") @@ -21,21 +21,7 @@ module OpenTox # Get list of fminer algorithms # @return [text/uri-list] URIs get '/fminer/?' do -=begin - list = [ uri('/fminer/bbrc'), - #uri('/fminer/bbrc/sample'), - uri('/fminer/last'), - #uri('/fminer/bbrc/match'), - #uri('/fminer/last/match') - ].join("\n") + "\n" - render(list) -=end - render [ uri('/fminer/bbrc'), - #uri('/fminer/bbrc/sample'), - uri('/fminer/last'), - #uri('/fminer/bbrc/match'), - #uri('/fminer/last/match') - ] + render [ uri('/fminer/bbrc'), uri('/fminer/last') ] end # Get representation of BBRC algorithm diff --git a/java/CdkDescriptors.class b/java/CdkDescriptors.class Binary files differindex 9373bc7..c2dc069 100644 --- a/java/CdkDescriptors.class +++ b/java/CdkDescriptors.class diff --git a/java/JoelibDescriptors.class b/java/JoelibDescriptors.class Binary files differindex 1426e7d..7f3eced 100644 --- a/java/JoelibDescriptors.class +++ b/java/JoelibDescriptors.class diff --git a/java/JoelibDescriptors.java b/java/JoelibDescriptors.java index ecd1b3f..64c099e 100644 --- a/java/JoelibDescriptors.java +++ b/java/JoelibDescriptors.java @@ -41,7 +41,8 @@ class JoelibDescriptors { Boolean success = loader.read(mol); if (!success) { break; } // last molecule for (int i =0; i < features.length; i++) { - Feature feature = factory.getFeature(features[i]); + String name = "joelib2.feature.types." + features[i]; + Feature feature = factory.getFeature(name); FeatureResult result = feature.calculate(mol); if (i == 0) { yaml.print("- "); } else { yaml.print(" "); } @@ -4,7 +4,7 @@ module OpenTox # Get representation of lazar algorithm # @return [String] Representation get '/lazar/?' do - algorithm = OpenTox::Algorithm.new(to('/lazar',:full)) + algorithm = OpenTox::Algorithm::Generic.new(to('/lazar',:full)) algorithm.metadata = { RDF::DC.title => 'lazar', RDF::DC.creator => 'helma@in-silico.ch, andreas@maunz.de', @@ -17,7 +17,6 @@ module OpenTox { RDF::DC.description => "Feature dataset URI", RDF::OT.paramScope => "optional", RDF::DC.title => "feature_dataset_uri" }, { RDF::DC.description => "Further parameters for the feature generation service", RDF::OT.paramScope => "optional" } ] - #format_output(algorithm) render algorithm end @@ -33,11 +32,7 @@ module OpenTox #resource_not_found_error "Dataset '#{params[:dataset_uri]}' not found." unless URI.accessible? params[:dataset_uri], @subjectid # wrong URI class bad_request_error "Please provide a feature_generation_uri parameter." unless params[:feature_generation_uri] task = OpenTox::Task.run("Create lazar model", uri('/lazar'), @subjectid) do |task| - #lazar = OpenTox::Model::Lazar.new(nil, @subjectid) - lazar = OpenTox::Model::Lazar.new(File.join($model[:uri],SecureRandom.uuid), @subjectid) - lazar.create(params) - #lazar.put - #lazar.uri + OpenTox::Model::Lazar.create(params) end response['Content-Type'] = 'text/uri-list' halt 202,task.uri @@ -60,19 +55,12 @@ module OpenTox post '/lazar/predict/?' do # pass parameters instead of model_uri, because model service is blocked by incoming call - puts "LAZAR" - puts params.inspect task = OpenTox::Task.run("Apply lazar model",uri('/lazar/predict'), @subjectid) do |task| - - lazar = OpenTox::LazarPrediction.new params - puts lazar.inspect - lazar.prediction_dataset.uri - + OpenTox::Model::Lazar.new(params[:model_uri]).predict(params).uri end response['Content-Type'] = 'text/uri-list' halt 202,task.uri end - end end diff --git a/lib/descriptor.rb b/lib/descriptor.rb index b6b7cd4..8c8129c 100644 --- a/lib/descriptor.rb +++ b/lib/descriptor.rb @@ -1,105 +1,56 @@ +require 'digest/md5' +ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk" module OpenTox - module Descriptor - include OpenTox - - def initialize uri, subjectid - super uri, subjectid - @parameters = [ - { RDF::DC.description => "Dataset URI", - RDF::OT.paramScope => "optional", - RDF::DC.title => "dataset_uri" } , - { RDF::DC.description => "Compound URI", - RDF::OT.paramScope => "optional", - RDF::DC.title => "compound_uri" } - ] - tokens = uri.split %r{/} - @metadata = { - RDF::DC.title => "#{tokens[-2].capitalize} #{tokens[-1]}", - RDF.type => [RDF::OT.Algorithm, RDF::OTA.DescriptorCalculation], - } - end + module Algorithm + class Descriptor + include OpenTox - def fix_value val - if val.numeric? - val = Float(val) - val = nil if val.nan? or val.infinite? - else - val = nil if val == "NaN" - end - val - end + JAVA_DIR = File.join(File.dirname(__FILE__),"..","java") + CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last + JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar") + LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar") + JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar") - class Openbabel - include Descriptor - - def initialize uri, subjectid=nil - descriptor = OpenBabel::OBDescriptor.find_type(uri.split("/").last) - bad_request_error "Unknown descriptor #{uri}. See #{File.join $algorithm[:uri], "descriptor"} for a list of supported descriptors.", uri unless descriptor +=begin + def initialize uri, subjectid super uri, subjectid - @metadata[RDF::DC.description] = descriptor.description.split("\n").first - @obmol = OpenBabel::OBMol.new - @obconversion = OpenBabel::OBConversion.new - @obconversion.set_in_format 'inchi' + @parameters = [ + { RDF::DC.description => "Dataset URI", + RDF::OT.paramScope => "optional", + RDF::DC.title => "dataset_uri" } , + { RDF::DC.description => "Compound URI", + RDF::OT.paramScope => "optional", + RDF::DC.title => "compound_uri" } + ] + tokens = uri.split %r{/} + @metadata = { + RDF::DC.title => "#{tokens[-2].capitalize} #{tokens[-1]}", + RDF.type => [RDF::OT.Algorithm, RDF::OTA.DescriptorCalculation], + } end +=end - def self.all - puts OpenBabel::OBDescriptor.list_as_string("descriptors") - OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| - title = d.split(/\s+/).first - puts title - unless title =~ /cansmi|formula|InChI|smarts|title/ or title == "s" - File.join $algorithm[:uri], "descriptor/openbabel" ,title - end - end.compact.sort{|a,b| a.upcase <=> b.upcase} - end - - # TODO: add to feature dataset - # find feature - # generic method for all libs - def calculate params - if params[:compound_uri] - compounds = [ Compound.new(params[:compound_uri], @subjectid) ] - elsif params[:dataset_uri] - compounds = Dataset.new(params[:dataset_uri], @subjectid).compounds - end - compounds.collect do |compound| - @obconversion.read_string @obmol, compound.inchi - params[:descriptor_uris].each do |descriptor_uri| - method = descriptor_uri.split('/').last - calculator = OpenBabel::OBDescriptor.find_type method - value = fix_value calculator.predict(@obmol) - feature = OpenTox::Feature.find_or_create({ - RDF::DC.title => "OpenBabel "+method, - RDF.type => [RDF::OT.Feature, RDF::OT.NumericFeature], - RDF::DC.description => calculator.description, - }, @subjectid) - [compound, feature, value] - end - end + def self.list + list = OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect{|line| "/openbabel/#{line.split(/\s+/).first}" } + list += YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`).collect{|d| "cdk/#{d[:java_class].split('.').last.sub(/Descriptor/,'')}" } + joelib = YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`) # strip Joelib messages at stdout + # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug) + list += joelib.collect{|d| "joelib/#{d[:java_class].split('.').last}" unless d[:java_class] == "joelib2.feature.types.MoleculeHashcode" or d[:java_class] == "joelib2.feature.types.GlobalTopologicalChargeIndex"}.compact + list.collect{|item| File.join "descriptor",item} end - end - - class Smarts - def self.fingerprint compounds, smarts, count=false - if compounds.is_a? OpenTox::Compound - compounds = [compounds] - elsif compounds.is_a? OpenTox::Dataset - # TODO: create and return dataset - compounds = compounds.compounds - else - bad_request_error "Cannot match smarts on #{compounds.class} objects." - end - smarts = [smarts] unless smarts.is_a? Array + def self.smarts_match compounds, smarts, count=false obconversion = OpenBabel::OBConversion.new obmol = OpenBabel::OBMol.new obconversion.set_in_format('inchi') smarts_pattern = OpenBabel::OBSmartsPattern.new - matches = [] + fingerprint = {} + compounds = [compounds] unless compounds.is_a? Array + smarts = [smarts] unless smarts.is_a? Array compounds.each do |compound| obconversion.read_string(obmol,compound.inchi) - matches << [] + fingerprint[compound] = {} smarts.each do |smart| smarts_pattern.init(smart) if smarts_pattern.match(obmol) @@ -107,16 +58,172 @@ module OpenTox else value = 0 end - matches.last << value + fingerprint[compound][smart] = value end end - matches + fingerprint end def self.smarts_count compounds, smarts - smarts_fingerprint compounds,smarts,true + smarts_match compounds,smarts,true + end + + def self.physchem compounds, descriptors + des = {} + descriptors.each do |d| + lib, descriptor = d.split(".") + des[lib.to_sym] ||= [] + des[lib.to_sym] << descriptor + end + result = {} + des.each do |lib,d| + send(lib, compounds, d).each do |compound,values| + result[compound] ||= {} + result[compound].merge! values + end + end + result + end + + def self.openbabel compounds, descriptors + obdescriptors = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d} + obmol = OpenBabel::OBMol.new + obconversion = OpenBabel::OBConversion.new + obconversion.set_in_format 'inchi' + fingerprint = {} + compounds.each do |compound| + obconversion.read_string obmol, compound.inchi + fingerprint[compound] = {} + obdescriptors.each_with_index do |descriptor,i| + fingerprint[compound][descriptors[i]] = fix_value(descriptor.predict(obmol)) + end + end + fingerprint + end + + def self.cdk compounds, descriptors + sdf = sdf_3d compounds + # use java system call (rjb blocks within tasks) + # use Tempfiles to avoid "Argument list too long" error + `java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}` + fingerprint = {} + YAML.load_file(sdf+"cdk.yaml").each_with_index do |calculation,i| + $logger.error "Descriptor calculation failed for compound #{compounds[i].uri}." if calculation.empty? + descriptors.each_with_index do |descriptor,j| + fingerprint[compounds[i]] = calculation + end + end + FileUtils.rm sdf+"cdk.yaml" + fingerprint + end + + def self.joelib compounds, descriptors + # use java system call (rjb blocks within tasks) + # use Tempfiles to avoid "Argument list too long" error + sdf = sdf_3d compounds + `java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}` + fingerprint = {} + YAML.load_file(sdf+"joelib.yaml").each_with_index do |calculation,i| + $logger.error "Descriptor calculation failed for compound #{compounds[i].uri}." if calculation.empty? + descriptors.each_with_index do |descriptor,j| + fingerprint[compounds[i]] = calculation + end + end + FileUtils.rm sdf+"joelib.yaml" + fingerprint end + + def self.lookup compounds, features, dataset + fingerprint = [] + compounds.each do |compound| + fingerprint << [] + features.each do |feature| + end + end + end + + def self.sdf_3d compounds + obconversion = OpenBabel::OBConversion.new + obmol = OpenBabel::OBMol.new + obconversion.set_in_format 'inchi' + obconversion.set_out_format 'sdf' + digest = Digest::MD5.hexdigest compounds.inspect + sdf_file = "/tmp/#{digest}.sdf" + unless File.exists? sdf_file # do not recreate existing 3d sdfs + sdf = File.open sdf_file,"w+" + # create 3d sdf file (faster in Openbabel than in CDK) + compounds.each do |compound| + obconversion.read_string obmol, compound.inchi + sdf_2d = obconversion.write_string(obmol) + OpenBabel::OBOp.find_type("Gen3D").do(obmol) + sdf_3d = obconversion.write_string(obmol) + if sdf_3d.match(/.nan/) + warning = "3D generation failed for compound #{compound.uri}, trying to calculate descriptors from 2D structure." + $logger.warn warning + # TODO + #@feature_dataset[RDF::OT.Warnings] ? @feature_dataset[RDF::OT.Warnings] << warning : @feature_dataset[RDF::OT.Warnings] = warning + sdf.puts sdf_2d + else + sdf.puts sdf_3d + end + end + sdf.close + end + sdf_file + end + + def self.fix_value val + val = val.first if val.is_a? Array and val.size == 1 + if val.numeric? + val = Float(val) + val = nil if val.nan? or val.infinite? + else + val = nil if val == "NaN" + end + val + end + private_class_method :sdf_3d, :fix_value end end - end +=begin + class Set + + def initialize params + bad_request_error "Please provide a compound_uri or dataset_uri parameter." unless params[:compound_uri] or params[:dataset_uri] + @dataset = OpenTox::Dataset.new params[:dataset_uri] + @compound = OpenTox::Compound.new params[:compound_uri] + @descriptors = [] + + end + + def calculate + end + + end + + class Openbabel + include Descriptor + + def initialize uri, subjectid=nil + descriptor = OpenBabel::OBDescriptor.find_type(uri.split("/").last) + bad_request_error "Unknown descriptor #{uri}. See #{File.join $algorithm[:uri], "descriptor"} for a list of supported descriptors.", uri unless descriptor + super uri, subjectid + @metadata[RDF::DC.description] = descriptor.description.split("\n").first + @obmol = OpenBabel::OBMol.new + @obconversion = OpenBabel::OBConversion.new + @obconversion.set_in_format 'inchi' + end + + def self.all + OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| + title = d.split(/\s+/).first + unless title =~ /cansmi|formula|InChI|smarts|title/ or title == "s" + File.join $algorithm[:uri], "descriptor/openbabel" ,title + end + end.compact.sort{|a,b| a.upcase <=> b.upcase} + end + + + end +=end diff --git a/lib/feature_values.rb b/lib/feature_values.rb deleted file mode 100644 index b441c23..0000000 --- a/lib/feature_values.rb +++ /dev/null @@ -1,81 +0,0 @@ -=begin -* Name: feature_values.rb -* Description: Feature value calculation -* Author: Andreas Maunz <andreas@maunz.de> -* Date: 10/2012 -=end - -module OpenTox - class Algorithm - - class FeatureValues - # Substructure matching - # @param [Hash] keys: compound, feature_dataset, values: OpenTox::Compound, Array of SMARTS strings - # @return [Array] Array with matching Smarts - def self.match(params, subjectid) - features = params[:feature_dataset].features.collect{ |f| f[RDF::DC.title] } - params[:compound].match(features) - end - - # Substructure matching with number of non-unique hits - # @param [Hash] keys: compound, feature_dataset, values: OpenTox::Compound, Array of SMARTS strings - # @return [Hash] Hash with matching Smarts and number of hits - def self.match_hits(params, subjectid) - features = params[:feature_dataset].features.collect{ |f| f[RDF::DC.title] }, - params[:compound].match_hits(features) - end - - # PC descriptor calculation - # @param [Hash] keys: compound, feature_dataset, pc_type, lib, values: OpenTox::Compound, String, String - # @return [Hash] Hash with feature name as key and value as value - def self.lookup(params, subjectid) - puts "lookup started" - ds = params[:feature_dataset] - #ds.build_feature_positions - cmpd_inchi = params[:compound].inchi - cmpd_idxs = ds.compounds.each_with_index.collect{ |cmpd,idx| - idx if cmpd.inchi == cmpd_inchi - }.compact - if cmpd_idxs.size > 0 # We have entries - puts "entries" - cmpd_numeric_f = ds.features.collect { |f| - f if f[RDF.type].include? RDF::OT.NumericFeature - }.compact - cmpd_data_entries = cmpd_idxs.collect { |idx| - ds.data_entries[idx] - } - cmpd_fingerprints = cmpd_numeric_f.inject({}) { |h,f| - values = cmpd_data_entries.collect { |entry| - val = entry[ds.feature_positions[f.uri]] - val.nil? ? nil : val.to_f - }.compact - h[f.title] = (values.size > 0) ? values.to_scale.median : nil # AM: median for numeric features - h - } - (ds.features - cmpd_numeric_f).each { |f| - values = cmpd_data_entries.collect { |entry| - val = entry[ds.feature_positions[f.uri]] - val.nil? ? nil : val - }.compact - cmpd_fingerprints[f.title] = values.to_scale.mode # AM: mode for the others - } - else # We need lookup - puts "no entries" - params[:subjectid] = subjectid - [:compound, :feature_dataset].each { |p| params.delete(p) }; [:pc_type, :lib].each { |p| params.delete(p) if params[p] == "" } - single_cmpd_ds = OpenTox::Dataset.new(nil,subjectid) - # TODO: ntriples !!! - single_cmpd_ds.parse_rdfxml(RestClientWrapper.post(File.join($compound[:uri],cmpd_inchi,"pc"), params, {:accept => "application/rdf+xml"})) - single_cmpd_ds.get(true) - #single_cmpd_ds.build_feature_positions - cmpd_fingerprints = single_cmpd_ds.features.inject({}) { |h,f| - h[f.title] = single_cmpd_ds.data_entries[0][single_cmpd_ds.feature_positions[f.uri]] - h - } - end - cmpd_fingerprints - end - end - - end -end diff --git a/lib/fminer.rb b/lib/fminer.rb index 7f88c8b..6b21ce8 100644 --- a/lib/fminer.rb +++ b/lib/fminer.rb @@ -6,14 +6,17 @@ =end module OpenTox - class Algorithm + module Algorithm # Fminer algorithms (https://github.com/amaunz/fminer2) - class Fminer < Algorithm + class Fminer #< Algorithm + attr_accessor :prediction_feature, :training_dataset, :minfreq, :compounds, :db_class_sizes, :all_activities, :smi def initialize(uri, subjectid=nil) - super(uri, subjectid) + @uri = uri + @subjectid = subjectid + #super(uri, subjectid) end diff --git a/lib/lazar.rb b/lib/lazar.rb index 98293d5..d2eba5c 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -7,214 +7,198 @@ module OpenTox - class LazarPrediction < Model - - attr_accessor :prediction_dataset - - def initialize(params) - @prediction_dataset = OpenTox::Dataset.new(nil, @subjectid) - # set instance variables and prediction dataset parameters from parameters - params.each {|k,v| - self.class.class_eval { attr_accessor k.to_sym } - instance_variable_set "@#{k}", v - @prediction_dataset.parameters << {RDF::DC.title => k, RDF::OT.paramValue => v} - } - ["cmpds", "fps", "acts", "n_prop", "q_prop", "neighbors"].each {|k| - self.class.class_eval { attr_accessor k.to_sym } - instance_variable_set("@#{k}", []) - } - - @prediction_feature = OpenTox::Feature.new @prediction_feature_uri, @subjectid - @predicted_variable = OpenTox::Feature.new @predicted_variable_uri, @subjectid - @predicted_confidence = OpenTox::Feature.new @predicted_confidence_uri, @subjectid - @prediction_dataset.metadata = { - RDF::DC.title => "Lazar prediction for #{@prediction_feature.title}", - RDF::DC.creator => @model_uri, - RDF::OT.hasSource => @model_uri, - RDF::OT.dependentVariables => @prediction_feature_uri, - RDF::OT.predictedVariables => [@predicted_variable_uri,@predicted_confidence_uri] - } - - @training_dataset = OpenTox::Dataset.new(@training_dataset_uri,@subjectid) - - @feature_dataset = OpenTox::Dataset.new(@feature_dataset_uri, @subjectid) - bad_request_error "No features found in feature dataset #{@feature_dataset.uri}." if @feature_dataset.features.empty? - - @similarity_feature = OpenTox::Feature.find_or_create({RDF::DC.title => "#{@similarity_algorithm.capitalize} similarity", RDF.type => [RDF::OT.Feature, RDF::OT.NumericFeature]}, @subjectid) - - @prediction_dataset.features = [ @predicted_variable, @predicted_confidence, @prediction_feature, @similarity_feature ] - - prediction_feature_pos = @training_dataset.features.collect{|f| f.uri}.index @prediction_feature.uri - - if @dataset_uri - compounds = OpenTox::Dataset.new(@dataset_uri, @subjectid).compounds - else - compounds = [ OpenTox::Compound.new(@compound_uri, @subjectid) ] - end - - compounds.each do |compound| - - - #database_activity = @training_dataset.database_activity(params) - database_activities = @training_dataset.values(compound,@prediction_feature) - if database_activities and !database_activities.empty? - database_activities.each do |database_activity| - @prediction_dataset.add_data_entry compound, @prediction_feature, database_activity - end - next - else - # AM: transform to cosine space - @min_sim = (@min_sim.to_f*2.0-1.0).to_s if @similarity_algorithm =~ /cosine/ - - compound_params = { - :compound => compound, - :feature_dataset => @feature_dataset, - } - #compound_fingerprints = OpenTox::Algorithm::FeatureValues.send( @feature_calculation_algorithm, compound_params, @subjectid ) - # TODO: fix for pc descriptors - #compound_fingerprints = OpenTox::Algorithm::Descriptor.send( @feature_calculation_algorithm, compound, @feature_dataset.features.collect{ |f| f[RDF::DC.title] } ) - compound_fingerprints = eval("#{@feature_calculation_algorithm}(compound, @feature_dataset.features.collect{ |f| f[RDF::DC.title] } )") - @training_dataset.compounds.each_with_index { |cmpd, idx| - act = @training_dataset.data_entries[idx][prediction_feature_pos] - @acts << (@prediction_feature.feature_type=="classification" ? @prediction_feature.value_map.invert[act] : nil) - @n_prop << @feature_dataset.data_entries[idx]#.collect.to_a - @cmpds << cmpd.uri - } - -=begin - @q_prop = @feature_dataset.features.collect { |f| - val = compound_fingerprints[f.title] - bad_request_error "Can not parse value '#{val}' to numeric" if val and !val.numeric? - val ? val.to_f : 0.0 - } # query structure -=end - @q_prop = compound_fingerprints.first.collect{|v| v.to_f} - - mtf = OpenTox::Algorithm::Transform::ModelTransformer.new(self) - mtf.transform - - prediction = OpenTox::Algorithm::Neighbors.send(@prediction_algorithm, - { :props => mtf.props, - :acts => mtf.acts, - :sims => mtf.sims, - :value_map => @prediction_feature.feature_type=="classification" ? @prediction_feature.value_map : nil, - :min_train_performance => @min_train_performance - } ) - - predicted_value = prediction[:prediction].to_f - confidence_value = prediction[:confidence].to_f - - # AM: transform to original space - confidence_value = ((confidence_value+1.0)/2.0).abs if @similarity_algorithm =~ /cosine/ - predicted_value = @prediction_feature.value_map[prediction[:prediction].to_i] if @prediction_feature.feature_type == "classification" - - end - - @prediction_dataset.add_data_entry compound, predicted_variable, predicted_value - @prediction_dataset.add_data_entry compound, predicted_confidence, confidence_value - - if @compound_uri # add neighbors only for compound predictions - @neighbors.each do |neighbor| - n = OpenTox::Compound.new(neighbor[:compound], @subjectid) - @prediction_dataset.add_data_entry n, @prediction_feature, @prediction_feature.value_map[neighbor[:activity]] - @prediction_dataset.add_data_entry n, @similarity_feature, neighbor[:similarity] - #@prediction_dataset << [ n, @prediction_feature.value_map[neighbor[:activity]], nil, nil, neighbor[:similarity] ] - end - end - - end # iteration over compounds - @prediction_dataset.put - - end + module Model - end - - class Model + class Lazar + include OpenTox - class Lazar < Model + attr_accessor :prediction_dataset # Check parameters for plausibility # Prepare lazar object (includes graph mining) # @param[Array] lazar parameters as strings # @param[Hash] REST parameters, as input by user - def create(params) + def self.create params + + lazar = OpenTox::Model::Lazar.new(File.join($model[:uri],SecureRandom.uuid), @subjectid) training_dataset = OpenTox::Dataset.new(params[:dataset_uri], @subjectid) - @parameters << {RDF::DC.title => "training_dataset_uri", RDF::OT.paramValue => training_dataset.uri} + lazar.parameters << {RDF::DC.title => "training_dataset_uri", RDF::OT.paramValue => training_dataset.uri} - # TODO: This is inconsistent, it would be better to have prediction_feature_uri in the API if params[:prediction_feature] resource_not_found_error "No feature '#{params[:prediction_feature]}' in dataset '#{params[:dataset_uri]}'" unless training_dataset.find_feature_uri( params[:prediction_feature] ) else # try to read prediction_feature from dataset resource_not_found_error "Please provide a prediction_feature parameter" unless training_dataset.features.size == 1 params[:prediction_feature] = training_dataset.features.first.uri end - self[RDF::OT.trainingDataset] = training_dataset.uri + lazar[RDF::OT.trainingDataset] = training_dataset.uri prediction_feature = OpenTox::Feature.new(params[:prediction_feature], @subjectid) predicted_variable = OpenTox::Feature.find_or_create({RDF::DC.title => "#{prediction_feature.title} prediction", RDF.type => [RDF::OT.Feature, prediction_feature[RDF.type]]}, @subjectid) - self[RDF::DC.title] = prediction_feature.title - @parameters << {RDF::DC.title => "prediction_feature_uri", RDF::OT.paramValue => prediction_feature.uri} - self[RDF::OT.dependentVariables] = prediction_feature.uri + lazar[RDF::DC.title] = prediction_feature.title + lazar.parameters << {RDF::DC.title => "prediction_feature_uri", RDF::OT.paramValue => prediction_feature.uri} + lazar[RDF::OT.dependentVariables] = prediction_feature.uri bad_request_error "Unknown prediction_algorithm #{params[:prediction_algorithm]}" if params[:prediction_algorithm] and !OpenTox::Algorithm::Neighbors.respond_to?(params[:prediction_algorithm]) - @parameters << {RDF::DC.title => "prediction_algorithm", RDF::OT.paramValue => params[:prediction_algorithm]} if params[:prediction_algorithm] + lazar.parameters << {RDF::DC.title => "prediction_algorithm", RDF::OT.paramValue => params[:prediction_algorithm]} if params[:prediction_algorithm] confidence_feature = OpenTox::Feature.find_or_create({RDF::DC.title => "predicted_confidence", RDF.type => [RDF::OT.Feature, RDF::OT.NumericFeature]}, @subjectid) - self[RDF::OT.predictedVariables] = [ predicted_variable.uri, confidence_feature.uri ] + lazar[RDF::OT.predictedVariables] = [ predicted_variable.uri, confidence_feature.uri ] case prediction_feature.feature_type when "classification" - @parameters << {RDF::DC.title => "prediction_algorithm", RDF::OT.paramValue => "weighted_majority_vote"} unless parameter_value "prediction_algorithm" - self[RDF.type] = [RDF::OT.Model, RDF::OTA.ClassificationLazySingleTarget] + lazar.parameters << {RDF::DC.title => "prediction_algorithm", RDF::OT.paramValue => "weighted_majority_vote"} unless lazar.parameter_value "prediction_algorithm" + lazar[RDF.type] = [RDF::OT.Model, RDF::OTA.ClassificationLazySingleTarget] when "regression" - @parameters << {RDF::DC.title => "prediction_algorithm", RDF::OT.paramValue => "local_svm_regression"} unless parameter_value "prediction_algorithm" - self[RDF.type] = [RDF::OT.Model, RDF::OTA.RegressionLazySingleTarget] + lazar.parameters << {RDF::DC.title => "prediction_algorithm", RDF::OT.paramValue => "local_svm_regression"} unless lazar.parameter_value "prediction_algorithm" + lazar[RDF.type] = [RDF::OT.Model, RDF::OTA.RegressionLazySingleTarget] end - parameter_value("prediction_algorithm") =~ /majority_vote/ ? @parameters << {RDF::DC.title => "propositionalized", RDF::OT.paramValue => false} : @parameters << {RDF::DC.title => "propositionalized", RDF::OT.paramValue => true} + lazar.parameter_value("prediction_algorithm") =~ /majority_vote/ ? lazar.parameters << {RDF::DC.title => "propositionalized", RDF::OT.paramValue => false} : lazar.parameters << {RDF::DC.title => "propositionalized", RDF::OT.paramValue => true} - @parameters << {RDF::DC.title => "min_sim", RDF::OT.paramValue => params[:min_sim].to_f} if params[:min_sim] and params[:min_sim].numeric? - @parameters << {RDF::DC.title => "feature_generation_uri", RDF::OT.paramValue => params[:feature_generation_uri]} - #@parameters["nr_hits"] = params[:nr_hits] + lazar.parameters << {RDF::DC.title => "min_sim", RDF::OT.paramValue => params[:min_sim].to_f} if params[:min_sim] and params[:min_sim].numeric? + lazar.parameters << {RDF::DC.title => "feature_generation_uri", RDF::OT.paramValue => params[:feature_generation_uri]} + #lazar.parameters["nr_hits"] = params[:nr_hits] case params["feature_generation_uri"] when /fminer/ if (params[:nr_hits] == "true") - @parameters << {RDF::DC.title => "feature_calculation_algorithm", RDF::OT.paramValue => "OpenTox::Descriptor::Smarts.count"} + lazar.parameters << {RDF::DC.title => "feature_calculation_algorithm", RDF::OT.paramValue => "smarts_count"} else - @parameters << {RDF::DC.title => "feature_calculation_algorithm", RDF::OT.paramValue => "OpenTox::Descriptor::Smarts.fingerprint"} + lazar.parameters << {RDF::DC.title => "feature_calculation_algorithm", RDF::OT.paramValue => "smarts_match"} end - @parameters << {RDF::DC.title => "similarity_algorithm", RDF::OT.paramValue => "tanimoto"} - @parameters << {RDF::DC.title => "min_sim", RDF::OT.paramValue => 0.3} unless parameter_value("min_sim") + lazar.parameters << {RDF::DC.title => "similarity_algorithm", RDF::OT.paramValue => "tanimoto"} + lazar.parameters << {RDF::DC.title => "min_sim", RDF::OT.paramValue => 0.3} unless lazar.parameter_value("min_sim") when /descriptor/ - @parameters << {RDF::DC.title => "feature_calculation_algorithm", RDF::OT.paramValue => "lookup"} - @parameters << {RDF::DC.title => "similarity_algorithm", RDF::OT.paramValue => "cosine"} - @parameters << {RDF::DC.title => "min_sim", RDF::OT.paramValue => 0.7} unless parameter_value("min_sim") + method = params["feature_generation_uri"].split(%r{/}).last.chomp + lazar.parameters << {RDF::DC.title => "feature_calculation_algorithm", RDF::OT.paramValue => method} + lazar.parameters << {RDF::DC.title => "similarity_algorithm", RDF::OT.paramValue => "cosine"} + lazar.parameters << {RDF::DC.title => "min_sim", RDF::OT.paramValue => 0.7} unless lazar.parameter_value("min_sim") end bad_request_error "Parameter min_train_performance is not numeric." if params[:min_train_performance] and !params[:min_train_performance].numeric? - @parameters << {RDF::DC.title => "min_train_performance", RDF::OT.paramValue => params[:min_train_performance].to_f} if params[:min_train_performance] and params[:min_train_performance].numeric? - @parameters << {RDF::DC.title => "min_train_performance", RDF::OT.paramValue => 0.1} unless parameter_value("min_train_performance") + lazar.parameters << {RDF::DC.title => "min_train_performance", RDF::OT.paramValue => params[:min_train_performance].to_f} if params[:min_train_performance] and params[:min_train_performance].numeric? + lazar.parameters << {RDF::DC.title => "min_train_performance", RDF::OT.paramValue => 0.1} unless lazar.parameter_value("min_train_performance") if params[:feature_dataset_uri] - bad_request_error "Feature dataset #{params[:feature_dataset_uri]} does not exist." unless URI.accessible? params[:feature_dataset_uri] - @parameters << {RDF::DC.title => "feature_dataset_uri", RDF::OT.paramValue => params[:feature_dataset_uri]} - self[RDF::OT.featureDataset] = params["feature_dataset_uri"] + bad_request_error "Feature dataset #{params[:feature_dataset_uri]} does not exist." unless URI.accessible? params[:feature_dataset_uri], @subjectid + lazar.parameters << {RDF::DC.title => "feature_dataset_uri", RDF::OT.paramValue => params[:feature_dataset_uri]} + lazar[RDF::OT.featureDataset] = params["feature_dataset_uri"] else # run feature generation algorithm - feature_dataset_uri = OpenTox::Algorithm.new(params[:feature_generation_uri]).run(params) - @parameters << {RDF::DC.title => "feature_dataset_uri", RDF::OT.paramValue => feature_dataset_uri} - self[RDF::OT.featureDataset] = feature_dataset_uri + feature_dataset_uri = OpenTox::Algorithm::Generic.new(params[:feature_generation_uri], @subjectid).run(params) + lazar.parameters << {RDF::DC.title => "feature_dataset_uri", RDF::OT.paramValue => feature_dataset_uri} + lazar[RDF::OT.featureDataset] = feature_dataset_uri end - if params[:feature_dataset_uri] - bad_request_error "Feature dataset #{params[:feature_dataset_uri]} does not exist." unless URI.accessible? params[:feature_dataset_uri], @subjectid - @parameters << {RDF::DC.title => "feature_dataset_uri", RDF::OT.paramValue => params[:feature_dataset_uri]} - self[RDF::OT.featureDataset] = params["feature_dataset_uri"] + lazar.put + lazar.uri + end + + def predict(params) + @prediction_dataset = OpenTox::Dataset.new(nil, @subjectid) + # set instance variables and prediction dataset parameters from parameters + params.each {|k,v| + self.class.class_eval { attr_accessor k.to_sym } + instance_variable_set "@#{k}", v + @prediction_dataset.parameters << {RDF::DC.title => k, RDF::OT.paramValue => v} + } + #["training_compounds", "fingerprints", "training_activities", "training_fingerprints", "query_fingerprint", "neighbors"].each {|k| + ["training_compounds", "training_activities", "training_fingerprints", "query_fingerprint", "neighbors"].each {|k| + self.class.class_eval { attr_accessor k.to_sym } + instance_variable_set("@#{k}", []) + } + + @prediction_feature = OpenTox::Feature.new @prediction_feature_uri, @subjectid + @predicted_variable = OpenTox::Feature.new @predicted_variable_uri, @subjectid + @predicted_confidence = OpenTox::Feature.new @predicted_confidence_uri, @subjectid + @prediction_dataset.metadata = { + RDF::DC.title => "Lazar prediction for #{@prediction_feature.title}", + RDF::DC.creator => @model_uri, + RDF::OT.hasSource => @model_uri, + RDF::OT.dependentVariables => @prediction_feature_uri, + RDF::OT.predictedVariables => [@predicted_variable_uri,@predicted_confidence_uri] + } + + @training_dataset = OpenTox::Dataset.new(@training_dataset_uri,@subjectid) + + @feature_dataset = OpenTox::Dataset.new(@feature_dataset_uri, @subjectid) + bad_request_error "No features found in feature dataset #{@feature_dataset.uri}." if @feature_dataset.features.empty? + + @similarity_feature = OpenTox::Feature.find_or_create({RDF::DC.title => "#{@similarity_algorithm.capitalize} similarity", RDF.type => [RDF::OT.Feature, RDF::OT.NumericFeature]}, @subjectid) + + @prediction_dataset.features = [ @predicted_variable, @predicted_confidence, @prediction_feature, @similarity_feature ] + + prediction_feature_pos = @training_dataset.features.collect{|f| f.uri}.index @prediction_feature.uri + + if @dataset_uri + compounds = OpenTox::Dataset.new(@dataset_uri, @subjectid).compounds else - # run feature generation algorithm - feature_dataset_uri = OpenTox::Algorithm.new(params[:feature_generation_uri], @subjectid).run(params) - @parameters << {RDF::DC.title => "feature_dataset_uri", RDF::OT.paramValue => feature_dataset_uri} - self[RDF::OT.featureDataset] = feature_dataset_uri + compounds = [ OpenTox::Compound.new(@compound_uri, @subjectid) ] end - put - @uri + + @training_fingerprints = @feature_dataset.data_entries + @training_compounds = @training_dataset.compounds + + query_fingerprints = OpenTox::Algorithm::Descriptor.send( @feature_calculation_algorithm, compounds, @feature_dataset.features.collect{ |f| f[RDF::DC.title] } )#.collect{|row| row.collect{|val| val ? val.to_f : 0.0 } } + + compounds.each do |compound| + + database_activities = @training_dataset.values(compound,@prediction_feature) + if database_activities and !database_activities.empty? + database_activities.each do |database_activity| + @prediction_dataset.add_data_entry compound, @prediction_feature, database_activity + end + next + else + # AM: transform to cosine space + @min_sim = (@min_sim.to_f*2.0-1.0).to_s if @similarity_algorithm =~ /cosine/ + @training_activities = @training_dataset.data_entries.collect{|entry| + act = entry[prediction_feature_pos] + @prediction_feature.feature_type=="classification" ? @prediction_feature.value_map.invert[act] : act + } + + @query_fingerprint = @feature_dataset.features.collect { |f| + val = query_fingerprints[compound][f.title] + bad_request_error "Can not parse value '#{val}' to numeric" if val and !val.numeric? + val ? val.to_f : 0.0 + } # query structure + + mtf = OpenTox::Algorithm::Transform::ModelTransformer.new(self) + mtf.transform + + prediction = OpenTox::Algorithm::Neighbors.send(@prediction_algorithm, + { :props => mtf.props, + :activities => mtf.activities, + :sims => mtf.sims, + :value_map => @prediction_feature.feature_type=="classification" ? @prediction_feature.value_map : nil, + :min_train_performance => @min_train_performance + } ) + + predicted_value = prediction[:prediction]#.to_f + confidence_value = prediction[:confidence]#.to_f + + # AM: transform to original space + confidence_value = ((confidence_value+1.0)/2.0).abs if @similarity_algorithm =~ /cosine/ + predicted_value = @prediction_feature.value_map[prediction[:prediction].to_i] if @prediction_feature.feature_type == "classification" + + end + + @prediction_dataset.add_data_entry compound, @predicted_variable, predicted_value + @prediction_dataset.add_data_entry compound, @predicted_confidence, confidence_value + + if @compound_uri # add neighbors only for compound predictions + @neighbors.each do |neighbor| + puts "Neighbor" + puts neighbor.inspect + n = neighbor[:compound] + @prediction_feature.feature_type == "classification" ? a = @prediction_feature.value_map[neighbor[:activity]] : a = neighbor[:activity] + @prediction_dataset.add_data_entry n, @prediction_feature, a + @prediction_dataset.add_data_entry n, @similarity_feature, neighbor[:similarity] + #@prediction_dataset << [ n, @prediction_feature.value_map[neighbor[:activity]], nil, nil, neighbor[:similarity] ] + end + end + + end # iteration over compounds + puts prediction_dataset.to_turtle + @prediction_dataset.put + @prediction_dataset + end end diff --git a/lib/neighbors.rb b/lib/neighbors.rb index b255f18..285afa8 100644 --- a/lib/neighbors.rb +++ b/lib/neighbors.rb @@ -4,14 +4,15 @@ * Author: Andreas Maunz <andreas@maunz.de> * Date: 10/2012 =end +require 'rinruby' module OpenTox - class Algorithm + module Algorithm class Neighbors # Get confidence. - # @param[Hash] Required keys: :sims, :acts + # @param[Hash] Required keys: :sims, :activities # @return[Float] Confidence def self.get_confidence(params) conf = params[:sims].inject{|sum,x| sum + x } @@ -21,7 +22,7 @@ module OpenTox end # Classification with majority vote from neighbors weighted by similarity - # @param [Hash] params Keys `:acts, :sims, :value_map` are required + # @param [Hash] params Keys `:activities, :sims, :value_map` are required # @return [Numeric] A prediction value. def self.weighted_majority_vote(params) @@ -32,11 +33,11 @@ module OpenTox $logger.debug "Weighted Majority Vote Classification." - params[:acts].each_index do |idx| + params[:activities].each_index do |idx| neighbor_weight = params[:sims][1][idx] - neighbor_contribution += params[:acts][idx] * neighbor_weight + neighbor_contribution += params[:activities][idx] * neighbor_weight if params[:value_map].size == 2 # AM: provide compat to binary classification: 1=>false 2=>true - case params[:acts][idx] + case params[:activities][idx] when 1 confidence_sum -= neighbor_weight when 2 @@ -48,16 +49,16 @@ module OpenTox end if params[:value_map].size == 2 if confidence_sum >= 0.0 - prediction = 2 unless params[:acts].size==0 + prediction = 2 unless params[:activities].size==0 elsif confidence_sum < 0.0 - prediction = 1 unless params[:acts].size==0 + prediction = 1 unless params[:activities].size==0 end else - prediction = (neighbor_contribution/confidence_sum).round unless params[:acts].size==0 # AM: new multinomial prediction + prediction = (neighbor_contribution/confidence_sum).round unless params[:activities].size==0 # AM: new multinomial prediction end #$logger.debug "Prediction: '" + prediction.to_s + "'." unless prediction.nil? - confidence = (confidence_sum/params[:acts].size).abs if params[:acts].size > 0 + confidence = (confidence_sum/params[:activities].size).abs if params[:activities].size > 0 #$logger.debug "Confidence: '" + confidence.to_s + "'." unless prediction.nil? return {:prediction => prediction, :confidence => confidence.abs} end @@ -65,7 +66,7 @@ module OpenTox # Local support vector regression from neighbors - # @param [Hash] params Keys `:props, :acts, :sims, :min_train_performance` are required + # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required # @return [Numeric] A prediction value. def self.local_svm_regression(params) puts "SVM" @@ -74,17 +75,17 @@ module OpenTox prediction = nil $logger.debug "Local SVM." - if params[:acts].size>0 + if params[:activities].size>0 if params[:props] n_prop = params[:props][0].collect.to_a q_prop = params[:props][1].collect.to_a props = [ n_prop, q_prop ] end - acts = params[:acts].collect.to_a - prediction = local_svm_prop( props, acts, params[:min_train_performance]) # params[:props].nil? signals non-prop setting + activities = params[:activities].collect.to_a + prediction = local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting prediction = nil if (!prediction.nil? && prediction.infinite?) #$logger.debug "Prediction: '" + prediction.to_s + "' ('#{prediction.class}')." - confidence = get_confidence({:sims => params[:sims][1], :acts => params[:acts]}) + confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]}) confidence = 0.0 if prediction.nil? end {:prediction => prediction, :confidence => confidence} @@ -93,7 +94,7 @@ module OpenTox # Local support vector regression from neighbors - # @param [Hash] params Keys `:props, :acts, :sims, :min_train_performance` are required + # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required # @return [Numeric] A prediction value. def self.local_svm_classification(params) @@ -101,19 +102,19 @@ module OpenTox prediction = nil $logger.debug "Local SVM." - if params[:acts].size>0 + if params[:activities].size>0 if params[:props] n_prop = params[:props][0].collect.to_a q_prop = params[:props][1].collect.to_a props = [ n_prop, q_prop ] end - acts = params[:acts].collect.to_a - acts = acts.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification - prediction = local_svm_prop( props, acts, params[:min_train_performance]) # params[:props].nil? signals non-prop setting + activities = params[:activities].collect.to_a + activities = activities.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification + prediction = local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting prediction = prediction.sub(/Val/,"") if prediction # Convert back confidence = 0.0 if prediction.nil? #$logger.debug "Prediction: '" + prediction.to_s + "' ('#{prediction.class}')." - confidence = get_confidence({:sims => params[:sims][1], :acts => params[:acts]}) + confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]}) end {:prediction => prediction, :confidence => confidence} @@ -125,18 +126,18 @@ module OpenTox # Uses propositionalized setting. # Not to be called directly (use local_svm_regression or local_svm_classification). # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ] - # @param [Array] acts, activities for neighbors. + # @param [Array] activities, activities for neighbors. # @param [Float] min_train_performance, parameter to control censoring # @return [Numeric] A prediction value. - def self.local_svm_prop(props, acts, min_train_performance) + def self.local_svm_prop(props, activities, min_train_performance) $logger.debug "Local SVM (Propositionalization / Kernlab Kernel)." n_prop = props[0] # is a matrix, i.e. two nested Arrays. q_prop = props[1] # is an Array. prediction = nil - if acts.uniq.size == 1 - prediction = acts[0] + if activities.uniq.size == 1 + prediction = activities[0] else #$logger.debug gram_matrix.to_yaml @r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests @@ -151,7 +152,7 @@ module OpenTox @r.n_prop = n_prop.flatten @r.n_prop_x_size = n_prop.size @r.n_prop_y_size = n_prop[0].size - @r.y = acts + @r.y = activities @r.q_prop = q_prop #@r.eval "y = matrix(y)" @r.eval "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)" @@ -196,22 +197,29 @@ module OpenTox EOR - # prediction - $logger.debug "Predicting ..." - @r.eval "predict(model,q_prop); p = predict(model,q_prop)" # kernlab bug: predict twice - @r.eval "if (class(y)!='numeric') p = as.character(p)" - prediction = @r.p - - # censoring - prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance.to_f ) - prediction = nil if prediction =~ /NA/ - prediction = nil unless train_success - $logger.debug "Performance: '#{sprintf("%.2f", @r.perf)}'" - #rescue Exception => e - #$logger.debug "#{e.class}: #{e.message}" - #$logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + if train_success + # prediction + $logger.debug "Predicting ..." + @r.eval "predict(model,q_prop); p = predict(model,q_prop)" # kernlab bug: predict twice + #@r.eval "p = predict(model,q_prop)" # kernlab bug: predict twice + @r.eval "if (class(y)!='numeric') p = as.character(p)" + prediction = @r.p + + # censoring + prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance.to_f ) + prediction = nil if prediction =~ /NA/ + $logger.debug "Performance: '#{sprintf("%.2f", @r.perf)}'" + else + $logger.debug "Model creation failed." + prediction = nil + end + rescue Exception => e + $logger.debug "#{e.class}: #{e.message}" + $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" ensure - @r.quit # free R + #puts @r.inspect + #TODO: broken pipe + #@r.quit # free R end end prediction diff --git a/lib/similarity.rb b/lib/similarity.rb index 22b4c28..cdac4b8 100644 --- a/lib/similarity.rb +++ b/lib/similarity.rb @@ -6,7 +6,7 @@ =end module OpenTox - class Algorithm + module Algorithm class Similarity diff --git a/lib/transform.rb b/lib/transform.rb index b2f7e7e..7b92df5 100644 --- a/lib/transform.rb +++ b/lib/transform.rb @@ -6,7 +6,7 @@ =end module OpenTox - class Algorithm + module Algorithm class Transform # Uses Statsample Library (http://ruby-statsample.rubyforge.org/) by C. Bustos @@ -231,7 +231,7 @@ module OpenTox # Attaches transformations to an OpenTox::Model # Stores props, sims, performs similarity calculations class ModelTransformer - attr_accessor :model, :similarity_algorithm, :acts, :sims + attr_accessor :model, :similarity_algorithm, :activities, :sims # @params[OpenTox::Model] model Model to transform def initialize model @@ -241,7 +241,7 @@ module OpenTox # Transforms the model def transform - get_matrices # creates @n_prop, @q_prop, @acts from ordered fps + get_matrices # creates @n_prop, @q_prop, @activities from ordered fingerprints @ids = (0..((@n_prop.length)-1)).to_a # surviving compounds; become neighbors if (@model.similarity_algorithm =~ /cosine/) @@ -258,9 +258,9 @@ module OpenTox $logger.debug "M: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" # adjust rest - fps_tmp = []; @ids.each { |idx| fps_tmp << @fps[idx] }; @fps = fps_tmp - cmpds_tmp = []; @ids.each { |idx| cmpds_tmp << @cmpds[idx] }; @cmpds = cmpds_tmp - acts_tmp = []; @ids.each { |idx| acts_tmp << @acts[idx] }; @acts = acts_tmp + #fingerprints_tmp = []; @ids.each { |idx| fingerprints_tmp << @fingerprints[idx] }; @fingerprints = fingerprints_tmp + compounds_tmp = []; @ids.each { |idx| compounds_tmp << @compounds[idx] }; @compounds = compounds_tmp + acts_tmp = []; @ids.each { |idx| acts_tmp << @activities[idx] }; @activities = acts_tmp # scale and svd nr_cases, nr_features = @n_prop.size, @n_prop[0].size @@ -284,7 +284,7 @@ module OpenTox @sims = [] # calculated by neighbor routine neighbors n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp # select neighbors from matrix - acts_tmp = []; @ids.each { |idx| acts_tmp << @acts[idx] }; @acts = acts_tmp + acts_tmp = []; @ids.each { |idx| acts_tmp << @activities[idx] }; @activities = acts_tmp # Sims between neighbors, if necessary @@ -312,7 +312,7 @@ module OpenTox end $logger.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" if (@n_prop && @n_prop[0] && @q_prop) - $logger.debug "Sims: #{@sims.size}, Acts: #{@acts.size}" + $logger.debug "Sims: #{@sims.size}, Acts: #{@activities.size}" @sims = [ gram_matrix, @sims ] @@ -334,13 +334,13 @@ module OpenTox # @param[Array] training_props Propositionalized data for this neighbor # @param[Integer] Index of neighbor def add_neighbor(training_props, idx) - unless @model.acts[idx].nil? + unless @model.training_activities[idx].nil? sim = similarity(training_props) if sim > @model.min_sim.to_f @model.neighbors << { - :compound => @cmpds[idx], + :compound => @compounds[idx], :similarity => sim, - :activity => acts[idx] + :activity => activities[idx] } @sims << sim @ids << idx @@ -400,11 +400,12 @@ module OpenTox # Converts fingerprints to matrix, order of rows by fingerprints. nil values allowed. # Same for compound fingerprints. def get_matrices - @cmpds = @model.cmpds - @fps = @model.fps - @acts = @model.acts - @n_prop = @model.n_prop - @q_prop = @model.q_prop + @compounds = @model.training_compounds + puts @compounds.inspect + #@fingerprints = @model.fingerprints + @activities = @model.training_activities + @n_prop = @model.training_fingerprints + @q_prop = @model.query_fingerprint end # Returns propositionalized data, if appropriate, or nil |