diff options
author | Christoph Helma <helma@in-silico.ch> | 2010-11-04 12:30:54 +0100 |
---|---|---|
committer | Christoph Helma <helma@in-silico.ch> | 2010-11-04 12:30:54 +0100 |
commit | 28aac60d38678340128a54ffe99bc56401561419 (patch) | |
tree | b39fa4396e7d0b7d232335b9e5ff090cfb5c600b | |
parent | e72bba4cdaa6fd68d62b567e21be730a49963207 (diff) | |
parent | e6d45f897d23987d03cff75fe958be342b0311a2 (diff) |
last merged from andi
-rw-r--r-- | .gitmodules | 3 | ||||
-rw-r--r-- | Rakefile | 98 | ||||
-rw-r--r-- | application.rb | 3 | ||||
-rw-r--r-- | balancer.rb | 98 | ||||
-rw-r--r-- | fminer.rb | 316 | ||||
m--------- | last-utils | 0 | ||||
-rw-r--r-- | lazar.rb | 1 | ||||
m--------- | libfminer | 0 |
8 files changed, 393 insertions, 126 deletions
diff --git a/.gitmodules b/.gitmodules index 3330d61..75218e9 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "libfminer"] path = libfminer url = http://github.com/amaunz/fminer2.git +[submodule "last-utils"] + path = last-utils + url = git://github.com/amaunz/last-utils.git @@ -4,41 +4,75 @@ require 'opentox-ruby-api-wrapper' #require 'tasks/opentox' namespace "fminer" do - desc "Install required gems and fminer" - task :install do - puts `git submodule init` - puts `git submodule update` - Dir.chdir('libfminer/libbbrc') - puts `git checkout master` - puts `git pull` - puts `./configure` - if $? == 0 - puts `echo "Fminer successfully configured."` - else - puts `echo "Fminer configuration failed!"` - exit - end - puts `make ruby` - end + desc "Install required gems and fminer" + task :install do + puts `git submodule init` + puts `git submodule update` + Dir.chdir('libfminer/libbbrc') + puts `git checkout master` + puts `git pull` + puts `./configure` + if $? == 0 + puts `echo "Fminer/LibBbrc successfully configured."` + else + puts `echo "Fminer/LibBbrc configuration failed!"` + exit + end + puts `make ruby` + Dir.chdir('../liblast') + puts `git checkout master` + puts `git pull` + puts `./configure` + if $? == 0 + puts `echo "Fminer/LibLast successfully configured."` + else + puts `echo "Fminer/LibLast configuration failed!"` + exit + end + puts `make ruby` + Dir.chdir('../../last-utils') + puts `git fetch` + # AM LAST: need branch 'experimental' until merged to master in last-utils + puts `git checkout -f -b experimental origin/experimental` + puts `git checkout experimental` + puts `git pull` + end - desc "Update gems and fminer" - task :update do - puts `git submodule update` - Dir.chdir('libfminer/libbbrc') - puts `git checkout master` - puts `git pull` - puts `./configure` - if $? == 0 - puts `echo "Fminer successfully configured."` - else - puts `echo "Fminer configuration failed!"` - exit - end - puts `make ruby` - end + desc "Update gems and fminer" + task :update do + puts `git submodule update` + Dir.chdir('libfminer/libbbrc') + puts `git checkout Makefile` + puts `git pull` + puts `./configure` + if $? == 0 + puts `echo "Fminer/LibBbrc successfully configured."` + else + puts `echo "Fminer/LibBbrc configuration failed!"` + exit + end + puts `make ruby` + Dir.chdir('../liblast') + puts `git checkout Makefile` + puts `git pull` + puts `./configure` + if $? == 0 + puts `echo "Fminer/LibLast successfully configured."` + else + puts `echo "Fminer/LibLast configuration failed!"` + exit + end + puts `make ruby` + Dir.chdir('../../last-utils') + puts `git fetch` + # AM LAST: need branch 'experimental' until merged to master in last-utils + puts `git checkout -f -b experimental origin/experimental` + puts `git checkout experimental` + puts `git pull` + end end desc "Run tests" task :test do - load 'test/test.rb' + load 'test/test.rb' end diff --git a/application.rb b/application.rb index d2a21c6..8e0a573 100644 --- a/application.rb +++ b/application.rb @@ -1,5 +1,8 @@ require 'rubygems' +# AM LAST: can include both libs, no problems require File.join(File.expand_path(File.dirname(__FILE__)), 'libfminer/libbbrc/bbrc') # has to be included before openbabel, otherwise we have strange SWIG overloading problems +require File.join(File.expand_path(File.dirname(__FILE__)), 'libfminer/liblast/last') # has to be included before openbabel, otherwise we have strange SWIG overloading problems +require File.join(File.expand_path(File.dirname(__FILE__)), 'last-utils/lu.rb') # AM LAST gem "opentox-ruby-api-wrapper", "= 1.6.6" require 'opentox-ruby-api-wrapper' diff --git a/balancer.rb b/balancer.rb new file mode 100644 index 0000000..4ed2fd7 --- /dev/null +++ b/balancer.rb @@ -0,0 +1,98 @@ +# cuts a classification dataset into balanced pieces +# let inact_act_ratio := majority_class.size/minority_class.size +# then: nr pieces = ceil(inact_act_ratio) if inact_act_ratio > 1.5 +# each piece contains the complete minority class and ceil(inact_act_ratio) majority class compounds. + +class Balancer + + attr_accessor :inact_act_ratio, :act_hash, :inact_hash, :majority_splits, :nr_majority_splits, :errors, :datasets + + # Supply a OpenTox::Dataset here + # Calculates inact_act_ratio, iff inact_act_ratio != +/-Infinity and no regression dataset is given + def initialize(dataset, feature_uri, creator_url) + @act_arr = [] + @inact_arr = [] + @inact_act_ratio = 1.0/0 # trick to define +infinity + @nr_majority_splits = 1 # +/-1 means: no split + @split = [] # splitted arrays with ids + @datasets = [] # result datasets + @errors = [] + + classification = true + if dataset.features.include?(feature_uri) + dataset.data.each do |i,a| + inchi = i + acts = a + acts.each do |act| + value = act[feature_uri] + if OpenTox::Utils.is_true?(value) + @act_arr << inchi + elsif OpenTox::Utils.classification?(value) + @inact_arr << inchi + else + classification = false + break; + end + end + end + @inact_act_ratio = @inact_arr.size.to_f / @act_arr.size.to_f unless (@act_arr.size == 0 or !classification) # leave alone for regression + set_nr_majority_splits + # perform majority split + @split = @nr_majority_splits > 0 ? shuffle_split(@inact_arr) : shuffle_split(@act_arr) unless @nr_majority_splits.abs == 1 + @split.each do |s| + new_c = @nr_majority_splits > 0 ? s.concat(@act_arr) : s.concat(@inac_arr) + @datasets << dataset.create_new_dataset(new_c, [feature_uri], dataset.title, creator_url) + end + + else + errors << "Feature not present in dataset." + end + errors << "Can not split regression dataset." unless classification + end + + + + # sets nr of splits for majority class ('+', if inact_cnt > act_cnt, or '-' else), or leaves unchanged for illegal values. + def set_nr_majority_splits + @nr_majority_splits = @inact_act_ratio >= 1.5 ? @inact_act_ratio.ceil : ( @inact_act_ratio <= (2.0/3.0) ? -(1.0/@inact_act_ratio).ceil : ( @inact_act_ratio>1.0 ? 1 : -1) ) unless OpenTox::Utils.infinity?(@inact_act_ratio) # leave alone for regression + end + + # does the actual shuffle and split + def shuffle_split (arr) + arr = arr.shuffle + arr.chunk(@nr_majority_splits.abs) + end + + # turns a hash into a 2 col csv + def hsh2csv (hsh) + res="" + hsh.each do |k,v| + arr = [v,(@nr_majority_splits > 0 ? 0 : 1)] + res += arr.join(", ") + "\n" + end + res + end + +end + +class Array + + # cuts an array into <num-pieces> chunks - returns a two-dimensional array + def chunk(pieces) + q, r = length.divmod(pieces) + (0..pieces).map { |i| i * q + [r, i].min }.enum_cons(2) \ + .map { |a, b| slice(a...b) } + end + + # shuffles the elements of an array + def shuffle( seed=nil ) + srand seed.to_i if seed + sort_by { Kernel.rand } + end + + # shuffels self + def shuffle!( seed=nil ) + self.replace shuffle( seed ) + end + +end @@ -1,8 +1,6 @@ ENV['FMINER_SMARTS'] = 'true' ENV['FMINER_NO_AROMATIC'] = 'true' ENV['FMINER_PVALUES'] = 'true' -@@fminer = Bbrc::Bbrc.new -@@fminer.SetMinfreq(5) get '/fminer/?' do @@ -26,23 +24,164 @@ get '/fminer/?' do end -post '/fminer/?' do +#post '/fminer/?' do +['/fminer/bbrc/?','/fminer/?'].each do |path| # AM LAST: set bbrc as default + post path do - halt 404, "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil? - halt 404, "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil? - prediction_feature = params[:prediction_feature] + @@fminer = Bbrc::Bbrc.new + @@fminer.SetMinfreq(5) + @@fminer.SetConsoleOut(false) + + halt 404, "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil? + halt 404, "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil? + prediction_feature = params[:prediction_feature] + + training_dataset = OpenTox::Dataset.new "#{params[:dataset_uri]}" + training_dataset.load_all + halt 404, "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless training_dataset.features and training_dataset.features.include?(params[:prediction_feature]) + + task_uri = OpenTox::Task.as_task("Mining BBRC features", url_for('/fminer',:full)) do + + feature_dataset = OpenTox::Dataset.new + feature_dataset.add_metadata({ + DC.title => "BBRC representatives for " + training_dataset.metadata[DC.title], + DC.creator => url_for('/fminer/bbrc',:full), + OT.hasSource => url_for('/fminer/bbrc', :full), + }) + feature_dataset.add_parameters({ + "dataset_uri" => params[:dataset_uri], + "prediction_feature" => params[:prediction_feature] + }) + feature_dataset.save + + id = 1 # fminer start id is not 0 + compounds = [] + nr_active=0 + nr_inactive=0 + all_activities = Hash.new# DV: for effect calculation in regression part + + @@fminer.Reset + training_dataset.data_entries.each do |compound,entry| + begin + smiles = OpenTox::Compound.new(compound.to_s).smiles + rescue + LOGGER.warn "No resource for #{compound.to_s}" + next + end + if smiles == '' or smiles.nil? + LOGGER.warn "Cannot find smiles for #{compound.to_s}." + next + end + entry.each do |feature,values| + values.each do |value| + if value.nil? + LOGGER.warn "No #{feature} activiity for #{compound.to_s}." + else + case value.to_s + when "true" + nr_active += 1 + activity = 1 + when "false" + nr_inactive += 1 + activity = 0 + else + activity = value.to_f + @@fminer.SetRegression(true) + end + begin + @@fminer.AddCompound(smiles,id) + @@fminer.AddActivity(activity, id) + all_activities[id]=activity # DV: insert global information + compounds[id] = compound + id += 1 + rescue + LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer" + end + end + end + end + end + + g_array=all_activities.values # DV: calculation of global median for effect calculation + g_median=OpenTox::Algorithm.median(g_array) + + raise "No compounds in dataset #{training_dataset.uri}" if compounds.size==0 + + features = Set.new + # run @@fminer + (0 .. @@fminer.GetNoRootNodes()-1).each do |j| + + results = @@fminer.MineRoot(j) + results.each do |result| + f = YAML.load(result)[0] + smarts = f[0] + p_value = f[1] + + if (!@@fminer.GetRegression) + ids = f[2] + f[3] + if f[2].size.to_f/ids.size > nr_active.to_f/(nr_active+nr_inactive) + effect = 'activating' + else + effect = 'deactivating' + end + else #regression part + ids = f[2] + # DV: effect calculation + f_arr=Array.new + f[2].each do |id| + f_arr.push(all_activities[id]) + end + f_median=OpenTox::Algorithm.median(f_arr) + if g_median >= f_median + effect = 'activating' + else + effect = 'deactivating' + end + end + + feature_uri = File.join feature_dataset.uri,"feature","bbrc", features.size.to_s + unless features.include? smarts + features << smarts + # TODO insert correct ontology entries + metadata = { + OT.hasSource => feature_dataset.uri, + OT.smarts => smarts, + OT.p_value => p_value.to_f, + OT.effect => effect } + feature_dataset.add_feature feature_uri, metadata + end + ids.each { |id| feature_dataset.add(compounds[id], feature_uri, true)} + end + end + feature_dataset.save + feature_dataset.uri + end + response['Content-Type'] = 'text/uri-list' + halt 202,task_uri.to_s+"\n" + end +end + +post '/fminer/last/?' do + + @@fminer = Last::Last.new + @@fminer.SetMinfreq(5) + @@fminer.SetConsoleOut(false) + + halt 404, "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil? + halt 404, "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil? + prediction_feature = params[:prediction_feature] training_dataset = OpenTox::Dataset.new "#{params[:dataset_uri]}" training_dataset.load_all - halt 404, "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless training_dataset.features and training_dataset.features.include?(params[:prediction_feature]) + halt 404, "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless training_dataset.features and training_dataset.features.include?(params[:prediction_feature]) - task_uri = OpenTox::Task.as_task("Mining BBRC features", url_for('/fminer',:full)) do + task_uri = OpenTox::Task.as_task("Mining LAST features", url_for('/fminer',:full)) do - feature_dataset = OpenTox::Dataset.new + feature_dataset = OpenTox::Dataset.new feature_dataset.add_metadata({ - DC.title => "BBRC representatives for " + training_dataset.metadata[DC.title], - DC.creator => url_for('/fminer',:full), - OT.hasSource => url_for('/fminer', :full), + DC.title => "LAST representatives for " + training_dataset.metadata[DC.title], + DC.creator => url_for('/fminer/last',:full), + OT.hasSource => url_for('/fminer/last', :full), }) feature_dataset.add_parameters({ "dataset_uri" => params[:dataset_uri], @@ -50,109 +189,98 @@ post '/fminer/?' do }) feature_dataset.save - id = 1 # fminer start id is not 0 - compounds = [] + id = 1 # fminer start id is not 0 + compounds = [] + smi = [] # AM LAST: needed for matching the patterns back nr_active=0 nr_inactive=0 - g_hash = Hash.new# DV: for effect calculation in regression part + all_activities = Hash.new# DV: for effect calculation in regression part - @@fminer.Reset + @@fminer.Reset training_dataset.data_entries.each do |compound,entry| - begin - smiles = OpenTox::Compound.new(compound.to_s).smiles - rescue - LOGGER.warn "No resource for #{compound.to_s}" - next - end - if smiles == '' or smiles.nil? - LOGGER.warn "Cannot find smiles for #{compound.to_s}." + begin + smiles = OpenTox::Compound.new(compound.to_s).smiles + rescue + LOGGER.warn "No resource for #{compound.to_s}" + next + end + if smiles == '' or smiles.nil? + LOGGER.warn "Cannot find smiles for #{compound.to_s}." next end entry.each do |feature,values| values.each do |value| - if value.nil? - LOGGER.warn "No #{feature} activiity for #{compound.to_s}." - else - case value.to_s - when "true" + if value.nil? + LOGGER.warn "No #{feature} activiity for #{compound.to_s}." + else + case value.to_s + when "true" nr_active += 1 - activity = 1 - when "false" + activity = 1 + when "false" nr_inactive += 1 - activity = 0 - else - activity = value.to_f - @@fminer.SetRegression(true) - end - begin - @@fminer.AddCompound(smiles,id) - @@fminer.AddActivity(activity, id) - g_hash[id]=activity # DV: insert global information + activity = 0 + else + activity = value.to_f + @@fminer.SetRegression(true) + end + begin + @@fminer.AddCompound(smiles,id) + @@fminer.AddActivity(activity, id) + all_activities[id]=activity # DV: insert global information compounds[id] = compound + smi[id] = smiles # AM LAST: changed this to store SMILES. id += 1 - rescue - LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer" - end + rescue + LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer" + end end end end end - g_array=g_hash.values # DV: calculation of global median for effect calculation + g_array=all_activities.values # DV: calculation of global median for effect calculation g_median=OpenTox::Algorithm.median(g_array) - - # TODO read from params + raise "No compounds in dataset #{training_dataset.uri}" if compounds.size==0 + # run @@fminer features = Set.new - # run @@fminer - (0 .. @@fminer.GetNoRootNodes()-1).each do |j| - - results = @@fminer.MineRoot(j) - results.each do |result| - f = YAML.load(result)[0] - smarts = f[0] - p_value = f[1] - - if (!@@fminer.GetRegression) - ids = f[2] + f[3] - if f[2].size.to_f/ids.size > nr_active.to_f/(nr_active+nr_inactive) - effect = 'activating' - else - effect = 'deactivating' - end - else #regression part - ids = f[2] - # DV: effect calculation - f_arr=Array.new - f[2].each do |id| - f_arr.push(g_hash[id]) - end - f_median=OpenTox::Algorithm.median(f_arr) - if g_median >= f_median - effect = 'activating' - else - effect = 'deactivating' - end - end + xml = "" - feature_uri = File.join feature_dataset.uri,"feature","bbrc", features.size.to_s - unless features.include? smarts - features << smarts - # TODO insert correct ontology entries - metadata = { - OT.hasSource => feature_dataset.uri, - OT.smarts => smarts, - OT.p_value => p_value.to_f, - OT.effect => effect } - feature_dataset.add_feature feature_uri, metadata - end - ids.each { |id| feature_dataset.add(compounds[id], feature_uri, true)} - end - end - feature_dataset.save + (0 .. @@fminer.GetNoRootNodes()-1).each do |j| + results = @@fminer.MineRoot(j) + results.each do |result| + xml << result + end + end + + lu = LU.new # AM LAST: uses last-utils here + dom=lu.read(xml) # AM LAST: parse GraphML (needs hpricot, @ch: to be included in wrapper!) + smarts=lu.smarts_rb(dom,'msa') # AM LAST: converts patterns to LAST-SMARTS using msa variant (see last-pm.maunz.de) + instances=lu.match_rb(smi,smarts) # AM LAST: creates instantiations + instances.each do |smarts, ids| + feat_hash = Hash[*(all_activities.select { |k,v| ids.include?(k) }.flatten)] # AM LAST: get activities of feature occurrences; see http://www.softiesonrails.com/2007/9/18/ruby-201-weird-hash-syntax + @@fminer.GetRegression() ? p_value = @@fminer.KSTest(all_activities.values, feat_hash.values).to_f : p_value = @@fminer.ChisqTest(all_activities.values, feat_hash.values).to_f # AM LAST: use internal function for test + + + effect = (p_value > 0) ? "activating" : "deactivating" + feature_uri = File.join feature_dataset.uri,"feature","last", features.size.to_s + unless features.include? smarts + features << smarts + metadata = { + OT.hasSource => feature_dataset.uri, + OT.smarts => smarts, + OT.p_value => p_value.to_f, + OT.effect => effect + } + feature_dataset.add_feature feature_uri, metadata + end + ids.each { |id| feature_dataset.add(compounds[id], feature_uri, true)} + end + feature_dataset.save feature_dataset.uri - end - response['Content-Type'] = 'text/uri-list' - halt 202,task_uri.to_s+"\n" + end + response['Content-Type'] = 'text/uri-list' + halt 202,task_uri.to_s+"\n" end diff --git a/last-utils b/last-utils new file mode 160000 +Subproject d2ad4f2bb82fdb5433d3f739400244ba89f0786 @@ -89,6 +89,7 @@ post '/lazar/?' do # create a model halt 202,task_uri end + post '/property_lazar/?' do # create a model LOGGER.debug "Dataset: '" + params[:dataset_uri].to_s + "'" diff --git a/libfminer b/libfminer -Subproject e955cc6b24d577d7187e5660716ee69d12174a8 +Subproject e0eee431ecb954328ff64e3cc48840c7003a276 |