summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2010-11-04 12:30:54 +0100
committerChristoph Helma <helma@in-silico.ch>2010-11-04 12:30:54 +0100
commit28aac60d38678340128a54ffe99bc56401561419 (patch)
treeb39fa4396e7d0b7d232335b9e5ff090cfb5c600b
parente72bba4cdaa6fd68d62b567e21be730a49963207 (diff)
parente6d45f897d23987d03cff75fe958be342b0311a2 (diff)
last merged from andi
-rw-r--r--.gitmodules3
-rw-r--r--Rakefile98
-rw-r--r--application.rb3
-rw-r--r--balancer.rb98
-rw-r--r--fminer.rb316
m---------last-utils0
-rw-r--r--lazar.rb1
m---------libfminer0
8 files changed, 393 insertions, 126 deletions
diff --git a/.gitmodules b/.gitmodules
index 3330d61..75218e9 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
[submodule "libfminer"]
path = libfminer
url = http://github.com/amaunz/fminer2.git
+[submodule "last-utils"]
+ path = last-utils
+ url = git://github.com/amaunz/last-utils.git
diff --git a/Rakefile b/Rakefile
index 70139f4..efadf01 100644
--- a/Rakefile
+++ b/Rakefile
@@ -4,41 +4,75 @@ require 'opentox-ruby-api-wrapper'
#require 'tasks/opentox'
namespace "fminer" do
- desc "Install required gems and fminer"
- task :install do
- puts `git submodule init`
- puts `git submodule update`
- Dir.chdir('libfminer/libbbrc')
- puts `git checkout master`
- puts `git pull`
- puts `./configure`
- if $? == 0
- puts `echo "Fminer successfully configured."`
- else
- puts `echo "Fminer configuration failed!"`
- exit
- end
- puts `make ruby`
- end
+ desc "Install required gems and fminer"
+ task :install do
+ puts `git submodule init`
+ puts `git submodule update`
+ Dir.chdir('libfminer/libbbrc')
+ puts `git checkout master`
+ puts `git pull`
+ puts `./configure`
+ if $? == 0
+ puts `echo "Fminer/LibBbrc successfully configured."`
+ else
+ puts `echo "Fminer/LibBbrc configuration failed!"`
+ exit
+ end
+ puts `make ruby`
+ Dir.chdir('../liblast')
+ puts `git checkout master`
+ puts `git pull`
+ puts `./configure`
+ if $? == 0
+ puts `echo "Fminer/LibLast successfully configured."`
+ else
+ puts `echo "Fminer/LibLast configuration failed!"`
+ exit
+ end
+ puts `make ruby`
+ Dir.chdir('../../last-utils')
+ puts `git fetch`
+ # AM LAST: need branch 'experimental' until merged to master in last-utils
+ puts `git checkout -f -b experimental origin/experimental`
+ puts `git checkout experimental`
+ puts `git pull`
+ end
- desc "Update gems and fminer"
- task :update do
- puts `git submodule update`
- Dir.chdir('libfminer/libbbrc')
- puts `git checkout master`
- puts `git pull`
- puts `./configure`
- if $? == 0
- puts `echo "Fminer successfully configured."`
- else
- puts `echo "Fminer configuration failed!"`
- exit
- end
- puts `make ruby`
- end
+ desc "Update gems and fminer"
+ task :update do
+ puts `git submodule update`
+ Dir.chdir('libfminer/libbbrc')
+ puts `git checkout Makefile`
+ puts `git pull`
+ puts `./configure`
+ if $? == 0
+ puts `echo "Fminer/LibBbrc successfully configured."`
+ else
+ puts `echo "Fminer/LibBbrc configuration failed!"`
+ exit
+ end
+ puts `make ruby`
+ Dir.chdir('../liblast')
+ puts `git checkout Makefile`
+ puts `git pull`
+ puts `./configure`
+ if $? == 0
+ puts `echo "Fminer/LibLast successfully configured."`
+ else
+ puts `echo "Fminer/LibLast configuration failed!"`
+ exit
+ end
+ puts `make ruby`
+ Dir.chdir('../../last-utils')
+ puts `git fetch`
+ # AM LAST: need branch 'experimental' until merged to master in last-utils
+ puts `git checkout -f -b experimental origin/experimental`
+ puts `git checkout experimental`
+ puts `git pull`
+ end
end
desc "Run tests"
task :test do
- load 'test/test.rb'
+ load 'test/test.rb'
end
diff --git a/application.rb b/application.rb
index d2a21c6..8e0a573 100644
--- a/application.rb
+++ b/application.rb
@@ -1,5 +1,8 @@
require 'rubygems'
+# AM LAST: can include both libs, no problems
require File.join(File.expand_path(File.dirname(__FILE__)), 'libfminer/libbbrc/bbrc') # has to be included before openbabel, otherwise we have strange SWIG overloading problems
+require File.join(File.expand_path(File.dirname(__FILE__)), 'libfminer/liblast/last') # has to be included before openbabel, otherwise we have strange SWIG overloading problems
+require File.join(File.expand_path(File.dirname(__FILE__)), 'last-utils/lu.rb') # AM LAST
gem "opentox-ruby-api-wrapper", "= 1.6.6"
require 'opentox-ruby-api-wrapper'
diff --git a/balancer.rb b/balancer.rb
new file mode 100644
index 0000000..4ed2fd7
--- /dev/null
+++ b/balancer.rb
@@ -0,0 +1,98 @@
+# cuts a classification dataset into balanced pieces
+# let inact_act_ratio := inactive_class.size/active_class.size
+# then: nr pieces = ceil(inact_act_ratio) if inact_act_ratio >= 1.5, or ceil(1/inact_act_ratio) if inact_act_ratio <= 2/3 (no split otherwise)
+# each piece contains the complete minority class and one of the (nearly) equal-sized chunks of the shuffled majority class.
+
+class Balancer
+
+ attr_accessor :inact_act_ratio, :act_hash, :inact_hash, :majority_splits, :nr_majority_splits, :errors, :datasets
+
+ # Supply a OpenTox::Dataset here
+ # Calculates inact_act_ratio, iff inact_act_ratio != +/-Infinity and no regression dataset is given
+ def initialize(dataset, feature_uri, creator_url)
+ @act_arr = []
+ @inact_arr = []
+ @inact_act_ratio = 1.0/0 # trick to define +infinity
+ @nr_majority_splits = 1 # +/-1 means: no split
+ @split = [] # splitted arrays with ids
+ @datasets = [] # result datasets
+ @errors = []
+
+ classification = true
+ if dataset.features.include?(feature_uri)
+ dataset.data.each do |i,a|
+ inchi = i
+ acts = a
+ acts.each do |act|
+ value = act[feature_uri]
+ if OpenTox::Utils.is_true?(value)
+ @act_arr << inchi
+ elsif OpenTox::Utils.classification?(value)
+ @inact_arr << inchi
+ else
+ classification = false
+ break;
+ end
+ end
+ end
+ @inact_act_ratio = @inact_arr.size.to_f / @act_arr.size.to_f unless (@act_arr.size == 0 or !classification) # leave alone for regression
+ set_nr_majority_splits
+ # perform majority split
+ @split = @nr_majority_splits > 0 ? shuffle_split(@inact_arr) : shuffle_split(@act_arr) unless @nr_majority_splits.abs == 1
+ @split.each do |s|
+ new_c = @nr_majority_splits > 0 ? s.concat(@act_arr) : s.concat(@inac_arr)
+ @datasets << dataset.create_new_dataset(new_c, [feature_uri], dataset.title, creator_url)
+ end
+
+ else
+ errors << "Feature not present in dataset."
+ end
+ errors << "Can not split regression dataset." unless classification
+ end
+
+
+
+ # sets nr of splits for majority class ('+', if inact_cnt > act_cnt, or '-' else), or leaves unchanged for illegal values.
+ def set_nr_majority_splits
+ @nr_majority_splits = @inact_act_ratio >= 1.5 ? @inact_act_ratio.ceil : ( @inact_act_ratio <= (2.0/3.0) ? -(1.0/@inact_act_ratio).ceil : ( @inact_act_ratio>1.0 ? 1 : -1) ) unless OpenTox::Utils.infinity?(@inact_act_ratio) # leave alone for regression
+ end
+
+ # does the actual shuffle and split
+ def shuffle_split (arr)
+ arr = arr.shuffle
+ arr.chunk(@nr_majority_splits.abs)
+ end
+
+ # turns a hash into a 2 col csv
+ def hsh2csv (hsh)
+ res=""
+ hsh.each do |k,v|
+ arr = [v,(@nr_majority_splits > 0 ? 0 : 1)]
+ res += arr.join(", ") + "\n"
+ end
+ res
+ end
+
+end
+
+class Array
+
+ # cuts an array into <pieces> contiguous chunks whose lengths differ by at most one - returns a two-dimensional array
+ def chunk(pieces)
+ q, r = length.divmod(pieces) # q = base chunk length, r = number of leading chunks that get one extra element
+ (0..pieces).map { |i| i * q + [r, i].min }.enum_cons(2) \
+ .map { |a, b| slice(a...b) } # NOTE(review): enum_cons is not available in Ruby >= 1.9 (each_cons is the replacement) - confirm target Ruby version
+ end
+
+ # returns a shuffled copy of the array; a given seed is passed to srand first
+ def shuffle( seed=nil )
+ srand seed.to_i if seed
+ sort_by { Kernel.rand }
+ end
+
+ # shuffles self in place by replacing its contents with the result of shuffle
+ def shuffle!( seed=nil )
+ self.replace shuffle( seed )
+ end
+
+end
diff --git a/fminer.rb b/fminer.rb
index b5956c5..3ba3057 100644
--- a/fminer.rb
+++ b/fminer.rb
@@ -1,8 +1,6 @@
ENV['FMINER_SMARTS'] = 'true'
ENV['FMINER_NO_AROMATIC'] = 'true'
ENV['FMINER_PVALUES'] = 'true'
-@@fminer = Bbrc::Bbrc.new
-@@fminer.SetMinfreq(5)
get '/fminer/?' do
@@ -26,23 +24,164 @@ get '/fminer/?' do
end
-post '/fminer/?' do
+#post '/fminer/?' do
+['/fminer/bbrc/?','/fminer/?'].each do |path| # AM LAST: set bbrc as default
+ post path do
- halt 404, "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil?
- halt 404, "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil?
- prediction_feature = params[:prediction_feature]
+ @@fminer = Bbrc::Bbrc.new
+ @@fminer.SetMinfreq(5)
+ @@fminer.SetConsoleOut(false)
+
+ halt 404, "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil?
+ halt 404, "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil?
+ prediction_feature = params[:prediction_feature]
+
+ training_dataset = OpenTox::Dataset.new "#{params[:dataset_uri]}"
+ training_dataset.load_all
+ halt 404, "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless training_dataset.features and training_dataset.features.include?(params[:prediction_feature])
+
+ task_uri = OpenTox::Task.as_task("Mining BBRC features", url_for('/fminer',:full)) do
+
+ feature_dataset = OpenTox::Dataset.new
+ feature_dataset.add_metadata({
+ DC.title => "BBRC representatives for " + training_dataset.metadata[DC.title],
+ DC.creator => url_for('/fminer/bbrc',:full),
+ OT.hasSource => url_for('/fminer/bbrc', :full),
+ })
+ feature_dataset.add_parameters({
+ "dataset_uri" => params[:dataset_uri],
+ "prediction_feature" => params[:prediction_feature]
+ })
+ feature_dataset.save
+
+ id = 1 # fminer start id is not 0
+ compounds = []
+ nr_active=0
+ nr_inactive=0
+ all_activities = Hash.new# DV: for effect calculation in regression part
+
+ @@fminer.Reset
+ training_dataset.data_entries.each do |compound,entry|
+ begin
+ smiles = OpenTox::Compound.new(compound.to_s).smiles
+ rescue
+ LOGGER.warn "No resource for #{compound.to_s}"
+ next
+ end
+ if smiles == '' or smiles.nil?
+ LOGGER.warn "Cannot find smiles for #{compound.to_s}."
+ next
+ end
+ entry.each do |feature,values|
+ values.each do |value|
+ if value.nil?
+ LOGGER.warn "No #{feature} activiity for #{compound.to_s}."
+ else
+ case value.to_s
+ when "true"
+ nr_active += 1
+ activity = 1
+ when "false"
+ nr_inactive += 1
+ activity = 0
+ else
+ activity = value.to_f
+ @@fminer.SetRegression(true)
+ end
+ begin
+ @@fminer.AddCompound(smiles,id)
+ @@fminer.AddActivity(activity, id)
+ all_activities[id]=activity # DV: insert global information
+ compounds[id] = compound
+ id += 1
+ rescue
+ LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer"
+ end
+ end
+ end
+ end
+ end
+
+ g_array=all_activities.values # DV: calculation of global median for effect calculation
+ g_median=OpenTox::Algorithm.median(g_array)
+
+ raise "No compounds in dataset #{training_dataset.uri}" if compounds.size==0
+
+ features = Set.new
+ # run @@fminer
+ (0 .. @@fminer.GetNoRootNodes()-1).each do |j|
+
+ results = @@fminer.MineRoot(j)
+ results.each do |result|
+ f = YAML.load(result)[0]
+ smarts = f[0]
+ p_value = f[1]
+
+ if (!@@fminer.GetRegression)
+ ids = f[2] + f[3]
+ if f[2].size.to_f/ids.size > nr_active.to_f/(nr_active+nr_inactive)
+ effect = 'activating'
+ else
+ effect = 'deactivating'
+ end
+ else #regression part
+ ids = f[2]
+ # DV: effect calculation
+ f_arr=Array.new
+ f[2].each do |id|
+ f_arr.push(all_activities[id])
+ end
+ f_median=OpenTox::Algorithm.median(f_arr)
+ if g_median >= f_median
+ effect = 'activating'
+ else
+ effect = 'deactivating'
+ end
+ end
+
+ feature_uri = File.join feature_dataset.uri,"feature","bbrc", features.size.to_s
+ unless features.include? smarts
+ features << smarts
+ # TODO insert correct ontology entries
+ metadata = {
+ OT.hasSource => feature_dataset.uri,
+ OT.smarts => smarts,
+ OT.p_value => p_value.to_f,
+ OT.effect => effect }
+ feature_dataset.add_feature feature_uri, metadata
+ end
+ ids.each { |id| feature_dataset.add(compounds[id], feature_uri, true)}
+ end
+ end
+ feature_dataset.save
+ feature_dataset.uri
+ end
+ response['Content-Type'] = 'text/uri-list'
+ halt 202,task_uri.to_s+"\n"
+ end
+end
+
+post '/fminer/last/?' do
+
+ @@fminer = Last::Last.new
+ @@fminer.SetMinfreq(5)
+ @@fminer.SetConsoleOut(false)
+
+ halt 404, "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil?
+ halt 404, "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil?
+ prediction_feature = params[:prediction_feature]
training_dataset = OpenTox::Dataset.new "#{params[:dataset_uri]}"
training_dataset.load_all
- halt 404, "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless training_dataset.features and training_dataset.features.include?(params[:prediction_feature])
+ halt 404, "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless training_dataset.features and training_dataset.features.include?(params[:prediction_feature])
- task_uri = OpenTox::Task.as_task("Mining BBRC features", url_for('/fminer',:full)) do
+ task_uri = OpenTox::Task.as_task("Mining LAST features", url_for('/fminer',:full)) do
- feature_dataset = OpenTox::Dataset.new
+ feature_dataset = OpenTox::Dataset.new
feature_dataset.add_metadata({
- DC.title => "BBRC representatives for " + training_dataset.metadata[DC.title],
- DC.creator => url_for('/fminer',:full),
- OT.hasSource => url_for('/fminer', :full),
+ DC.title => "LAST representatives for " + training_dataset.metadata[DC.title],
+ DC.creator => url_for('/fminer/last',:full),
+ OT.hasSource => url_for('/fminer/last', :full),
})
feature_dataset.add_parameters({
"dataset_uri" => params[:dataset_uri],
@@ -50,109 +189,98 @@ post '/fminer/?' do
})
feature_dataset.save
- id = 1 # fminer start id is not 0
- compounds = []
+ id = 1 # fminer start id is not 0
+ compounds = []
+ smi = [] # AM LAST: needed for matching the patterns back
nr_active=0
nr_inactive=0
- g_hash = Hash.new# DV: for effect calculation in regression part
+ all_activities = Hash.new# DV: for effect calculation in regression part
- @@fminer.Reset
+ @@fminer.Reset
training_dataset.data_entries.each do |compound,entry|
- begin
- smiles = OpenTox::Compound.new(compound.to_s).smiles
- rescue
- LOGGER.warn "No resource for #{compound.to_s}"
- next
- end
- if smiles == '' or smiles.nil?
- LOGGER.warn "Cannot find smiles for #{compound.to_s}."
+ begin
+ smiles = OpenTox::Compound.new(compound.to_s).smiles
+ rescue
+ LOGGER.warn "No resource for #{compound.to_s}"
+ next
+ end
+ if smiles == '' or smiles.nil?
+ LOGGER.warn "Cannot find smiles for #{compound.to_s}."
next
end
entry.each do |feature,values|
values.each do |value|
- if value.nil?
- LOGGER.warn "No #{feature} activiity for #{compound.to_s}."
- else
- case value.to_s
- when "true"
+ if value.nil?
+ LOGGER.warn "No #{feature} activiity for #{compound.to_s}."
+ else
+ case value.to_s
+ when "true"
nr_active += 1
- activity = 1
- when "false"
+ activity = 1
+ when "false"
nr_inactive += 1
- activity = 0
- else
- activity = value.to_f
- @@fminer.SetRegression(true)
- end
- begin
- @@fminer.AddCompound(smiles,id)
- @@fminer.AddActivity(activity, id)
- g_hash[id]=activity # DV: insert global information
+ activity = 0
+ else
+ activity = value.to_f
+ @@fminer.SetRegression(true)
+ end
+ begin
+ @@fminer.AddCompound(smiles,id)
+ @@fminer.AddActivity(activity, id)
+ all_activities[id]=activity # DV: insert global information
compounds[id] = compound
+ smi[id] = smiles # AM LAST: changed this to store SMILES.
id += 1
- rescue
- LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer"
- end
+ rescue
+ LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer"
+ end
end
end
end
end
- g_array=g_hash.values # DV: calculation of global median for effect calculation
+ g_array=all_activities.values # DV: calculation of global median for effect calculation
g_median=OpenTox::Algorithm.median(g_array)
-
- # TODO read from params
+
raise "No compounds in dataset #{training_dataset.uri}" if compounds.size==0
+ # run @@fminer
features = Set.new
- # run @@fminer
- (0 .. @@fminer.GetNoRootNodes()-1).each do |j|
-
- results = @@fminer.MineRoot(j)
- results.each do |result|
- f = YAML.load(result)[0]
- smarts = f[0]
- p_value = f[1]
-
- if (!@@fminer.GetRegression)
- ids = f[2] + f[3]
- if f[2].size.to_f/ids.size > nr_active.to_f/(nr_active+nr_inactive)
- effect = 'activating'
- else
- effect = 'deactivating'
- end
- else #regression part
- ids = f[2]
- # DV: effect calculation
- f_arr=Array.new
- f[2].each do |id|
- f_arr.push(g_hash[id])
- end
- f_median=OpenTox::Algorithm.median(f_arr)
- if g_median >= f_median
- effect = 'activating'
- else
- effect = 'deactivating'
- end
- end
+ xml = ""
- feature_uri = File.join feature_dataset.uri,"feature","bbrc", features.size.to_s
- unless features.include? smarts
- features << smarts
- # TODO insert correct ontology entries
- metadata = {
- OT.hasSource => feature_dataset.uri,
- OT.smarts => smarts,
- OT.p_value => p_value.to_f,
- OT.effect => effect }
- feature_dataset.add_feature feature_uri, metadata
- end
- ids.each { |id| feature_dataset.add(compounds[id], feature_uri, true)}
- end
- end
- feature_dataset.save
+ (0 .. @@fminer.GetNoRootNodes()-1).each do |j|
+ results = @@fminer.MineRoot(j)
+ results.each do |result|
+ xml << result
+ end
+ end
+
+ lu = LU.new # AM LAST: uses last-utils here
+ dom=lu.read(xml) # AM LAST: parse GraphML (needs hpricot, @ch: to be included in wrapper!)
+ smarts=lu.smarts_rb(dom,'msa') # AM LAST: converts patterns to LAST-SMARTS using msa variant (see last-pm.maunz.de)
+ instances=lu.match_rb(smi,smarts) # AM LAST: creates instantiations
+ instances.each do |smarts, ids|
+ feat_hash = Hash[*(all_activities.select { |k,v| ids.include?(k) }.flatten)] # AM LAST: get activities of feature occurrences; see http://www.softiesonrails.com/2007/9/18/ruby-201-weird-hash-syntax
+ @@fminer.GetRegression() ? p_value = @@fminer.KSTest(all_activities.values, feat_hash.values).to_f : p_value = @@fminer.ChisqTest(all_activities.values, feat_hash.values).to_f # AM LAST: use internal function for test
+
+
+ effect = (p_value > 0) ? "activating" : "deactivating"
+ feature_uri = File.join feature_dataset.uri,"feature","last", features.size.to_s
+ unless features.include? smarts
+ features << smarts
+ metadata = {
+ OT.hasSource => feature_dataset.uri,
+ OT.smarts => smarts,
+ OT.p_value => p_value.to_f,
+ OT.effect => effect
+ }
+ feature_dataset.add_feature feature_uri, metadata
+ end
+ ids.each { |id| feature_dataset.add(compounds[id], feature_uri, true)}
+ end
+ feature_dataset.save
feature_dataset.uri
- end
- response['Content-Type'] = 'text/uri-list'
- halt 202,task_uri.to_s+"\n"
+ end
+ response['Content-Type'] = 'text/uri-list'
+ halt 202,task_uri.to_s+"\n"
end
diff --git a/last-utils b/last-utils
new file mode 160000
+Subproject d2ad4f2bb82fdb5433d3f739400244ba89f0786
diff --git a/lazar.rb b/lazar.rb
index 98e0aa7..9fbc679 100644
--- a/lazar.rb
+++ b/lazar.rb
@@ -89,6 +89,7 @@ post '/lazar/?' do # create a model
halt 202,task_uri
end
+
post '/property_lazar/?' do # create a model
LOGGER.debug "Dataset: '" + params[:dataset_uri].to_s + "'"
diff --git a/libfminer b/libfminer
-Subproject e955cc6b24d577d7187e5660716ee69d12174a8
+Subproject e0eee431ecb954328ff64e3cc48840c7003a276