summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Vorgrimmler <vorgrimmlerdavid@gmx.de>2012-06-05 16:04:32 +0200
committerDavid Vorgrimmler <vorgrimmlerdavid@gmx.de>2012-06-05 16:04:32 +0200
commit62ccab084fca19172368866ef19f11dd228a4d75 (patch)
tree12c7128aa288ce8c4e1233499c19f81b2509ef08
parent3bf8011b4eb59c4d40d660bf45d80f0b270674e5 (diff)
parent1ea44d57e59b3bd80b475961d029b673e76dea31 (diff)
Merge branch 'bbrc-sample' into development
-rw-r--r--.gitmodules5
-rw-r--r--application.rb3
m---------bbrc-sample0
-rw-r--r--fminer.rb454
m---------last-utils0
5 files changed, 405 insertions, 57 deletions
diff --git a/.gitmodules b/.gitmodules
index 61a4b92..af7a8f4 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -3,4 +3,7 @@
url = git://github.com/amaunz/fminer2.git
[submodule "last-utils"]
path = last-utils
- url = git://github.com/amaunz/last-utils.git
+ url = git://github.com/amaunz/last-utils.git
+[submodule "bbrc-sample"]
+ path = bbrc-sample
+ url = git://github.com/amaunz/bbrc-sample
diff --git a/application.rb b/application.rb
index ef123da..75d1e21 100644
--- a/application.rb
+++ b/application.rb
@@ -20,6 +20,7 @@ require File.join(File.expand_path(File.dirname(__FILE__)), 'last-utils/lu.rb')
gem "opentox-ruby", "~> 3"
require 'opentox-ruby'
require 'rjb'
+require 'rinruby'
# main
@@ -38,7 +39,7 @@ end
#
# @return [text/uri-list] algorithm URIs
get '/?' do
- list = [ url_for('/lazar', :full), url_for('/fminer/bbrc', :full), url_for('/fminer/last', :full), url_for('/feature_selection/rfe', :full), url_for('/pc', :full) ].join("\n") + "\n"
+ list = [ url_for('/lazar', :full), url_for('/fminer/bbrc', :full), url_for('/fminer/bbrc/sample', :full), url_for('/fminer/last', :full), url_for('/fminer/bbrc/match', :full), url_for('/fminer/last/match', :full), url_for('/feature_selection/rfe', :full), url_for('/pc', :full) ].join("\n") + "\n"
case request.env['HTTP_ACCEPT']
when /text\/html/
content_type "text/html"
diff --git a/bbrc-sample b/bbrc-sample
new file mode 160000
+Subproject 0d1d349ac33ae2fcc1bbdf31617ed9132c7527c
diff --git a/fminer.rb b/fminer.rb
index da06fc8..052d20d 100644
--- a/fminer.rb
+++ b/fminer.rb
@@ -4,14 +4,16 @@ ENV['FMINER_PVALUES'] = 'true'
ENV['FMINER_SILENT'] = 'true'
ENV['FMINER_NR_HITS'] = 'true'
-@@bbrc = Bbrc::Bbrc.new
-@@last = Last::Last.new
+@@bbrc = Bbrc::Bbrc.new
+@@last = Last::Last.new
+
+
# Get list of fminer algorithms
#
# @return [text/uri-list] URIs of fminer algorithms
get '/fminer/?' do
- list = [ url_for('/fminer/bbrc', :full), url_for('/fminer/last', :full) ].join("\n") + "\n"
+ list = [ url_for('/fminer/bbrc', :full), url_for('/fminer/bbrc/sample', :full), url_for('/fminer/last', :full), url_for('/fminer/bbrc/match', :full), url_for('/fminer/last/match', :full) ].join("\n") + "\n"
case request.env['HTTP_ACCEPT']
when /text\/html/
content_type "text/html"
@@ -22,6 +24,8 @@ get '/fminer/?' do
end
end
+
+
# Get RDF/XML representation of fminer bbrc algorithm
# @return [application/rdf+xml] OWL-DL representation of fminer bbrc algorithm
get "/fminer/bbrc/?" do
@@ -50,7 +54,40 @@ get "/fminer/bbrc/?" do
content_type "application/x-yaml"
algorithm.to_yaml
else
- response['Content-Type'] = 'application/rdf+xml'
+ response['Content-Type'] = 'application/rdf+xml'
+ algorithm.to_rdfxml
+ end
+end
+
+# Get RDF/XML representation of fminer bbrc sample algorithm
+# @return [application/rdf+xml] OWL-DL representation of fminer bbrc sample algorithm
+get "/fminer/bbrc/sample/?" do
+ algorithm = OpenTox::Algorithm::Generic.new(url_for('/fminer/bbrc/sample',:full))
+ algorithm.metadata = {
+ DC.title => 'fminer backbone refinement class representatives, obtained from samples of a dataset',
+ DC.creator => "andreas@maunz.de",
+# BO.instanceOf => "http://opentox.org/ontology/ist-algorithms.owl#fminer_bbrc",
+ RDF.type => [OT.Algorithm,OTA.PatternMiningSupervised],
+ OT.parameters => [
+ { DC.description => "Dataset URI", OT.paramScope => "mandatory", DC.title => "dataset_uri" },
+ { DC.description => "Feature URI for dependent variable", OT.paramScope => "mandatory", DC.title => "prediction_feature" },
+ { DC.description => "Number of bootstrap samples", OT.paramScope => "optional", DC.title => "num_boots" },
+ { DC.description => "Minimum sampling support", OT.paramScope => "optional", DC.title => "min_sampling_support" },
+ { DC.description => "Minimum frequency", OT.paramScope => "optional", DC.title => "min_frequency" },
+ { DC.description => "Whether subgraphs should be weighted with their occurrence counts in the instances (frequency)", OT.paramScope => "optional", DC.title => "nr_hits" },
+ { DC.description => "BBRC classes, pass 'false' to switch off mining for BBRC representatives.", OT.paramScope => "optional", DC.title => "backbone" },
+ { DC.description => "Chisq estimation method, pass 'mean' to use simple mean estimate for chisq test.", OT.paramScope => "optional", DC.title => "method" }
+ ]
+ }
+ case request.env['HTTP_ACCEPT']
+ when /text\/html/
+ content_type "text/html"
+ OpenTox.text_to_html algorithm.to_yaml
+ when /yaml/
+ content_type "application/x-yaml"
+ algorithm.to_yaml
+ else
+ response['Content-Type'] = 'application/rdf+xml'
algorithm.to_rdfxml
end
end
@@ -81,41 +118,44 @@ get "/fminer/last/?" do
content_type "application/x-yaml"
algorithm.to_yaml
else
- response['Content-Type'] = 'application/rdf+xml'
+ response['Content-Type'] = 'application/rdf+xml'
algorithm.to_rdfxml
end
end
-# Creates same features for dataset <dataset_uri> that have been created
-# with fminer in dataset <feature_dataset_uri>
-# accept params[:nr_hits] as used in other fminer methods
-post '/fminer/:method/match?' do
- raise OpenTox::BadRequestError.new "feature_dataset_uri not given" unless params[:feature_dataset_uri]
- raise OpenTox::BadRequestError.new "dataset_uri not given" unless params[:dataset_uri]
- task = OpenTox::Task.create("Matching features", url_for('/fminer/match',:full)) do |task|
- f_dataset = OpenTox::Dataset.find params[:feature_dataset_uri],@subjectid
- c_dataset = OpenTox::Dataset.find params[:dataset_uri],@subjectid
- res_dataset = OpenTox::Dataset.create CONFIG[:services]["dataset"],@subjectid
- f_dataset.features.each do |f,m|
- res_dataset.add_feature(f,m)
- end
- c_dataset.compounds.each do |c|
- res_dataset.add_compound(c)
- comp = OpenTox::Compound.new(c)
- f_dataset.features.each do |f,m|
- if params[:nr_hits] == "true"
- hits = comp.match_hits([m[OT.smarts]])
- res_dataset.add(c,f,hits[m[OT.smarts]]) if hits[m[OT.smarts]]
- else
- res_dataset.add(c,f,1) if comp.match?(m[OT.smarts])
- end
- end
- end
- res_dataset.save @subjectid
- res_dataset.uri
+
+# Get RDF/XML representation of fminer matching algorithm
+# @param [String] dataset_uri URI of the dataset
+# @param [String] feature_dataset_uri URI of the feature dataset (i.e. dependent variable)
+# @param [optional] parameters Accepted parameters are
+# - prediction_feature URI of prediction feature to calculate p-values for
+get "/fminer/:method/match?" do
+ algorithm = OpenTox::Algorithm::Generic.new(url_for("/fminer/#{params[:method]}/match",:full))
+ algorithm.metadata = {
+ DC.title => 'fminer feature matching',
+ DC.creator => "mguetlein@gmail.com, andreas@maunz.de",
+ RDF.type => [OT.Algorithm,OTA.PatternMiningSupervised],
+ OT.parameters => [
+ { DC.description => "Dataset URI", OT.paramScope => "mandatory", DC.title => "dataset_uri" },
+ { DC.description => "Feature Dataset URI", OT.paramScope => "mandatory", DC.title => "feature_dataset_uri" },
+ { DC.description => "Feature URI for dependent variable", OT.paramScope => "optional", DC.title => "prediction_feature" }
+ ]
+ }
+ case request.env['HTTP_ACCEPT']
+ when /text\/html/
+ content_type "text/html"
+ OpenTox.text_to_html algorithm.to_yaml
+ when /application\/x-yaml/
+ content_type "application/x-yaml"
+ algorithm.to_yaml
+ else
+ response['Content-Type'] = 'application/rdf+xml'
+ algorithm.to_rdfxml
end
- return_task(task)
-end
+end
+
+
+
# Run bbrc algorithm on dataset
#
@@ -128,7 +168,7 @@ end
# - min_chisq_significance Significance threshold (between 0 and 1)
# - nr_hits Set to "true" to get hit count instead of presence
# @return [text/uri-list] Task URI
-post '/fminer/bbrc/?' do
+post '/fminer/bbrc/?' do
fminer=OpenTox::Algorithm::Fminer.new
fminer.check_params(params,5,@subjectid)
@@ -144,7 +184,7 @@ post '/fminer/bbrc/?' do
end
@@bbrc.SetMinfreq(fminer.minfreq)
@@bbrc.SetType(1) if params[:feature_type] == "paths"
- @@bbrc.SetBackbone(eval params[:backbone]) if params[:backbone] and ( params[:backbone] == "true" or params[:backbone] == "false" ) # convert string to boolean
+ @@bbrc.SetBackbone(false) if params[:backbone] == "false"
@@bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance]
@@bbrc.SetConsoleOut(false)
@@ -155,7 +195,11 @@ post '/fminer/bbrc/?' do
OT.hasSource => url_for('/fminer/bbrc', :full),
OT.parameters => [
{ DC.title => "dataset_uri", OT.paramValue => params[:dataset_uri] },
- { DC.title => "prediction_feature", OT.paramValue => params[:prediction_feature] }
+ { DC.title => "prediction_feature", OT.paramValue => params[:prediction_feature] },
+ { DC.title => "min_frequency", OT.paramValue => fminer.minfreq },
+ { DC.title => "nr_hits", OT.paramValue => (params[:nr_hits] == "true" ? "true" : "false") },
+ { DC.title => "backbone", OT.paramValue => (params[:backbone] == "false" ? "false" : "true") }
+
]
})
feature_dataset.save(@subjectid)
@@ -185,20 +229,20 @@ post '/fminer/bbrc/?' do
smarts = f[0]
p_value = f[1]
- if (!@@bbrc.GetRegression)
+ if (!@@bbrc.GetRegression)
id_arrs = f[2..-1].flatten
- max = OpenTox::Algorithm.effect(f[2..-1], fminer.db_class_sizes)
- effect = f[2..-1].size-max
+ max = OpenTox::Algorithm.effect(f[2..-1].reverse, fminer.db_class_sizes) # f needs reversal for bbrc
+ effect = max+1
else #regression part
id_arrs = f[2]
# DV: effect calculation
f_arr=Array.new
f[2].each do |id|
id=id.keys[0] # extract id from hit count hash
- f_arr.push(fminer.all_activities[id])
- end
+ f_arr.push(fminer.all_activities[id])
+ end
f_median=f_arr.to_scale.median
- if g_median >= f_median
+ if g_median >= f_median
effect = 'activating'
else
effect = 'deactivating'
@@ -232,13 +276,13 @@ post '/fminer/bbrc/?' do
end
}
- end # end of
+ end # end of
end # feature parsing
# AM: add feature values for non-present features
- # feature_dataset.complete_data_entries
+ # feature_dataset.complete_data_entries
- feature_dataset.save(@subjectid)
+ feature_dataset.save(@subjectid)
feature_dataset.uri
end
response['Content-Type'] = 'text/uri-list'
@@ -247,6 +291,191 @@ post '/fminer/bbrc/?' do
end
#end
+
+# Run bbrc/sample algorithm on a dataset
+#
+# @param [String] dataset_uri URI of the training dataset
+# @param [String] prediction_feature URI of the prediction feature (i.e. dependent variable)
+# @param [optional] BBRC sample parameters, accepted are
+# - num_boots Number of bootstrap samples (default 150)
+# - min_sampling_support Minimum sampling support (default 30% of num_boots)
+# - min_frequency Minimum frequency (default 10% of dataset size)
+# - nr_hits Whether subgraphs should be weighted with their occurrence counts in the instances (frequency)
+# - random_seed Random seed ensures same datasets in bootBbrc
+# - backbone BBRC classes, pass 'false' to switch off mining for BBRC representatives. (default "true")
+# - method Chisq estimation method, pass 'mean' to use simple mean estimate (default 'mle').
+#
+# @return [text/uri-list] Task URI
+post '/fminer/bbrc/sample/?' do
+
+ fminer=OpenTox::Algorithm::Fminer.new
+ fminer.check_params(params,100,@subjectid) # AM: 100 per-mil (10%) as default minfreq
+
+ # num_boots
+ unless params[:num_boots]
+ num_boots = 150
+ LOGGER.debug "Set num_boots to default value #{num_boots}"
+ else
+ raise OpenTox::BadRequestError.new "num_boots is not numeric" unless OpenTox::Algorithm.numeric? params[:num_boots]
+ num_boots = params[:num_boots].to_i.ceil
+ end
+
+ # min_sampling_support
+ unless params[:min_sampling_support]
+ min_sampling_support = (num_boots * 0.3).ceil
+ LOGGER.debug "Set min_sampling_support to default value #{min_sampling_support}"
+ else
+ raise OpenTox::BadRequestError.new "min_sampling_support is not numeric" unless OpenTox::Algorithm.numeric? params[:min_sampling_support]
+ min_sampling_support= params[:min_sampling_support].to_i.ceil
+ end
+
+ # random_seed
+ unless params[:random_seed]
+ random_seed = 1
+ LOGGER.debug "Set random seed to default value #{random_seed}"
+ else
+ raise OpenTox::BadRequestError.new "random_seed is not numeric" unless OpenTox::Algorithm.numeric? params[:random_seed]
+ random_seed= params[:random_seed].to_i.ceil
+ end
+
+ # backbone
+ unless params[:backbone]
+ backbone = "true"
+ LOGGER.debug "Set backbone to default value #{backbone}"
+ else
+ raise OpenTox::BadRequestError.new "backbone is neither 'true' nor 'false'" unless (params[:backbone] == "true" or params[:backbone] == "false")
+ backbone = params[:backbone]
+ end
+
+ # method
+ unless params[:method]
+ method="mle"
+ LOGGER.debug "Set method to default value #{method}"
+ else
+ raise OpenTox::BadRequestError.new "method is neither 'mle' nor 'mean'" unless (params[:method] == "mle" or params[:method] == "mean")
+ method = params[:method]
+ end
+
+ task = OpenTox::Task.create("Mining BBRC sample features", url_for('/fminer',:full)) do |task|
+ if fminer.prediction_feature.feature_type == "regression"
+ raise OpenTox::BadRequestError.new "BBRC sampling is only for classification"
+ else
+ raise "no accept values for dataset '"+fminer.training_dataset.uri.to_s+"' and feature '"+fminer.prediction_feature.uri.to_s+
+ "'" unless fminer.training_dataset.accept_values(fminer.prediction_feature.uri)
+ @value_map=fminer.training_dataset.value_map(fminer.prediction_feature.uri)
+ end
+
+ feature_dataset = OpenTox::Dataset.new(nil, @subjectid)
+ feature_dataset.add_metadata({
+ DC.title => "BBRC representatives for " + fminer.training_dataset.metadata[DC.title].to_s + " (bootstrapped)",
+ DC.creator => url_for('/fminer/bbrc/sample',:full),
+ OT.hasSource => url_for('/fminer/bbrc/sample', :full)
+ })
+ feature_dataset.save(@subjectid)
+
+ # filled by add_fminer_data:
+ fminer.compounds = [] # indexed by id, starting from 1 (not 0)
+ fminer.db_class_sizes = Array.new # for effect calculation
+ fminer.all_activities = Hash.new # for effect calculation, indexed by id, starting from 1 (not 0)
+ fminer.smi = [] # needed for matching the patterns back, indexed by id, starting from 1 (not 0)
+ fminer.add_fminer_data(nil, @value_map) # To only fill in administrative data (no fminer priming) pass 'nil' as instance
+
+ raise "No compounds in dataset #{fminer.training_dataset.uri}" if fminer.compounds.size==0
+
+
+ # run bbrc-sample, obtain smarts and p-values
+ features = Set.new
+ task.progress 10
+ @r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests
+ @r.assign "dataset.uri", params[:dataset_uri]
+ @r.assign "prediction.feature.uri", fminer.prediction_feature.uri
+ @r.assign "num.boots", num_boots
+ @r.assign "min.frequency.per.sample", fminer.minfreq
+ @r.assign "min.sampling.support", min_sampling_support
+ @r.assign "random.seed", random_seed
+ @r.assign "backbone", backbone
+ @r.assign "bbrc.service", File.join(CONFIG[:services]["opentox-algorithm"], "fminer/bbrc")
+ @r.assign "dataset.service", CONFIG[:services]["opentox-dataset"]
+ @r.assign "method", method
+ @r.eval "source(\"bbrc-sample/bbrc-sample.R\")"
+ begin
+ @r.eval "bootBbrc(dataset.uri, prediction.feature.uri, num.boots, min.frequency.per.sample, min.sampling.support, NULL, bbrc.service, dataset.service, T, random.seed, as.logical(backbone), method)"
+ smarts = (@r.pull "ans.patterns").collect! { |id| id.gsub(/\'/,"") } # remove extra quotes around smarts
+ r_p_values = @r.pull "ans.p.values"
+ smarts_p_values = {}; smarts.size.times { |i| smarts_p_values[ smarts[i] ] = r_p_values[i] }
+ merge_time = @r.pull "merge.time"
+ n_stripped_mss = @r.pull "n.stripped.mss"
+ n_stripped_cst = @r.pull "n.stripped.cst"
+ rescue Exception => e
+ LOGGER.debug "#{e.class}: #{e.message}"
+ LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ end
+ @r.quit # free R
+
+ # matching
+ task.progress 90
+ lu = LU.new # AM LAST: uses last-utils here
+ params[:nr_hits] == "true" ? hit_count=true: hit_count=false
+ matches, counts = lu.match_rb(fminer.smi,smarts,hit_count) # AM LAST: creates instantiations
+
+ feature_dataset.add_metadata({
+ OT.parameters => [
+ { DC.title => "dataset_uri", OT.paramValue => params[:dataset_uri] },
+ { DC.title => "prediction_feature", OT.paramValue => params[:prediction_feature] },
+ { DC.title => "min_sampling_support", OT.paramValue => min_sampling_support },
+ { DC.title => "num_boots", OT.paramValue => num_boots },
+ { DC.title => "min_frequency_per_sample", OT.paramValue => fminer.minfreq },
+ { DC.title => "nr_hits", OT.paramValue => hit_count.to_s },
+ { DC.title => "merge_time", OT.paramValue => merge_time.to_s },
+ { DC.title => "n_stripped_mss", OT.paramValue => n_stripped_mss.to_s },
+ { DC.title => "n_stripped_cst", OT.paramValue => n_stripped_cst.to_s },
+ { DC.title => "random_seed", OT.paramValue => random_seed.to_s },
+ { DC.title => "backbone", OT.paramValue => backbone.to_s },
+ { DC.title => "method", OT.paramValue => method.to_s }
+ ]
+ })
+
+ matches.each do |smarts, ids|
+ feat_hash = Hash[*(fminer.all_activities.select { |k,v| ids.include?(k) }.flatten)] # AM LAST: get activities of feature occurrences; see http://www.softiesonrails.com/2007/9/18/ruby-201-weird-hash-syntax
+ g = Array.new
+ @value_map.each { |y,act| g[y-1]=Array.new }
+ feat_hash.each { |x,y| g[y-1].push(x) }
+ max = OpenTox::Algorithm.effect(g, fminer.db_class_sizes)
+ effect = max + 1
+ feature_uri = File.join feature_dataset.uri,"feature","bbrc", features.size.to_s
+ unless features.include? smarts
+ features << smarts
+ metadata = {
+ RDF.type => [OT.Feature, OT.Substructure],
+ OT.hasSource => feature_dataset.uri,
+ OT.smarts => smarts,
+ OT.pValue => smarts_p_values[smarts],
+ OT.effect => effect,
+ OT.parameters => [
+ { DC.title => "dataset_uri", OT.paramValue => params[:dataset_uri] },
+ { DC.title => "prediction_feature", OT.paramValue => params[:prediction_feature] }
+ ]
+ }
+ feature_dataset.add_feature feature_uri, metadata
+ end
+ if !hit_count
+ ids.each { |id| feature_dataset.add(fminer.compounds[id], feature_uri, 1)}
+ else
+ ids.each_with_index { |id,i| feature_dataset.add(fminer.compounds[id], feature_uri, counts[smarts][i])}
+ end
+ end
+
+ # AM: add feature values for non-present features
+ # feature_dataset.complete_data_entries
+
+ feature_dataset.save(@subjectid)
+ feature_dataset.uri
+ end
+ response['Content-Type'] = 'text/uri-list'
+ raise OpenTox::ServiceUnavailableError.new(task.uri+"\n") if task.status == "Cancelled"
+ halt 202,task.uri.to_s+"\n"
+end
+
# Run last algorithm on a dataset
#
# @param [String] dataset_uri URI of the training dataset
@@ -282,8 +511,10 @@ post '/fminer/last/?' do
OT.hasSource => url_for('/fminer/last', :full),
OT.parameters => [
{ DC.title => "dataset_uri", OT.paramValue => params[:dataset_uri] },
- { DC.title => "prediction_feature", OT.paramValue => params[:prediction_feature] }
- ]
+ { DC.title => "prediction_feature", OT.paramValue => params[:prediction_feature] },
+ { DC.title => "min_frequency", OT.paramValue => fminer.minfreq },
+ { DC.title => "nr_hits", OT.paramValue => (params[:nr_hits] == "true" ? "true" : "false") }
+ ]
})
feature_dataset.save(@subjectid)
@@ -312,14 +543,14 @@ post '/fminer/last/?' do
end
lu = LU.new # AM LAST: uses last-utils here
- dom=lu.read(xml) # AM LAST: parse GraphML
+ dom=lu.read(xml) # AM LAST: parse GraphML
smarts=lu.smarts_rb(dom,'nls') # AM LAST: converts patterns to LAST-SMARTS using msa variant (see last-pm.maunz.de)
params[:nr_hits] == "true" ? hit_count=true: hit_count=false
matches, counts = lu.match_rb(fminer.smi,smarts,hit_count) # AM LAST: creates instantiations
matches.each do |smarts, ids|
feat_hash = Hash[*(fminer.all_activities.select { |k,v| ids.include?(k) }.flatten)] # AM LAST: get activities of feature occurrences; see http://www.softiesonrails.com/2007/9/18/ruby-201-weird-hash-syntax
- if @@last.GetRegression()
+ if @@last.GetRegression()
p_value = @@last.KSTest(fminer.all_activities.values, feat_hash.values).to_f # AM LAST: use internal function for test
effect = (p_value > 0) ? "activating" : "deactivating"
else
@@ -328,7 +559,7 @@ post '/fminer/last/?' do
@value_map.each { |y,act| g[y-1]=Array.new }
feat_hash.each { |x,y| g[y-1].push(x) }
max = OpenTox::Algorithm.effect(g, fminer.db_class_sizes)
- effect = g.size-max
+ effect = max+1
end
feature_uri = File.join feature_dataset.uri,"feature","last", features.size.to_s
unless features.include? smarts
@@ -343,23 +574,136 @@ post '/fminer/last/?' do
{ DC.title => "dataset_uri", OT.paramValue => params[:dataset_uri] },
{ DC.title => "prediction_feature", OT.paramValue => params[:prediction_feature] }
]
- }
+ }
feature_dataset.add_feature feature_uri, metadata
end
if !hit_count
ids.each { |id| feature_dataset.add(fminer.compounds[id], feature_uri, 1)}
else
- ids.each_with_index { |id,i| feature_dataset.add(fminer.compounds[id], feature_uri, counts[smarts][i])}
+ ids.each_with_index { |id,i| feature_dataset.add(fminer.compounds[id], feature_uri, counts[smarts][i])}
end
end
# AM: add feature values for non-present features
- # feature_dataset.complete_data_entries
+ # feature_dataset.complete_data_entries
- feature_dataset.save(@subjectid)
+ feature_dataset.save(@subjectid)
feature_dataset.uri
end
response['Content-Type'] = 'text/uri-list'
raise OpenTox::ServiceUnavailableError.new(task.uri+"\n") if task.status == "Cancelled"
halt 202,task.uri.to_s+"\n"
end
+
+# Matches features of a feature dataset onto instances of another dataset.
+# The latter is referred to as 'training dataset', since p-values are computed,
+# if user passes a prediction feature, or if the training dataset has only one feature.
+# The result does not contain the prediction feature.
+# @param [String] dataset_uri URI of the dataset
+# @param [String] feature_dataset_uri URI of the feature dataset (i.e. dependent variable)
+# @param [optional] parameters Accepted parameters are
+# - prediction_feature URI of prediction feature to calculate p-values for
+# @return [text/uri-list] Task URI
+post '/fminer/:method/match?' do
+ raise OpenTox::BadRequestError.new "feature_dataset_uri not given" unless params[:feature_dataset_uri]
+ raise OpenTox::BadRequestError.new "dataset_uri not given" unless params[:dataset_uri]
+
+ training_dataset = OpenTox::Dataset.find "#{params[:dataset_uri]}"
+ unless params[:prediction_feature] # try to read prediction_feature from dataset
+ prediction_feature = OpenTox::Feature.find(training_dataset.features.keys.first) if training_dataset.features.size == 1
+ end
+ prediction_feature = OpenTox::Feature.find(params[:prediction_feature]) if params[:prediction_feature]
+
+ task = OpenTox::Task.create("Matching features", url_for('/fminer/match',:full)) do |task|
+
+ # get endpoint statistics
+ if prediction_feature
+ db_class_sizes = Array.new # for effect calculation
+ all_activities = Hash.new # for effect calculation, indexed by id, starting from 1 (not 0)
+ id = 1
+ training_dataset.compounds.each do |compound|
+ entry=training_dataset.data_entries[compound]
+ entry.each do |feature,values|
+ if feature == prediction_feature.uri
+ values.each { |val|
+ if val.nil?
+ LOGGER.warn "No #{feature} activity for #{compound.to_s}."
+ else
+ if prediction_feature.feature_type == "classification"
+ activity= training_dataset.value_map(prediction_feature.uri).invert[val].to_i # activities are mapped to 1..n
+ db_class_sizes[activity-1].nil? ? db_class_sizes[activity-1]=1 : db_class_sizes[activity-1]+=1 # AM effect
+ elsif prediction_feature.feature_type == "regression"
+ activity= val.to_f
+ end
+ begin
+ all_activities[id]=activity # DV: insert global information
+ id += 1
+ rescue Exception => e
+ LOGGER.warn "Could not add " + compound.to_s + "\t" + val.to_s + " to fminer"
+ LOGGER.warn e.backtrace
+ end
+ end
+ }
+ end
+ end
+ end
+ end
+
+ # Intialize result by adding compounds
+ f_dataset = OpenTox::Dataset.find params[:feature_dataset_uri],@subjectid
+ c_dataset = OpenTox::Dataset.find params[:dataset_uri],@subjectid
+ res_dataset = OpenTox::Dataset.create CONFIG[:services]["dataset"],@subjectid
+ c_dataset.compounds.each do |c|
+ res_dataset.add_compound(c)
+ end
+
+ # Run matching, put data entries in result. Features are recreated.
+ smi = [nil]; smi += c_dataset.compounds.collect { |c| OpenTox::Compound.new(c).to_smiles }
+ smarts = f_dataset.features.collect { |f,m| m[OT.smarts] }
+ params[:nr_hits] == "true" ? hit_count=true: hit_count=false
+ matches, counts = LU.new.match_rb(smi, smarts, hit_count) if smarts.size>0
+
+ f_dataset.features.each do |f,m|
+ if (matches[m[OT.smarts]] && matches[m[OT.smarts]].size>0)
+
+ feature_uri = File.join res_dataset.uri,"feature","match", res_dataset.features.size.to_s
+ metadata = {
+ RDF.type => [OT.Feature, OT.Substructure],
+ OT.hasSource => f_dataset.uri,
+ OT.smarts => m[OT.smarts],
+ OT.parameters => [
+ { DC.title => "dataset_uri", OT.paramValue => params[:dataset_uri] }
+ ]
+ }
+
+ if (prediction_feature)
+ feat_hash = Hash[*(all_activities.select { |k,v| matches[m[OT.smarts]].include?(k) }.flatten)]
+ if prediction_feature.feature_type == "regression"
+ p_value = @@last.KSTest(all_activities.values, feat_hash.values).to_f # AM LAST: use internal function for test
+ effect = (p_value > 0) ? "activating" : "deactivating"
+ else
+ p_value = @@last.ChisqTest(all_activities.values, feat_hash.values).to_f
+ g=Array.new # g is filled in *a*scending activity
+ training_dataset.value_map(prediction_feature.uri).each { |y,act| g[y-1]=Array.new }
+ feat_hash.each { |x,y| g[y-1].push(x) }
+ max = OpenTox::Algorithm.effect(g, db_class_sizes) # db_class_sizes is filled in *a*scending activity
+ effect = max+1
+ end
+ metadata[OT.effect] = effect
+ metadata[OT.pValue] = p_value.abs
+ metadata[OT.parameters] << { DC.title => "prediction_feature", OT.paramValue => prediction_feature.uri }
+ end
+
+ res_dataset.add_feature feature_uri, metadata
+
+ matches[m[OT.smarts]].each_with_index {|id,idx|
+ res_dataset.add(c_dataset.compounds[id-1],feature_uri,counts[m[OT.smarts]][idx])
+ }
+ end
+ end
+ res_dataset.save @subjectid
+ res_dataset.uri
+ end
+ return_task(task)
+end
+
diff --git a/last-utils b/last-utils
-Subproject cf0238477127e54509b6ab8b5c38f50dd6ffce0
+Subproject efcc3f41dd9e2f590a1520dfee3bf709120b2e4