Diffstat (limited to 'fminer.rb')
-rw-r--r--  fminer.rb  487
1 files changed, 344 insertions, 143 deletions
diff --git a/fminer.rb b/fminer.rb
index 15379fd..5cc83ed 100644
--- a/fminer.rb
+++ b/fminer.rb
@@ -1,153 +1,354 @@
ENV['FMINER_SMARTS'] = 'true'
ENV['FMINER_NO_AROMATIC'] = 'true'
ENV['FMINER_PVALUES'] = 'true'
-@@fminer = Bbrc::Bbrc.new
+@@bbrc = Bbrc::Bbrc.new
+@@last = Last::Last.new
+
+# Get list of fminer algorithms
+#
+# @return [text/uri-list] URIs of fminer algorithms
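+# @example Hypothetical listing call (a sketch, not part of this service: assumes the rest-client gem and a local Sinatra instance on port 4567)
+#   require 'rest-client'
+#   uris = RestClient.get("http://localhost:4567/fminer", :accept => "text/uri-list").split("\n")
+#   # => ["http://localhost:4567/fminer/bbrc", "http://localhost:4567/fminer/last"]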
get '/fminer/?' do
- owl = OpenTox::Owl.create 'Algorithm', url_for('/fminer',:full)
- owl.set 'title',"fminer"
- owl.set 'creator',"http://github.com/amaunz/fminer2"
- owl.parameters = {
- "Dataset URI" => { :scope => "mandatory", :value => "dataset_uri" },
- "Feature URI for dependent variable" => { :scope => "mandatory", :value => "feature_uri" }
- }
- rdf = owl.rdf
- File.open('public/fminer.owl', 'w') {|f| f.print rdf}
+ response['Content-Type'] = 'text/uri-list'
+ [ url_for('/fminer/bbrc', :full), url_for('/fminer/last', :full) ].join("\n") + "\n"
+end
+
+# Get RDF/XML representation of fminer bbrc algorithm
+# @return [application/rdf+xml] OWL-DL representation of fminer bbrc algorithm
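+# @example Hypothetical metadata request (a sketch: assumes the rest-client gem and a local service URI; the same pattern applies to /fminer/last)
+#   require 'rest-client'
+#   rdf = RestClient.get("http://localhost:4567/fminer/bbrc", :accept => "application/rdf+xml")
+#   puts rdf  # OWL-DL metadata including the parameter list documented above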
+get "/fminer/bbrc/?" do
response['Content-Type'] = 'application/rdf+xml'
- rdf
+ algorithm = OpenTox::Algorithm::Generic.new(url_for('/fminer/bbrc',:full))
+ algorithm.metadata = {
+ DC.title => 'fminer backbone refinement class representatives',
+ DC.creator => "andreas@maunz.de, helma@in-silico.ch",
+ DC.contributor => "vorgrimmlerdavid@gmx.de",
+ OT.isA => OTA.PatternMiningSupervised,
+ OT.parameters => [
+ { DC.description => "Dataset URI", OT.paramScope => "mandatory", DC.title => "dataset_uri" },
+ { DC.description => "Feature URI for dependent variable", OT.paramScope => "mandatory", DC.title => "prediction_feature" },
+ { DC.description => "Minimum frequency", OT.paramScope => "optional", DC.title => "minfreq" },
+ { DC.description => "Feature type, can be 'paths' or 'trees'", OT.paramScope => "optional", DC.title => "feature_type" },
+ { DC.description => "BBRC classes, pass 'false' to switch off mining for BBRC representatives.", OT.paramScope => "optional", DC.title => "backbone" },
+ { DC.description => "Significance threshold (between 0 and 1)", OT.paramScope => "optional", DC.title => "min_chisq_significance" },
+ ]
+ }
+ algorithm.to_rdfxml
end
-post '/fminer/?' do
+# Get RDF/XML representation of fminer last algorithm
+# @return [application/rdf+xml] OWL-DL representation of fminer last algorithm
+get "/fminer/last/?" do
+ algorithm = OpenTox::Algorithm::Generic.new(url_for('/fminer/last',:full))
+ algorithm.metadata = {
+ DC.title => 'fminer latent structure class representatives',
+ DC.creator => "andreas@maunz.de, helma@in-silico.ch",
+ DC.contributor => "vorgrimmlerdavid@gmx.de",
+ OT.isA => OTA.PatternMiningSupervised,
+ OT.parameters => [
+ { DC.description => "Dataset URI", OT.paramScope => "mandatory", DC.title => "dataset_uri" },
+ { DC.description => "Feature URI for dependent variable", OT.paramScope => "mandatory", DC.title => "prediction_feature" },
+ { DC.description => "Minimum frequency", OT.paramScope => "optional", DC.title => "minfreq" },
+ { DC.description => "Feature type, can be 'paths' or 'trees'", OT.paramScope => "optional", DC.title => "feature_type" },
+ { DC.description => "Maximum number of hops", OT.paramScope => "optional", DC.title => "hops" },
+ ]
+ }
+ algorithm.to_rdfxml
+end
+
+# Run bbrc algorithm on dataset
+#
+# @param [String] dataset_uri URI of the training dataset
+# @param [String] prediction_feature URI of the prediction feature (i.e. dependent variable)
+# @param [optional] parameters BBRC parameters; accepted parameters are
+# - min_frequency Minimum frequency (default 5)
+# - feature_type Feature type, can be 'paths' or 'trees' (default "trees")
+# - backbone BBRC classes, pass 'false' to switch off mining for BBRC representatives. (default "true")
+# - min_chisq_significance Significance threshold (between 0 and 1)
+# @return [text/uri-list] Task URI
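+# @example Hypothetical job submission (a sketch: URIs and parameter values are made up, assumes the rest-client gem)
+#   require 'rest-client'
+#   response = RestClient.post("http://localhost:4567/fminer/bbrc",
+#     :dataset_uri => "http://localhost:4567/dataset/1",
+#     :prediction_feature => "http://localhost:4567/dataset/1/feature/activity",
+#     :min_frequency => 8)
+#   task_uri = response.to_s.strip  # returned with status 202; poll this task until it reports the feature dataset URI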
+post '/fminer/bbrc/?' do
+
+ # TODO: is this thread safe??
+ #@@bbrc = Bbrc::Bbrc.new
+ minfreq = params[:min_frequency] ? params[:min_frequency].to_i : 5
+ @@bbrc.SetMinfreq(minfreq)
+ @@bbrc.SetType(1) if params[:feature_type] == "paths"
+ @@bbrc.SetBackbone(false) if params[:backbone] == "false"
+ @@bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance]
+ @@bbrc.SetConsoleOut(false)
+
+ halt 404, "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil?
+ halt 404, "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil?
+ prediction_feature = params[:prediction_feature]
+
+ training_dataset = OpenTox::Dataset.find params[:dataset_uri], @subjectid
+ halt 404, "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless training_dataset.features and training_dataset.features.include?(params[:prediction_feature])
+
+ task = OpenTox::Task.create("Mining BBRC features", url_for('/fminer',:full)) do
+
+ feature_dataset = OpenTox::Dataset.new(nil, @subjectid)
+ feature_dataset.add_metadata({
+ DC.title => "BBRC representatives for " + training_dataset.metadata[DC.title].to_s,
+ DC.creator => url_for('/fminer/bbrc',:full),
+ OT.hasSource => url_for('/fminer/bbrc', :full),
+ OT.parameters => [
+ { DC.title => "dataset_uri", OT.paramValue => params[:dataset_uri] },
+ { DC.title => "prediction_feature", OT.paramValue => params[:prediction_feature] }
+ ]
+ })
+ feature_dataset.save(@subjectid)
+
+ id = 1 # fminer start id is not 0
+ compounds = []
+ nr_active=0
+ nr_inactive=0
+ all_activities = Hash.new # DV: for effect calculation in regression part
+
+ @@bbrc.Reset
+ training_dataset.data_entries.each do |compound,entry|
+ begin
+ smiles = OpenTox::Compound.new(compound.to_s).to_smiles
+ rescue
+ LOGGER.warn "No resource for #{compound.to_s}"
+ next
+ end
+ if smiles == '' or smiles.nil?
+ LOGGER.warn "Cannot find smiles for #{compound.to_s}."
+ next
+ end
+ entry.each do |feature,values|
+ values.each do |value|
+ if value.nil?
+ LOGGER.warn "No #{feature} activiity for #{compound.to_s}."
+ else
+ case value.to_s
+ when "true"
+ nr_active += 1
+ activity = 1
+ when "false"
+ nr_inactive += 1
+ activity = 0
+ else
+ activity = value.to_f
+ @@bbrc.SetRegression(true)
+ end
+ begin
+ @@bbrc.AddCompound(smiles,id)
+ @@bbrc.AddActivity(activity, id)
+ all_activities[id]=activity # DV: insert global information
+ compounds[id] = compound
+ id += 1
+ rescue
+ LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer"
+ end
+ end
+ end
+ end
+ end
+
+ g_array=all_activities.values # DV: calculation of global median for effect calculation
+ g_median=OpenTox::Algorithm.median(g_array)
+
+ raise "No compounds in dataset #{training_dataset.uri}" if compounds.size==0
+
+ features = Set.new
+ # run @@bbrc
+ (0 .. @@bbrc.GetNoRootNodes()-1).each do |j|
+
+ results = @@bbrc.MineRoot(j)
+ results.each do |result|
+ f = YAML.load(result)[0]
+ smarts = f[0]
+ p_value = f[1]
+
+ if (!@@bbrc.GetRegression)
+ ids = f[2] + f[3]
+ if f[2].size.to_f/ids.size > nr_active.to_f/(nr_active+nr_inactive)
+ effect = 'activating'
+ else
+ effect = 'deactivating'
+ end
+ else #regression part
+ ids = f[2]
+ # DV: effect calculation
+ f_arr=Array.new
+ f[2].each do |id|
+ f_arr.push(all_activities[id])
+ end
+ f_median=OpenTox::Algorithm.median(f_arr)
+ if g_median >= f_median
+ effect = 'activating'
+ else
+ effect = 'deactivating'
+ end
+ end
+
+ feature_uri = File.join feature_dataset.uri,"feature","bbrc", features.size.to_s
+ unless features.include? smarts
+ features << smarts
+ metadata = {
+ OT.hasSource => url_for('/fminer/bbrc', :full),
+ OT.isA => OT.Substructure,
+ OT.smarts => smarts,
+ OT.pValue => p_value.to_f,
+ OT.effect => effect,
+ OT.parameters => [
+ { DC.title => "dataset_uri", OT.paramValue => params[:dataset_uri] },
+ { DC.title => "prediction_feature", OT.paramValue => params[:prediction_feature] }
+ ]
+ }
+ feature_dataset.add_feature feature_uri, metadata
+ #feature_dataset.add_feature_parameters feature_uri, feature_dataset.parameters
+ end
+ ids.each { |id| feature_dataset.add(compounds[id], feature_uri, true)}
+ end
+ end
+ feature_dataset.save(@subjectid)
+ feature_dataset.uri
+ end
+ response['Content-Type'] = 'text/uri-list'
+ halt 503,task.uri+"\n" if task.status == "Cancelled"
+ halt 202,task.uri.to_s+"\n"
+ end
+#end
+
+# Run last algorithm on a dataset
+#
+# @param [String] dataset_uri URI of the training dataset
+# @param [String] prediction_feature URI of the prediction feature (i.e. dependent variable)
+# @param [optional] parameters LAST parameters; accepted parameters are
+# - min_frequency Minimum frequency (default 5)
+# - feature_type Feature type, can be 'paths' or 'trees' (default "trees")
+# - hops Maximum number of hops
+# @return [text/uri-list] Task URI
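+# @example Hypothetical job submission (a sketch, same pattern as /fminer/bbrc: URIs and parameter values are made up, assumes the rest-client gem)
+#   require 'rest-client'
+#   task_uri = RestClient.post("http://localhost:4567/fminer/last",
+#     :dataset_uri => "http://localhost:4567/dataset/1",
+#     :prediction_feature => "http://localhost:4567/dataset/1/feature/activity",
+#     :hops => 25).to_s.strip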
+post '/fminer/last/?' do
+ #@@last = Last::Last.new
+ minfreq = params[:min_frequency] ? params[:min_frequency].to_i : 5
+ @@last.SetMinfreq(minfreq)
+ @@last.SetType(1) if params[:feature_type] == "paths"
+ @@last.SetMaxHops(params[:hops].to_i) if params[:hops]
+ @@last.SetConsoleOut(false)
+
+ halt 404, "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil?
+ halt 404, "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil?
+ prediction_feature = params[:prediction_feature]
+
+ training_dataset = OpenTox::Dataset.new params[:dataset_uri], @subjectid
+
+ training_dataset.load_all(@subjectid)
+ halt 404, "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless training_dataset.features and training_dataset.features.include?(params[:prediction_feature])
+
+ task = OpenTox::Task.create("Mining LAST features", url_for('/fminer',:full)) do
+
+ feature_dataset = OpenTox::Dataset.new
+ feature_dataset.add_metadata({
+ DC.title => "LAST representatives for " + training_dataset.metadata[DC.title].to_s,
+ DC.creator => url_for('/fminer/last',:full),
+ OT.hasSource => url_for('/fminer/last', :full),
+ OT.parameters => [
+ { DC.title => "dataset_uri", OT.paramValue => params[:dataset_uri] },
+ { DC.title => "prediction_feature", OT.paramValue => params[:prediction_feature] }
+ ]
+ })
+ feature_dataset.save(@subjectid)
+
+ id = 1 # fminer start id is not 0
+ compounds = []
+ smi = [] # AM LAST: needed for matching the patterns back
+ nr_active=0
+ nr_inactive=0
+ all_activities = Hash.new # DV: for effect calculation in regression part
+
+ @@last.Reset
+ training_dataset.data_entries.each do |compound,entry|
+ begin
+ smiles = OpenTox::Compound.new(compound.to_s).to_smiles
+ rescue
+ LOGGER.warn "No resource for #{compound.to_s}"
+ next
+ end
+ if smiles == '' or smiles.nil?
+ LOGGER.warn "Cannot find smiles for #{compound.to_s}."
+ next
+ end
+ entry.each do |feature,values|
+ values.each do |value|
+ if value.nil?
+ LOGGER.warn "No #{feature} activiity for #{compound.to_s}."
+ else
+ case value.to_s
+ when "true"
+ nr_active += 1
+ activity = 1
+ when "false"
+ nr_inactive += 1
+ activity = 0
+ else
+ activity = value.to_f
+ @@last.SetRegression(true)
+ end
+ begin
+ @@last.AddCompound(smiles,id)
+ @@last.AddActivity(activity, id)
+ all_activities[id]=activity # DV: insert global information
+ compounds[id] = compound
+ smi[id] = smiles # AM LAST: changed this to store SMILES.
+ id += 1
+ rescue
+ LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer"
+ end
+ end
+ end
+ end
+ end
+
+ g_array=all_activities.values # DV: calculation of global median for effect calculation
+ g_median=OpenTox::Algorithm.median(g_array)
- halt 404, "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil?
- halt 404, "Please submit a feature_uri." unless params[:feature_uri] and !params[:feature_uri].nil?
- LOGGER.debug "Dataset: " + params[:dataset_uri]
- LOGGER.debug "Endpoint: " + params[:feature_uri]
- feature_uri = params[:feature_uri]
- begin
- LOGGER.debug "Retrieving #{params[:dataset_uri]}"
- training_dataset = OpenTox::Dataset.find "#{params[:dataset_uri]}"
- rescue
- LOGGER.error "Dataset #{params[:dataset_uri]} not found"
- halt 404, "Dataset #{params[:dataset_uri]} not found." if training_dataset.nil?
- end
- halt 404, "No feature #{params[:feature_uri]} in dataset #{params[:dataset_uri]}" unless training_dataset.features and training_dataset.features.include?(params[:feature_uri])
-
- task_uri = OpenTox::Task.as_task("Mine features", url_for('/fminer',:full)) do
-
- feature_dataset = OpenTox::Dataset.new
- title = "BBRC representatives for " + training_dataset.title
- feature_dataset.title = title
- feature_dataset.creator = url_for('/fminer',:full)
- bbrc_uri = url_for("/fminer#BBRC_representative",:full)
- feature_dataset.features << bbrc_uri
-
- id = 1 # fminer start id is not 0
- compounds = []
-
- g_hash = Hash.new# DV: for effect calculation in regression part
- @@fminer.Reset
- #@@fminer.SetChisqSig(0.99)
- LOGGER.debug "Fminer: initialising ..."
- training_dataset.data.each do |c,features|
- begin
- smiles = OpenTox::Compound.new(:uri => c.to_s).smiles
- rescue
- LOGGER.warn "No resource for #{c.to_s}"
- next
- end
- if smiles == '' or smiles.nil?
- LOGGER.warn "Cannot find smiles for #{c.to_s}."
- else
- feature_dataset.compounds << c.to_s
- features.each do |feature|
- act = feature[feature_uri]
- if act.nil?
- LOGGER.warn "No #{feature_uri} activiity for #{c.to_s}."
- else
- case act.to_s
- when "true"
- #LOGGER.debug id.to_s + ' "' + smiles +'"' + "\t" + true.to_s
- activity = 1
- when "false"
- #LOGGER.debug id.to_s + ' "' + smiles +'"' + "\t" + false.to_s
- activity = 0
- else
- # AM: add quantitative activity
- activity = act.to_f
- @@fminer.SetRegression(true)
- end
- compounds[id] = c.to_s
- begin
- @@fminer.AddCompound(smiles,id)
- @@fminer.AddActivity(activity, id)
- g_hash[id]=activity # DV: insert global information
- rescue
- LOGGER.warn "Could not add " + smiles + "\t" + act.to_s + " to fminer"
- end
- end
- end
- id += 1
- end
- end
- g_array=g_hash.values # DV: calculation of global median for effect calculation
- g_median=OpenTox::Utils.median(g_array)
- minfreq = (0.02*id).round
- @@fminer.SetMinfreq(minfreq)
- LOGGER.debug "Fminer: initialised with #{id} compounds, minimum frequency #{minfreq}"
-
- raise "no compounds" if compounds.size==0
-
- values = {}
- # run @@fminer
- LOGGER.debug "Fminer: mining ..."
- (0 .. @@fminer.GetNoRootNodes()-1).each do |j|
- results = @@fminer.MineRoot(j)
- results.each do |result|
- f = YAML.load(result)[0]
- smarts = f[0]
- p_value = f[1]
- # AM: f[3] missing on regression
- if (!@@fminer.GetRegression)
- ids = f[2] + f[3]
- if f[2].size > f[3].size
- effect = 'activating'
- else
- effect = 'deactivating'
- end
- else #regression part
- ids = f[2]
- # DV: effect calculation
- f_arr=Array.new
- f[2].each do |id|
- f_arr.push(g_hash[id])
- end
- f_median=OpenTox::Utils.median(f_arr)
- if g_median >= f_median
- effect = 'activating'
- else
- effect = 'deactivating'
- end
- end
-
- tuple = {
- url_for('/fminer#smarts',:full) => smarts,
- url_for('/fminer#p_value',:full) => p_value.to_f,
- url_for('/fminer#effect',:full) => effect
- }
- #LOGGER.debug "#{f[0]}\t#{f[1]}\t#{effect}"
- ids.each do |id|
- feature_dataset.data[compounds[id]] = [] unless feature_dataset.data[compounds[id]]
- feature_dataset.data[compounds[id]] << {bbrc_uri => tuple}
- end
- end
- end
-
- uri = feature_dataset.save
- LOGGER.debug "Fminer finished, dataset #{uri} created."
- uri
- end
- LOGGER.debug "Fimer task started: "+task_uri.to_s
- response['Content-Type'] = 'text/uri-list'
- halt 202,task_uri.to_s+"\n"
+ raise "No compounds in dataset #{training_dataset.uri}" if compounds.size==0
+
+ # run @@last
+ features = Set.new
+ xml = ""
+
+ (0 .. @@last.GetNoRootNodes()-1).each do |j|
+ results = @@last.MineRoot(j)
+ results.each do |result|
+ xml << result
+ end
+ end
+
+ lu = LU.new # AM LAST: uses last-utils here
+ dom=lu.read(xml) # AM LAST: parse GraphML (needs hpricot, @ch: to be included in wrapper!)
+ smarts=lu.smarts_rb(dom,'msa') # AM LAST: converts patterns to LAST-SMARTS using msa variant (see last-pm.maunz.de)
+ instances=lu.match_rb(smi,smarts) # AM LAST: creates instantiations
+ instances.each do |smarts, ids|
+ feat_hash = Hash[*(all_activities.select { |k,v| ids.include?(k) }.flatten)] # AM LAST: get activities of feature occurrences; see http://www.softiesonrails.com/2007/9/18/ruby-201-weird-hash-syntax
+ @@last.GetRegression() ? p_value = @@last.KSTest(all_activities.values, feat_hash.values).to_f : p_value = @@last.ChisqTest(all_activities.values, feat_hash.values).to_f # AM LAST: use internal function for test
+
+ effect = (p_value > 0) ? "activating" : "deactivating"
+ feature_uri = File.join feature_dataset.uri,"feature","last", features.size.to_s
+ unless features.include? smarts
+ features << smarts
+ metadata = {
+ OT.isA => OT.Substructure,
+ OT.hasSource => feature_dataset.uri,
+ OT.smarts => smarts,
+ OT.pValue => p_value.to_f,
+ OT.effect => effect,
+ OT.parameters => [
+ { DC.title => "dataset_uri", OT.paramValue => params[:dataset_uri] },
+ { DC.title => "prediction_feature", OT.paramValue => params[:prediction_feature] }
+ ]
+ }
+ feature_dataset.add_feature feature_uri, metadata
+ end
+ ids.each { |id| feature_dataset.add(compounds[id], feature_uri, true)}
+ end
+ feature_dataset.save(@subjectid)
+ feature_dataset.uri
+ end
+ response['Content-Type'] = 'text/uri-list'
+ halt 503,task.uri+"\n" if task.status == "Cancelled"
+ halt 202,task.uri.to_s+"\n"
end