# [algorithm] / fminer.rb
# HTTP routes exposing the fminer pattern-mining algorithms (BBRC, BBRC sample, LAST, match).
# Environment flags for the fminer library; each one is set to the string 'true'.
[
  'FMINER_SMARTS',
  'FMINER_NO_AROMATIC',
  'FMINER_PVALUES',
  'FMINER_SILENT',
  'FMINER_NR_HITS'
].each { |flag| ENV[flag] = 'true' }

# Miner instances shared by the route handlers in this file.
@@bbrc = Bbrc::Bbrc.new
@@last = Last::Last.new
9
10
11
# List the fminer algorithms offered by this service.
#
# The list is rendered as HTML when the Accept header mentions text/html,
# otherwise it is returned as a plain URI list.
#
# @return [text/uri-list] URIs of fminer algorithms, one per line
get '/fminer/?' do
  algorithm_paths = [
    '/fminer/bbrc',
    '/fminer/bbrc/sample',
    '/fminer/last',
    '/fminer/bbrc/match',
    '/fminer/last/match'
  ]
  # Every URI, including the last one, is terminated by a newline.
  uri_list = algorithm_paths.map { |path| "#{url_for(path, :full)}\n" }.join

  accept = request.env['HTTP_ACCEPT']
  if accept =~ /text\/html/
    content_type "text/html"
    OpenTox.text_to_html uri_list
  else
    content_type 'text/uri-list'
    uri_list
  end
end
26
27
28
# Describe the fminer BBRC algorithm (metadata and accepted parameters).
#
# Representation depends on the Accept header: HTML for text/html, YAML for
# application/x-yaml, RDF/XML otherwise.
#
# @return [application/rdf+xml] OWL-DL representation of fminer bbrc algorithm
get "/fminer/bbrc/?" do
  # Builds one parameter entry; keys are inserted as description, scope, title.
  param = lambda do |title, scope, description|
    { DC.description => description, OT.paramScope => scope, DC.title => title }
  end

  algorithm = OpenTox::Algorithm::Generic.new(url_for('/fminer/bbrc',:full))
  algorithm.metadata = {
    DC.title => 'fminer backbone refinement class representatives',
    DC.creator => "andreas@maunz.de, helma@in-silico.ch",
    DC.contributor => "vorgrimmlerdavid@gmx.de",
#    BO.instanceOf => "http://opentox.org/ontology/ist-algorithms.owl#fminer_bbrc",
    RDF.type => [OT.Algorithm,OTA.PatternMiningSupervised],
    OT.parameters => [
      param.call("dataset_uri", "mandatory", "Dataset URI"),
      param.call("prediction_feature", "mandatory", "Feature URI for dependent variable"),
      param.call("min_frequency", "optional", "Minimum frequency"),
      param.call("feature_type", "optional", "Feature type, can be 'paths' or 'trees'"),
      param.call("backbone", "optional", "BBRC classes, pass 'false' to switch off mining for BBRC representatives."),
      param.call("min_chisq_significance", "optional", "Significance threshold (between 0 and 1)"),
      param.call("nr_hits", "optional", "Whether subgraphs should be weighted with their occurrence counts in the instances (frequency)")
    ]
  }

  accept = request.env['HTTP_ACCEPT']
  if accept =~ /text\/html/
    content_type "text/html"
    OpenTox.text_to_html algorithm.to_yaml
  elsif accept =~ /application\/x-yaml/
    content_type "application/x-yaml"
    algorithm.to_yaml
  else
    response['Content-Type'] = 'application/rdf+xml'
    algorithm.to_rdfxml
  end
end
61
# Describe the fminer BBRC sample algorithm (metadata and accepted parameters).
#
# Representation depends on the Accept header: HTML for text/html, YAML for
# anything mentioning yaml, RDF/XML otherwise.
#
# @return [application/rdf+xml] OWL-DL representation of fminer bbrc sample algorithm
get "/fminer/bbrc/sample/?" do
  # Builds one parameter entry; keys are inserted as description, scope, title.
  param = lambda do |title, scope, description|
    { DC.description => description, OT.paramScope => scope, DC.title => title }
  end

  algorithm = OpenTox::Algorithm::Generic.new(url_for('/fminer/bbrc/sample',:full))
  algorithm.metadata = {
    DC.title => 'fminer backbone refinement class representatives, obtained from samples of a dataset',
    DC.creator => "andreas@maunz.de",
#    BO.instanceOf => "http://opentox.org/ontology/ist-algorithms.owl#fminer_bbrc",
    RDF.type => [OT.Algorithm,OTA.PatternMiningSupervised],
    OT.parameters => [
      param.call("dataset_uri", "mandatory", "Dataset URI"),
      param.call("prediction_feature", "mandatory", "Feature URI for dependent variable"),
      param.call("num_boots", "optional", "Number of bootstrap samples"),
      param.call("min_sampling_support", "optional", "Minimum sampling support"),
      param.call("min_frequency", "optional", "Minimum frequency"),
      param.call("nr_hits", "optional", "Whether subgraphs should be weighted with their occurrence counts in the instances (frequency)"),
      param.call("backbone", "optional", "BBRC classes, pass 'false' to switch off mining for BBRC representatives."),
      param.call("method", "optional", "Chisq estimation method, pass 'mean' to use simple mean estimate for chisq test.")
    ]
  }

  accept = request.env['HTTP_ACCEPT']
  if accept =~ /text\/html/
    content_type "text/html"
    OpenTox.text_to_html algorithm.to_yaml
  elsif accept =~ /yaml/
    content_type "application/x-yaml"
    algorithm.to_yaml
  else
    response['Content-Type'] = 'application/rdf+xml'
    algorithm.to_rdfxml
  end
end
94
# Describe the fminer LAST algorithm (metadata and accepted parameters).
#
# Representation depends on the Accept header: HTML for text/html, YAML for
# application/x-yaml, RDF/XML otherwise.
#
# @return [application/rdf+xml] OWL-DL representation of fminer last algorithm
get "/fminer/last/?" do
  # Builds one parameter entry; keys are inserted as description, scope, title.
  param = lambda do |title, scope, description|
    { DC.description => description, OT.paramScope => scope, DC.title => title }
  end

  algorithm = OpenTox::Algorithm::Generic.new(url_for('/fminer/last',:full))
  algorithm.metadata = {
    DC.title => 'fminer latent structure class representatives',
    DC.creator => "andreas@maunz.de, helma@in-silico.ch",
    DC.contributor => "vorgrimmlerdavid@gmx.de",
#    BO.instanceOf => "http://opentox.org/ontology/ist-algorithms.owl#fminer_last",
    RDF.type => [OT.Algorithm,OTA.PatternMiningSupervised],
    OT.parameters => [
      param.call("dataset_uri", "mandatory", "Dataset URI"),
      param.call("prediction_feature", "mandatory", "Feature URI for dependent variable"),
      param.call("min_frequency", "optional", "Minimum frequency"),
      param.call("feature_type", "optional", "Feature type, can be 'paths' or 'trees'"),
      param.call("nr_hits", "optional", "Whether subgraphs should be weighted with their occurrence counts in the instances (frequency)")
    ]
  }

  accept = request.env['HTTP_ACCEPT']
  if accept =~ /text\/html/
    content_type "text/html"
    OpenTox.text_to_html algorithm.to_yaml
  elsif accept =~ /application\/x-yaml/
    content_type "application/x-yaml"
    algorithm.to_yaml
  else
    response['Content-Type'] = 'application/rdf+xml'
    algorithm.to_rdfxml
  end
end
125
126
# Describe the fminer feature matching algorithm (metadata and accepted parameters).
#
# The :method path segment is only used to build the algorithm URI.
# Representation depends on the Accept header: HTML for text/html, YAML for
# application/x-yaml, RDF/XML otherwise.
#
# Advertised parameters:
# - dataset_uri (mandatory) Dataset URI
# - feature_dataset_uri (mandatory) Feature Dataset URI
# - prediction_feature (optional) Feature URI for dependent variable
#
# @return [application/rdf+xml] OWL-DL representation of fminer matching algorithm
get "/fminer/:method/match?" do
  # Builds one parameter entry; keys are inserted as description, scope, title.
  param = lambda do |title, scope, description|
    { DC.description => description, OT.paramScope => scope, DC.title => title }
  end

  algorithm = OpenTox::Algorithm::Generic.new(url_for("/fminer/#{params[:method]}/match",:full))
  algorithm.metadata = {
    DC.title => 'fminer feature matching',
    DC.creator => "mguetlein@gmail.com, andreas@maunz.de",
    RDF.type => [OT.Algorithm,OTA.PatternMiningSupervised],
    OT.parameters => [
      param.call("dataset_uri", "mandatory", "Dataset URI"),
      param.call("feature_dataset_uri", "mandatory", "Feature Dataset URI"),
      param.call("prediction_feature", "optional", "Feature URI for dependent variable")
    ]
  }

  accept = request.env['HTTP_ACCEPT']
  if accept =~ /text\/html/
    content_type "text/html"
    OpenTox.text_to_html algorithm.to_yaml
  elsif accept =~ /application\/x-yaml/
    content_type "application/x-yaml"
    algorithm.to_yaml
  else
    response['Content-Type'] = 'application/rdf+xml'
    algorithm.to_rdfxml
  end
end
156
157
158
159
# Run bbrc algorithm on dataset
#
# Mining happens inside an OpenTox task; the request itself answers with
# status 202 and the task URI.
#
# @param [String] dataset_uri URI of the training dataset
# @param [String] prediction_feature URI of the prediction feature (i.e. dependent variable)
# @param [optional] parameters BBRC parameters, accepted parameters are
#   - min_frequency  Minimum frequency (default 5)
#   - feature_type Feature type, can be 'paths' or 'trees' (default "trees")
#   - backbone BBRC classes, pass 'false' to switch off mining for BBRC representatives. (default "true")
#   - min_chisq_significance Significance threshold (between 0 and 1)
#   - nr_hits Set to "true" to get hit count instead of presence
# @return [text/uri-list] Task URI
# @raise [OpenTox::ServiceUnavailableError] if the task status is "Cancelled"
post '/fminer/bbrc/?' do

  fminer=OpenTox::Algorithm::Fminer.new
  fminer.check_params(params,5,@subjectid)

  task = OpenTox::Task.create("Mining BBRC features", url_for('/fminer',:full)) do |task|
    @@bbrc.Reset
    if fminer.prediction_feature.feature_type == "regression"
      @@bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations!
    else
      raise "no accept values for dataset '"+fminer.training_dataset.uri.to_s+"' and feature '"+fminer.prediction_feature.uri.to_s+
        "'" unless fminer.training_dataset.accept_values(fminer.prediction_feature.uri)
      @value_map=fminer.training_dataset.value_map(fminer.prediction_feature.uri)
    end
    @@bbrc.SetMinfreq(fminer.minfreq)
    @@bbrc.SetType(1) if params[:feature_type] == "paths"
    @@bbrc.SetBackbone(false) if params[:backbone] == "false"
    @@bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance]
    @@bbrc.SetConsoleOut(false)

    # Create and save the (still empty) result dataset first so that its URI
    # is available for building feature URIs below.
    feature_dataset = OpenTox::Dataset.new(nil, @subjectid)
    feature_dataset.add_metadata({
      DC.title => "BBRC representatives for " + fminer.training_dataset.metadata[DC.title].to_s,
      DC.creator => url_for('/fminer/bbrc',:full),
      OT.hasSource => url_for('/fminer/bbrc', :full),
      OT.parameters => [
        { DC.title => "dataset_uri", OT.paramValue => params[:dataset_uri] },
        { DC.title => "prediction_feature", OT.paramValue => params[:prediction_feature] },
        { DC.title => "min_frequency", OT.paramValue => fminer.minfreq },
        { DC.title => "nr_hits", OT.paramValue => (params[:nr_hits] == "true" ? "true" : "false") },
        { DC.title => "backbone", OT.paramValue => (params[:backbone] == "false" ? "false" : "true") }
      ]
    })
    feature_dataset.save(@subjectid)

    fminer.compounds = []
    fminer.db_class_sizes = Array.new # AM: effect
    fminer.all_activities = Hash.new # DV: for effect calculation in regression part
    fminer.smi = [] # AM LAST: needed for matching the patterns back

    # Add data to fminer
    fminer.add_fminer_data(@@bbrc, @value_map)

    g_array=fminer.all_activities.values # DV: calculation of global median for effect calculation
    g_median=g_array.to_scale.median

    raise "No compounds in dataset #{fminer.training_dataset.uri}" if fminer.compounds.size==0
    task.progress 10
    # Progress 10..90 is spread evenly over the root nodes.
    step_width = 80 / @@bbrc.GetNoRootNodes().to_f
    features = Set.new

    # run @@bbrc
    (0 .. @@bbrc.GetNoRootNodes()-1).each do |j|
      results = @@bbrc.MineRoot(j)
      task.progress 10+step_width*(j+1)
      results.each do |result|
        # Each result is a YAML document; its first element is read as
        # [smarts, p_value, occurrence data...].
        f = YAML.load(result)[0]
        smarts = f[0]
        p_value = f[1]

        if (!@@bbrc.GetRegression)
          id_arrs = f[2..-1].flatten
          max = OpenTox::Algorithm.effect(f[2..-1].reverse, fminer.db_class_sizes) # f needs reversal for bbrc
          effect = max+1
        else #regression part
          id_arrs = f[2]
          # DV: effect calculation
          f_arr=Array.new
          f[2].each do |id|
            id=id.keys[0] # extract id from hit count hash
            f_arr.push(fminer.all_activities[id])
          end
          f_median=f_arr.to_scale.median
          if g_median >= f_median
            effect = 'activating'
          else
            effect = 'deactivating'
          end
        end

        # NOTE(review): the URI index is taken from features.size before the
        # duplicate check; if a SMARTS were reported twice, the values below
        # would be attached to a URI that was never registered -- confirm
        # duplicates cannot occur.
        feature_uri = File.join feature_dataset.uri,"feature","bbrc", features.size.to_s
        unless features.include? smarts
          features << smarts
          metadata = {
            OT.hasSource => url_for('/fminer/bbrc', :full),
            RDF.type => [OT.Feature, OT.Substructure],
            OT.smarts => smarts,
            OT.pValue => p_value.to_f,
            OT.effect => effect,
            OT.parameters => [
              { DC.title => "dataset_uri", OT.paramValue => params[:dataset_uri] },
              { DC.title => "prediction_feature", OT.paramValue => params[:prediction_feature] }
            ]
          }
          feature_dataset.add_feature feature_uri, metadata
          #feature_dataset.add_feature_parameters feature_uri, feature_dataset.parameters
        end
        # Each entry is a single-pair hash { compound id => hit count }.
        id_arrs.each { |id_count_hash|
          id=id_count_hash.keys[0].to_i
          count=id_count_hash.values[0].to_i
          if params[:nr_hits] == "true"
            feature_dataset.add(fminer.compounds[id], feature_uri, count)
          else
            feature_dataset.add(fminer.compounds[id], feature_uri, 1)
          end
        }

      end # end of
    end   # feature parsing

    # AM: add feature values for non-present features
    # feature_dataset.complete_data_entries

    feature_dataset.save(@subjectid)
    feature_dataset.uri
  end
  response['Content-Type'] = 'text/uri-list'
  # Fixed: the original read "ServiceUnavailableError.newtask.uri" (missing
  # space), which raised NoMethodError instead of the intended error.
  raise OpenTox::ServiceUnavailableError.new(task.uri.to_s+"\n") if task.status == "Cancelled"
  halt 202,task.uri.to_s+"\n"
end
292 #end
293
294
# Run bbrc/sample algorithm on a dataset
#
# Parameters are validated up front; the mining itself runs inside an OpenTox
# task (via an R script), and the request answers with status 202 and the
# task URI.
#
# @param [String] dataset_uri URI of the training dataset
# @param [String] prediction_feature URI of the prediction feature (i.e. dependent variable)
# @param [optional] BBRC sample parameters, accepted are
#   - num_boots Number of bootstrap samples (default 150)
#   - min_sampling_support Minimum sampling support (default 30% of num_boots)
#   - min_frequency  Minimum frequency (default 10% of dataset size)
#   - nr_hits Whether subgraphs should be weighted with their occurrence counts in the instances (frequency)
#   - random_seed Random seed ensures same datasets in bootBbrc (default 1)
#   - backbone BBRC classes, pass 'false' to switch off mining for BBRC representatives. (default "true")
#   - method Chisq estimation method, pass 'mean' to use simple mean estimate (default 'mle').
#   - cache Whether cache files should be used for the combination of dataset, min_frequency, backbone, random seed (default "false")
#
# @return [text/uri-list] Task URI
# @raise [OpenTox::BadRequestError] if a numeric parameter is not numeric, or
#   backbone / method has an unsupported value
# @raise [OpenTox::ServiceUnavailableError] if the task status is "Cancelled"
post '/fminer/bbrc/sample/?' do

  fminer=OpenTox::Algorithm::Fminer.new
  fminer.check_params(params,100,@subjectid) # AM: 100 per-mil (10%) as default minfreq

  # num_boots
  unless params[:num_boots]
    num_boots = 150
    LOGGER.debug "Set num_boots to default value #{num_boots}"
  else
    raise OpenTox::BadRequestError.new "num_boots is not numeric" unless OpenTox::Algorithm.numeric? params[:num_boots]
    num_boots = params[:num_boots].to_i.ceil
  end

  # min_sampling_support
  unless params[:min_sampling_support]
    min_sampling_support = (num_boots * 0.3).ceil
    LOGGER.debug "Set min_sampling_support to default value #{min_sampling_support}"
  else
    raise OpenTox::BadRequestError.new "min_sampling_support is not numeric" unless OpenTox::Algorithm.numeric? params[:min_sampling_support]
    min_sampling_support= params[:min_sampling_support].to_i.ceil
  end

  # random_seed
  unless params[:random_seed]
    random_seed = 1
    LOGGER.debug "Set random seed to default value #{random_seed}"
  else
    raise OpenTox::BadRequestError.new "random_seed is not numeric" unless OpenTox::Algorithm.numeric? params[:random_seed]
    random_seed= params[:random_seed].to_i.ceil
  end

  # backbone
  unless params[:backbone]
    backbone = "true"
    LOGGER.debug "Set backbone to default value #{backbone}"
  else
    raise OpenTox::BadRequestError.new "backbone is neither 'true' nor 'false'" unless (params[:backbone] == "true" or params[:backbone] == "false")
    backbone = params[:backbone]
  end

  # method
  unless params[:method]
    method="mle"
    LOGGER.debug "Set method to default value #{method}"
  else
    raise OpenTox::BadRequestError.new "method is neither 'mle' nor 'mean'" unless (params[:method] == "mle" or params[:method] == "mean")
    method = params[:method]
  end

  # cache
  cache=false
  if params[:cache] == "true"
    cache=true
    LOGGER.debug "Set cache to true"
  end


  task = OpenTox::Task.create("Mining BBRC sample features", url_for('/fminer',:full)) do |task|
    if fminer.prediction_feature.feature_type == "regression"
      raise OpenTox::BadRequestError.new "BBRC sampling is only for classification"
    else
      raise "no accept values for dataset '"+fminer.training_dataset.uri.to_s+"' and feature '"+fminer.prediction_feature.uri.to_s+
        "'" unless fminer.training_dataset.accept_values(fminer.prediction_feature.uri)
      @value_map=fminer.training_dataset.value_map(fminer.prediction_feature.uri)
    end

    # Create and save the (still empty) result dataset first so that its URI
    # is available for building feature URIs below.
    feature_dataset = OpenTox::Dataset.new(nil, @subjectid)
    feature_dataset.add_metadata({
      DC.title => "BBRC representatives for " + fminer.training_dataset.metadata[DC.title].to_s + "(bootstrapped)",
      DC.creator => url_for('/fminer/bbrc/sample',:full),
      OT.hasSource => url_for('/fminer/bbrc/sample', :full)
    })
    feature_dataset.save(@subjectid)

    # filled by add_fminer_data:
    fminer.compounds = [] # indexed by id, starting from 1 (not 0)
    fminer.db_class_sizes = Array.new # for effect calculation
    fminer.all_activities = Hash.new # for effect calculation, indexed by id, starting from 1 (not 0)
    fminer.smi = [] # needed for matching the patterns back, indexed by id, starting from 1 (not 0)
    fminer.add_fminer_data(nil, @value_map) # To only fill in administrative data (no fminer priming) pass 'nil' as instance

    raise "No compounds in dataset #{fminer.training_dataset.uri}" if fminer.compounds.size==0


    # run bbrc-sample, obtain smarts and p-values
    features = Set.new
    task.progress 10
    @r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests
    begin
      @r.assign "dataset.uri", params[:dataset_uri]
      @r.assign "prediction.feature.uri", fminer.prediction_feature.uri
      @r.assign "num.boots", num_boots
      @r.assign "min.frequency.per.sample", fminer.minfreq
      @r.assign "min.sampling.support", min_sampling_support
      @r.assign "random.seed", random_seed
      @r.assign "backbone", backbone
      @r.assign "bbrc.service", File.join(CONFIG[:services]["opentox-algorithm"], "fminer/bbrc")
      @r.assign "dataset.service", CONFIG[:services]["opentox-dataset"]
      @r.assign "method", method

      require 'digest/md5'
      # NOTE(review): shift mutates fminer.smi in place (drops the first
      # element), and the shifted array is what match_rb receives below, while
      # fminer.compounds stays as filled by add_fminer_data -- confirm the ids
      # returned by match_rb still line up.
      fminer.smi.shift
      # Cache key derived from the compound SMILES and the mining settings.
      cachedId = Digest::MD5.hexdigest(
        fminer.smi.sort.join+
        num_boots.to_s+
        fminer.minfreq.to_s+
        random_seed.to_s+
        backbone.to_s
      )
      @r.assign "cachedId", cachedId
      @r.eval "cachedId <- NULL" unless cache

      @r.eval "source(\"bbrc-sample/bbrc-sample.R\")"
      begin
        @r.eval "bootBbrc(dataset.uri, prediction.feature.uri, num.boots, min.frequency.per.sample, min.sampling.support, cachedId, bbrc.service, dataset.service, T, random.seed, as.logical(backbone), method)"
        smarts = (@r.pull "ans.patterns").collect! { |id| id.gsub(/\'/,"") } # remove extra quotes around smarts
        r_p_values = @r.pull "ans.p.values"
        smarts_p_values = {}; smarts.size.times { |i| smarts_p_values[ smarts[i] ] = r_p_values[i] }
        merge_time = @r.pull "merge.time"
        n_stripped_mss = @r.pull "n.stripped.mss"
        n_stripped_cst = @r.pull "n.stripped.cst"
      rescue Exception => e
        # Failures of the R run are only logged; the variables assigned above
        # stay nil in that case and processing continues.
        LOGGER.debug "#{e.class}: #{e.message}"
        LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
      end
    ensure
      # Free R even when assigning variables or sourcing the script raised;
      # previously such a failure skipped the quit call.
      @r.quit
    end

    # matching
    task.progress 90
    lu = LU.new                             # AM LAST: uses last-utils here
    params[:nr_hits] == "true" ? hit_count=true: hit_count=false
    matches, counts = lu.match_rb(fminer.smi,smarts,hit_count)       # AM LAST: creates instantiations

    feature_dataset.add_metadata({
      OT.parameters => [
        { DC.title => "dataset_uri", OT.paramValue => params[:dataset_uri] },
        { DC.title => "prediction_feature", OT.paramValue => params[:prediction_feature] },
        { DC.title => "min_sampling_support", OT.paramValue => min_sampling_support },
        { DC.title => "num_boots", OT.paramValue => num_boots },
        { DC.title => "min_frequency_per_sample", OT.paramValue => fminer.minfreq },
        { DC.title => "nr_hits", OT.paramValue => hit_count.to_s },
        { DC.title => "merge_time", OT.paramValue => merge_time.to_s },
        { DC.title => "n_stripped_mss", OT.paramValue => n_stripped_mss.to_s },
        { DC.title => "n_stripped_cst", OT.paramValue => n_stripped_cst.to_s },
        { DC.title => "random_seed", OT.paramValue => random_seed.to_s },
        { DC.title => "backbone", OT.paramValue => backbone.to_s },
        { DC.title => "method", OT.paramValue => method.to_s }
      ]
    })

    matches.each do |smarts, ids|
      feat_hash = Hash[*(fminer.all_activities.select { |k,v| ids.include?(k) }.flatten)] # AM LAST: get activities of feature occurrences; see http://www.softiesonrails.com/2007/9/18/ruby-201-weird-hash-syntax
      # Group the matching compound ids by activity class (slot = class - 1).
      g = Array.new
      @value_map.each { |y,act| g[y-1]=Array.new }
      feat_hash.each  { |x,y|   g[y-1].push(x)   }
      max = OpenTox::Algorithm.effect(g, fminer.db_class_sizes)
      effect = max + 1
      feature_uri = File.join feature_dataset.uri,"feature","bbrc", features.size.to_s
      unless features.include? smarts
        features << smarts
        metadata = {
          RDF.type => [OT.Feature, OT.Substructure],
          OT.hasSource => feature_dataset.uri,
          OT.smarts => smarts,
          OT.pValue => smarts_p_values[smarts],
          OT.effect => effect,
          OT.parameters => [
            { DC.title => "dataset_uri", OT.paramValue => params[:dataset_uri] },
            { DC.title => "prediction_feature", OT.paramValue => params[:prediction_feature] }
          ]
        }
        feature_dataset.add_feature feature_uri, metadata
      end
      if !hit_count
        ids.each { |id| feature_dataset.add(fminer.compounds[id], feature_uri, 1)}
      else
        ids.each_with_index { |id,i| feature_dataset.add(fminer.compounds[id], feature_uri, counts[smarts][i])}
      end
    end

    # AM: add feature values for non-present features
    # feature_dataset.complete_data_entries

    feature_dataset.save(@subjectid)
    feature_dataset.uri
  end
  response['Content-Type'] = 'text/uri-list'
  # Fixed: the original read "ServiceUnavailableError.newtask.uri" (missing
  # space), which raised NoMethodError instead of the intended error.
  raise OpenTox::ServiceUnavailableError.new(task.uri.to_s+"\n") if task.status == "Cancelled"
  halt 202,task.uri.to_s+"\n"
end
500
# Run last algorithm on a dataset
#
# Mining happens inside an OpenTox task; the request itself answers with
# status 202 and the task URI.
#
# @param [String] dataset_uri URI of the training dataset
# @param [String] prediction_feature URI of the prediction feature (i.e. dependent variable)
# @param [optional] parameters LAST parameters, accepted parameters are
#   - min_frequency freq  Minimum frequency (default 5)
#   - feature_type Feature type, can be 'paths' or 'trees' (default "trees")
#   - nr_hits Set to "true" to get hit count instead of presence
# @return [text/uri-list] Task URI
# @raise [OpenTox::ServiceUnavailableError] if the task status is "Cancelled"
post '/fminer/last/?' do

  fminer=OpenTox::Algorithm::Fminer.new
  # NOTE(review): the header above states a default of 5, but 80 is what is
  # passed to check_params here -- confirm which one is intended.
  fminer.check_params(params,80,@subjectid)

  task = OpenTox::Task.create("Mining LAST features", url_for('/fminer',:full)) do |task|
    @@last.Reset
    if fminer.prediction_feature.feature_type == "regression"
      @@last.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations!
    else
      raise "no accept values for dataset '"+fminer.training_dataset.uri.to_s+"' and feature '"+fminer.prediction_feature.uri.to_s+
        "'" unless fminer.training_dataset.accept_values(fminer.prediction_feature.uri)
      @value_map=fminer.training_dataset.value_map(fminer.prediction_feature.uri)
    end
    @@last.SetMinfreq(fminer.minfreq)
    @@last.SetType(1) if params[:feature_type] == "paths"
    @@last.SetConsoleOut(false)


    # Create and save the (still empty) result dataset first so that its URI
    # is available for building feature URIs below.
    feature_dataset = OpenTox::Dataset.new(nil, @subjectid)
    feature_dataset.add_metadata({
      DC.title => "LAST representatives for " + fminer.training_dataset.metadata[DC.title].to_s,
      DC.creator => url_for('/fminer/last',:full),
      OT.hasSource => url_for('/fminer/last', :full),
      OT.parameters => [
        { DC.title => "dataset_uri", OT.paramValue => params[:dataset_uri] },
        { DC.title => "prediction_feature", OT.paramValue => params[:prediction_feature] },
        { DC.title => "min_frequency", OT.paramValue => fminer.minfreq },
        { DC.title => "nr_hits", OT.paramValue => (params[:nr_hits] == "true" ? "true" : "false") }
      ]
    })
    feature_dataset.save(@subjectid)

    fminer.compounds = []
    fminer.db_class_sizes = Array.new # AM: effect
    fminer.all_activities = Hash.new # DV: for effect calculation (class and regr)
    fminer.smi = [] # AM LAST: needed for matching the patterns back

    # Add data to fminer
    fminer.add_fminer_data(@@last, @value_map)

    raise "No compounds in dataset #{fminer.training_dataset.uri}" if fminer.compounds.size==0

    # run @@last
    features = Set.new
    xml = ""
    task.progress 10
    # Progress 10..90 is spread evenly over the root nodes.
    step_width = 80 / @@last.GetNoRootNodes().to_f

    # Concatenate the mining output of all root nodes into one document.
    (0 .. @@last.GetNoRootNodes()-1).each do |j|
      results = @@last.MineRoot(j)
      task.progress 10+step_width*(j+1)
      results.each do |result|
        xml << result
      end
    end

    lu = LU.new                             # AM LAST: uses last-utils here
    dom=lu.read(xml)                        # AM LAST: parse GraphML
    smarts=lu.smarts_rb(dom,'nls')          # AM LAST: converts patterns to LAST-SMARTS using msa variant (see last-pm.maunz.de)
    params[:nr_hits] == "true" ? hit_count=true: hit_count=false
    matches, counts = lu.match_rb(fminer.smi,smarts,hit_count)       # AM LAST: creates instantiations

    matches.each do |smarts, ids|
      feat_hash = Hash[*(fminer.all_activities.select { |k,v| ids.include?(k) }.flatten)] # AM LAST: get activities of feature occurrences; see http://www.softiesonrails.com/2007/9/18/ruby-201-weird-hash-syntax
      if @@last.GetRegression()
        p_value = @@last.KSTest(fminer.all_activities.values, feat_hash.values).to_f # AM LAST: use internal function for test
        # The sign of the test result selects the effect; its absolute value
        # is stored as the p-value below.
        effect = (p_value > 0) ? "activating" : "deactivating"
      else
        p_value = @@last.ChisqTest(fminer.all_activities.values, feat_hash.values).to_f
        # Group the matching compound ids by activity class (slot = class - 1).
        g=Array.new
        @value_map.each { |y,act| g[y-1]=Array.new }
        feat_hash.each  { |x,y|   g[y-1].push(x)   }
        max = OpenTox::Algorithm.effect(g, fminer.db_class_sizes)
        effect = max+1
      end
      feature_uri = File.join feature_dataset.uri,"feature","last", features.size.to_s
      unless features.include? smarts
        features << smarts
        metadata = {
          RDF.type => [OT.Feature, OT.Substructure],
          OT.hasSource => feature_dataset.uri,
          OT.smarts => smarts,
          OT.pValue => p_value.abs,
          OT.effect => effect,
          OT.parameters => [
            { DC.title => "dataset_uri", OT.paramValue => params[:dataset_uri] },
            { DC.title => "prediction_feature", OT.paramValue => params[:prediction_feature] }
          ]
        }
        feature_dataset.add_feature feature_uri, metadata
      end
      if !hit_count
        ids.each { |id| feature_dataset.add(fminer.compounds[id], feature_uri, 1)}
      else
        ids.each_with_index { |id,i| feature_dataset.add(fminer.compounds[id], feature_uri, counts[smarts][i])}
      end
    end

    # AM: add feature values for non-present features
    # feature_dataset.complete_data_entries

    feature_dataset.save(@subjectid)
    feature_dataset.uri
  end
  response['Content-Type'] = 'text/uri-list'
  # Fixed: the original read "ServiceUnavailableError.newtask.uri" (missing
  # space), which raised NoMethodError instead of the intended error.
  raise OpenTox::ServiceUnavailableError.new(task.uri.to_s+"\n") if task.status == "Cancelled"
  halt 202,task.uri.to_s+"\n"
end
619
# Matches features of a feature dataset onto instances of another dataset.
# The latter is referred to as 'training dataset', since p-values are computed
# if the user passes a prediction feature, or if the training dataset has only one feature.
# The result does not contain the prediction feature.
# @param [String] dataset_uri URI of the dataset
# @param [String] feature_dataset_uri URI of the feature dataset (its features carry the SMARTS to match)
# @param [optional] parameters Accepted parameters are
# - prediction_feature URI of prediction feature to calculate p-values for
# - nr_hits Set to "true" to request hit counts from the matcher
# @return [text/uri-list] Task URI
# @raise [OpenTox::BadRequestError] if feature_dataset_uri or dataset_uri is missing
post '/fminer/:method/match?' do
  raise OpenTox::BadRequestError.new "feature_dataset_uri not given" unless params[:feature_dataset_uri]
  raise OpenTox::BadRequestError.new "dataset_uri not given" unless params[:dataset_uri]

  training_dataset = OpenTox::Dataset.find "#{params[:dataset_uri]}",@subjectid
  unless params[:prediction_feature] # try to read prediction_feature from dataset
    prediction_feature = OpenTox::Feature.find(training_dataset.features.keys.first,@subjectid) if training_dataset.features.size == 1
  end
  prediction_feature = OpenTox::Feature.find(params[:prediction_feature],@subjectid) if params[:prediction_feature]

  task = OpenTox::Task.create("Matching features", url_for('/fminer/match',:full)) do |task|

    # get endpoint statistics
    if prediction_feature
      db_class_sizes = Array.new # for effect calculation
      all_activities = Hash.new # for effect calculation, indexed by id, starting from 1 (not 0)
      id = 1
      training_dataset.compounds.each do |compound|
        entry=training_dataset.data_entries[compound]
        entry.each do |feature,values|
          if feature == prediction_feature.uri
            values.each { |val|
              if val.nil?
                LOGGER.warn "No #{feature} activity for #{compound.to_s}."
              else
                if prediction_feature.feature_type == "classification"
                  activity= training_dataset.value_map(prediction_feature.uri).invert[val].to_i # activities are mapped to 1..n
                  db_class_sizes[activity-1].nil? ? db_class_sizes[activity-1]=1 : db_class_sizes[activity-1]+=1 # AM effect
                elsif prediction_feature.feature_type == "regression"
                  activity= val.to_f
                end
                begin
                  all_activities[id]=activity # DV: insert global information
                  id += 1
                rescue Exception => e
                  # Fixed: the original message used an undefined local
                  # `smiles`, which would have raised NameError while logging.
                  LOGGER.warn "Could not add " + compound.to_s + "\t" + val.to_s + " to fminer"
                  LOGGER.warn e.backtrace
                end
              end
            }
          end
        end
      end
    end

    # Initialize result by adding compounds
    f_dataset = OpenTox::Dataset.find params[:feature_dataset_uri],@subjectid
    c_dataset = OpenTox::Dataset.find params[:dataset_uri],@subjectid
    res_dataset = OpenTox::Dataset.create CONFIG[:services]["dataset"],@subjectid
    c_dataset.compounds.each do |c|
      res_dataset.add_compound(c)
    end

    # Run matching, put data entries in result. Features are recreated.
    # smi gets a leading nil so that compound ids start at 1.
    smi = [nil]; smi += c_dataset.compounds.collect { |c| OpenTox::Compound.new(c).to_smiles }
    smarts = f_dataset.features.collect { |f,m| m[OT.smarts] }
    params[:nr_hits] == "true" ? hit_count=true: hit_count=false
    matches, counts = LU.new.match_rb(smi, smarts, hit_count) if smarts.size>0

    f_dataset.features.each do |f,m|
      if (matches[m[OT.smarts]] && matches[m[OT.smarts]].size>0)

        # NOTE(review): the URI always contains "bbrc", whatever :method was
        # requested -- confirm this is intended.
        feature_uri = File.join params[:feature_dataset_uri],"feature","bbrc","match", res_dataset.features.size.to_s
        #feature_uri = File.join res_dataset.uri,"feature","match", res_dataset.features.size.to_s
        metadata = {
          RDF.type => [OT.Feature, OT.Substructure],
          OT.hasSource => f_dataset.uri,
          OT.smarts => m[OT.smarts],
          OT.parameters => [
            { DC.title => "dataset_uri", OT.paramValue => params[:dataset_uri] }
          ]
        }

        if (prediction_feature)
          feat_hash = Hash[*(all_activities.select { |k,v| matches[m[OT.smarts]].include?(k) }.flatten)]
          if prediction_feature.feature_type == "regression"
            p_value = @@last.KSTest(all_activities.values, feat_hash.values).to_f # AM LAST: use internal function for test
            effect = (p_value > 0) ? "activating" : "deactivating"
          else
            p_value = @@last.ChisqTest(all_activities.values, feat_hash.values).to_f
            g=Array.new # g is filled in *a*scending activity
            training_dataset.value_map(prediction_feature.uri).each { |y,act| g[y-1]=Array.new }
            feat_hash.each  { |x,y|   g[y-1].push(x)   }
            max = OpenTox::Algorithm.effect(g, db_class_sizes) # db_class_sizes is filled in *a*scending activity
            effect = max+1
          end
          metadata[OT.effect] = effect
          # Round to four decimals. Fixed: dividing the rounded integer by the
          # integer 10000 truncated the result to 0 or 1.
          metadata[OT.pValue] = (p_value.abs * 10000).round / 10000.0
          metadata[OT.parameters] << { DC.title => "prediction_feature", OT.paramValue => prediction_feature.uri }
        end

        res_dataset.add_feature feature_uri, metadata

        # Matched ids are 1-based (see smi above), compounds are 0-based.
        matches[m[OT.smarts]].each_with_index {|id,idx|
          res_dataset.add(c_dataset.compounds[id-1],feature_uri,counts[m[OT.smarts]][idx])
        }
      end
    end
    res_dataset.save @subjectid
    res_dataset.uri
  end
  return_task(task)
end
732