summaryrefslogtreecommitdiff
path: root/fminer.rb
blob: bfc68797a63f1b71f2150d35230a4c9d9782af2c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
=begin
* Name: fminer.rb
* Description: Subgraph descriptor calculation 
* Author: Andreas Maunz <andreas@maunz.de>
* Date: 10/2012
=end

# Switch on every fminer environment flag before the miner instances are created.
# NOTE(review): presumably these are read by the native fminer bindings at
# construction time -- confirm against the Bbrc/Last extension sources.
%w[
  FMINER_SMARTS
  FMINER_NO_AROMATIC
  FMINER_PVALUES
  FMINER_SILENT
  FMINER_NR_HITS
].each do |flag|
  ENV[flag] = 'true'
end

# Miner instances created once at load time and shared by the routes.
# @@last is the miner driven by the POST /fminer/last route in this file;
# @@bbrc presumably serves the BBRC routes defined elsewhere -- TODO confirm.
# NOTE(review): these are class variables assigned at the top level; Ruby 3.0+
# raises a RuntimeError for toplevel class variable access, so this file
# needs rework before running on a modern interpreter.
# NOTE(review): the instances are shared across requests; nothing visible here
# serialises access to them -- confirm the deployment is single-threaded.
@@bbrc = Bbrc::Bbrc.new
@@last = Last::Last.new

module OpenTox
  
  class Application < Service

    # List the fminer algorithms offered by this service (BBRC and LAST)
    # @return [text/uri-list] URIs of the algorithm resources
    get '/fminer/?' do
      algorithm_uris = ['/fminer/bbrc', '/fminer/last'].map { |path| uri(path) }
      render algorithm_uris
    end
    
    # Describe the BBRC algorithm: metadata plus the list of accepted parameters
    # @return [String] Rendered algorithm representation
    get "/fminer/bbrc/?" do
      bbrc_algorithm = OpenTox::Algorithm::Generic.new(to('/fminer/bbrc',:full))
      bbrc_algorithm.metadata = {
        RDF::DC.title => 'Backbone Refinement Class Representatives',
        RDF::DC.creator => "andreas@maunz.de",
        RDF.type => [RDF::OT.Algorithm,RDF::OTA.PatternMiningSupervised]
      }
      # One row per parameter: [title, scope, description]
      parameter_table = [
        ["dataset_uri", "mandatory", "Dataset URI"],
        ["prediction_feature", "optional", "Feature URI for dependent variable"],
        ["min_frequency", "optional", "Minimum frequency"],
        ["feature_type", "optional", "Feature type, can be 'paths' or 'trees'"],
        ["backbone", "optional", "BBRC classes, pass 'false' to switch off mining for BBRC representatives."],
        ["min_chisq_significance", "optional", "Significance threshold (between 0 and 1)"],
        ["nr_hits", "optional", "Whether subgraphs should be weighted with their occurrence counts in the instances (frequency)"],
        ["get_target", "optional", "Set to 'true' to obtain target variables as a feature"]
      ]
      # Key insertion order (description, scope, title) is kept as in the literal form
      bbrc_algorithm.parameters = parameter_table.map do |title, scope, description|
        { RDF::DC.description => description, RDF::OT.paramScope => scope, RDF::DC.title => title }
      end
      render(bbrc_algorithm)
    end
    
    # Describe the BBRC-sample algorithm: metadata plus the list of accepted parameters
    # @return [String] Rendered algorithm representation
    get "/fminer/bbrc/sample/?" do
      sample_algorithm = OpenTox::Algorithm::Generic.new(to('/fminer/bbrc/sample',:full))
      sample_algorithm.metadata = {
        RDF::DC.title => 'Backbone Refinement Class Representatives, obtained from samples of a dataset',
        RDF::DC.creator => "andreas@maunz.de",
        RDF.type => [RDF::OT.Algorithm,RDF::OTA.PatternMiningSupervised]
      }
      # One row per parameter: [title, scope, description]
      parameter_table = [
        ["dataset_uri", "mandatory", "Dataset URI"],
        ["prediction_feature", "optional", "Feature URI for dependent variable"],
        ["num_boots", "optional", "Number of bootstrap samples"],
        ["min_sampling_support", "optional", "Minimum sampling support"],
        ["min_frequency", "optional", "Minimum frequency"],
        ["nr_hits", "optional", "Whether subgraphs should be weighted with their occurrence counts in the instances (frequency)"],
        ["backbone", "optional", "BBRC classes, pass 'false' to switch off mining for BBRC representatives."],
        ["method", "optional", "Chisq estimation method, pass 'mean' to use simple mean estimate for chisq test."]
      ]
      # Key insertion order (description, scope, title) is kept as in the literal form
      sample_algorithm.parameters = parameter_table.map do |title, scope, description|
        { RDF::DC.description => description, RDF::OT.paramScope => scope, RDF::DC.title => title }
      end
      render(sample_algorithm)
    end
    
    # Describe the fminer LAST-PM algorithm: metadata plus the list of accepted parameters
    # @return [String] Rendered algorithm representation
    get "/fminer/last/?" do
      last_algorithm = OpenTox::Algorithm::Generic.new(to('/fminer/last',:full))
      last_algorithm.metadata = {
        RDF::DC.title => 'Latent Structure Pattern Mining descriptors',
        RDF::DC.creator => "andreas@maunz.de",
        RDF.type => [RDF::OT.Algorithm,RDF::OTA.PatternMiningSupervised]
      }
      # One row per parameter: [title, scope, description]
      parameter_table = [
        ["dataset_uri", "mandatory", "Dataset URI"],
        ["prediction_feature", "optional", "Feature URI for dependent variable"],
        ["min_frequency", "optional", "Minimum frequency"],
        ["feature_type", "optional", "Feature type, can be 'paths' or 'trees'"],
        ["nr_hits", "optional", "Whether subgraphs should be weighted with their occurrence counts in the instances (frequency)"],
        ["get_target", "optional", "Set to 'true' to obtain target variables as a feature"]
      ]
      # Key insertion order (description, scope, title) is kept as in the literal form
      last_algorithm.parameters = parameter_table.map do |title, scope, description|
        { RDF::DC.description => description, RDF::OT.paramScope => scope, RDF::DC.title => title }
      end
      render(last_algorithm)
    end
    
    
    # Get representation of the feature matching algorithm for a given miner
    #
    # The route pattern ends in '/?' (optional trailing slash) like the other
    # routes in this file. The previous pattern 'match?' made the final 'h'
    # optional instead, so '/match/' was rejected while '/matc' was accepted.
    #
    # @param [String] method Path segment naming the miner; it is echoed into the algorithm URI
    # @return [String] Representation
    get "/fminer/:method/match/?" do
      algorithm = OpenTox::Algorithm::Generic.new(to("/fminer/#{params[:method]}/match",:full))
      algorithm.metadata = {
        RDF::DC.title => 'fminer feature matching',
        RDF::DC.creator => "mguetlein@gmail.com, andreas@maunz.de",
        RDF.type => [RDF::OT.Algorithm,RDF::OTA.PatternMiningSupervised]
      }
      algorithm.parameters = [
          { RDF::DC.description => "Dataset URI", RDF::OT.paramScope => "mandatory", RDF::DC.title => "dataset_uri" },
          { RDF::DC.description => "Feature Dataset URI", RDF::OT.paramScope => "mandatory", RDF::DC.title => "feature_dataset_uri" },
          { RDF::DC.description => "Feature URI for dependent variable", RDF::OT.paramScope => "optional", RDF::DC.title => "prediction_feature" }
      ]
      render(algorithm)
    end
    
    
   


    # Run the LAST algorithm on a dataset
    #
    # Mining happens inside an OpenTox::Task; the request itself answers
    # immediately with 202 and the task URI. The task result is the URI of the
    # stored feature dataset.
    #
    # @param [String] dataset_uri URI of the training dataset
    # @param [String] prediction_feature URI of the prediction feature (i.e. dependent variable)
    # @param [optional] parameters LAST parameters, accepted parameters are
    #   - min_frequency freq  Minimum frequency (default 5)
    #   - feature_type Feature type, can be 'paths' or 'trees' (default "trees")
    #   - nr_hits Set to "true" to get hit count instead of presence
    #   - get_target Set to "true" to obtain target variable as feature
    # @return [text/uri-list] Task URI
    post '/fminer/last/?' do

      @@fminer = OpenTox::Algorithm::Fminer.new(to('/fminer/last',:full))
      # NOTE(review): the meaning of the second argument is defined by
      # Fminer#check_params, which is not visible in this file.
      @@fminer.check_params(params,80)

      task = OpenTox::Task.run("Mining LAST features", uri('/fminer/last')) do |task|

        @@last.Reset
        value_map = nil # stays nil when the prediction feature is a regression feature
        if @@fminer.prediction_feature.feature_type == "regression"
          @@last.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations!
        else
          # The message must interpolate @@fminer: a bare `fminer` is undefined
          # here and would turn this bad-request error into a NameError.
          unless @@fminer.prediction_feature.accept_values
            bad_request_error "No accept values for "\
                            "dataset '#{@@fminer.training_dataset.uri}' and "\
                            "feature '#{@@fminer.prediction_feature.uri}'"
          end
          value_map = @@fminer.prediction_feature.value_map
        end
        @@last.SetMinfreq(@@fminer.minfreq)
        @@last.SetType(1) if params[:feature_type] == "paths"
        @@last.SetConsoleOut(false)

        feature_dataset = OpenTox::Dataset.new
        feature_dataset.metadata = {
          RDF::DC.title => "LAST representatives for #{@@fminer.training_dataset.title}",
          RDF::DC.creator => to('/fminer/last'),
          RDF::OT.hasSource => to('/fminer/last')
        }
        feature_dataset.parameters = [
            { RDF::DC.title => "dataset_uri", RDF::OT.paramValue => params[:dataset_uri] },
            { RDF::DC.title => "prediction_feature", RDF::OT.paramValue => params[:prediction_feature] },
            { RDF::DC.title => "min_frequency", RDF::OT.paramValue => @@fminer.minfreq },
            { RDF::DC.title => "nr_hits", RDF::OT.paramValue => (params[:nr_hits] == "true" ? "true" : "false") }
        ]

        @@fminer.compounds = []
        @@fminer.db_class_sizes = Array.new # AM: effect
        @@fminer.all_activities = Hash.new # DV: for effect calculation (class and regr)
        @@fminer.smi = [] # needed for matching the patterns back

        # Add data to fminer
        @@fminer.add_fminer_data(@@last, value_map)
        #task.progress 10
        root_count = @@last.GetNoRootNodes()
        # Only consumed by the progress reporting that is currently disabled;
        # it is derived from the LAST miner (not @@bbrc), the one being run here.
        step_width = 80 / root_count.to_f
        # run @@last
        xml = ""
        (0 .. root_count-1).each do |j|
          results = @@last.MineRoot(j)
          #task.progress 10+step_width*(j+1)
          results.each do |result|
            xml << result
          end
        end

        lu = LU.new                             # uses last-utils here
        dom=lu.read(xml)                        # parse GraphML
        smarts=lu.smarts_rb(dom,'nls')          # converts patterns to LAST-SMARTS using msa variant (see last-pm.maunz.de)
        hit_count = (params[:nr_hits] == "true")
        matches, counts = lu.match_rb(@@fminer.smi,smarts,hit_count,true)       # creates instantiations

        features = []
        # prepare to receive results as hash { c => { feature_uri => v, ... } }
        fminer_results = {}
        matches.each do |pattern, ids|
          metadata, parameters = @@fminer.calc_metadata(pattern, ids, counts[pattern], @@last, nil, value_map, params)
          metadata[RDF::DC.title] = pattern.dup
          feature = OpenTox::Feature.find_or_create(metadata)
          features << feature
          ids.each_with_index { |id,idx|
            compound = @@fminer.compounds[id]
            fminer_results[compound] ||= {}
            fminer_results[compound][feature.uri] = counts[pattern][idx]
          }
        end

        fminer_compounds = @@fminer.training_dataset.compounds
        prediction_feature_idx = @@fminer.training_dataset.features.collect{|f| f.uri}.index @@fminer.prediction_feature.uri
        prediction_feature_all_acts = fminer_compounds.each_with_index.collect { |c,idx|
          @@fminer.training_dataset.data_entries[idx][prediction_feature_idx]
        }
        # Compounds of the training dataset that were not handed to the miner
        fminer_noact_compounds = fminer_compounds - @@fminer.compounds

        feature_dataset.features = features
        if (params[:get_target] == "true")
          feature_dataset.features = [ @@fminer.prediction_feature ] + feature_dataset.features
        end
        fminer_compounds.each_with_index { |c,idx|
          # TODO: fix value insertion
          row = [ c ]
          if (params[:get_target] == "true")
            row = row + [ prediction_feature_all_acts[idx] ]
          end
          features.each { |f|
            row << (fminer_results[c] ? fminer_results[c][f.uri] : nil)
          }
          # Missing values become 0 for mined compounds; rows of compounds
          # that were not mined keep their nils.
          row.collect! { |v| v ? v : 0 } unless fminer_noact_compounds.include? c
          feature_dataset << row
        }
        feature_dataset.put
        feature_dataset.uri

      end
      response['Content-Type'] = 'text/uri-list'
      halt 202,task.uri
    end

  end

end