diff options
author | mr <mr@mrautenberg.de> | 2011-08-04 18:18:40 +0200 |
---|---|---|
committer | mr <mr@mrautenberg.de> | 2011-08-04 18:18:40 +0200 |
commit | fb072193eebcd9dc1d64e66456846c9a08297163 (patch) | |
tree | d2a07dd5d051549f527778264bca6a5e677b60fe | |
parent | a9c32e08d4e01da1d63e8212fd0870d39b4f112f (diff) | |
parent | bce27bd8e66deebb6f438e56957f575399946a50 (diff) |
solve merge conflicts (take new development version of fminer.rb) v2.1.0
-rw-r--r-- | README.md | 30 | ||||
-rw-r--r-- | application.rb | 11 | ||||
-rw-r--r-- | fminer.rb | 332 | ||||
m--------- | last-utils | 0 | ||||
-rw-r--r-- | lazar.rb | 150 | ||||
m--------- | libfminer | 0 | ||||
-rw-r--r-- | openbabel.rb | 6 | ||||
-rw-r--r-- | similarity.rb | 20 |
8 files changed, 271 insertions, 278 deletions
@@ -23,15 +23,35 @@ REST operations [min_frequency=5 per-mil], [feature_type=trees], [backbone=true], - [min_chisq_significance=0.95] + [min_chisq_significance=0.95], + [nr_hits=false] Create last features POST /fminer/last dataset_uri, URI for feature dataset 200,400,404,500 feature_uri, [min_frequency=8 %], [feature_type=trees], - [max_hops=25], + [nr_hits=false] Create lazar model POST /lazar dataset_uri, URI for lazar model 200,400,404,500 prediction_feature, feature_generation_uri + prediction_algorithm + [local_svm_kernel=weighted_tanimoto] + [min_sim=0.3] + [nr_hits=false] + [activity_transform=<Log10 (regression),NOP (classification)>] + [conf_stdev=false] + +Synopsis +-------- + +- prediction\_algorithm: One of "weighted\_majority\_vote" (default for classification), "local\_svm\_classification", "local\_svm\_regression" (default for regression), "local\_mlr\_prop". "weighted\_majority\_vote" is not applicable for regression. "local\_mlr\_prop" is not applicable for classification. +- local\_svm\_kernel: One of "weighted\_tanimoto", "propositionalized". local\_svm\_kernel is not applicable when prediction\_algorithm="weighted\_majority\_vote". +- min_sim: The minimum similarity threshold for neighbors. Numeric value in [0,1]. +- nr_hits: Whether for instantiated models (local\_svm\_kernel = "propositionalized" for prediction_algorithm="local\_svm\_classification" or "local\_svm\_regression", or for prediction_algorithm="local\_mlr\_prop") nominal features should be instantiated with their occurrence counts in the instances. For non-instantiated models (local\_svm\_kernel = "weighted\_tanimoto" for prediction_algorithm="local\_svm\_classification" or "local\_svm\_regression", or for prediction_algorithm="weighted\_majority\_vote") the neighbor-to-neighbor and neighbor-to-query similarity also integrates these counts, when the parameter is set. One of "true", "false".
+- activity_transform: Normalizing transformations of the y-values (activities), applicable only to regression problems. One of "Log10", "Inverter", "NOP". "Log10" moves all values above zero and takes the log to base 10. "Inverter" moves all values above 1.0 and takes the inverted value. "NOP" is the identity transformation, which does nothing. Model predictions are output with reverse transformation applied. +- conf_stdev: Whether confidence integrates distribution of neighbor activity values. When "true", the similarity is multiplied by exp(-1.0*(standard deviation of neighbor activities)). One of "true", "false". + +See http://www.maunz.de/wordpress/opentox/2011/lazar-models-and-how-to-trigger-them for a graphical overview. + Supported MIME formats ---------------------- @@ -68,6 +88,7 @@ backbone=false reduces BBRC mining to frequent and correlated subtree mining (mu curl -X POST -d dataset_uri={datset_uri} -d prediction_feature={feature_uri} -d min_frequency={min_frequency} http://webservices.in-silico.ch/algorithm/fminer/bbrc feature_uri specifies the dependent variable from the dataset. +Adding -d nr_hits=true produces frequency counts per pattern and molecule. Please click [here](http://bbrc.maunz.de#usage) for more guidance on usage. ### Create [LAST-PM](http://last-pm.maunz.de) descriptors, recommended for small to medium-sized datasets. @@ -75,15 +96,16 @@ Please click [here](http://bbrc.maunz.de#usage) for more guidance on usage. curl -X POST -d dataset_uri={datset_uri} -d prediction_feature={feature_uri} -d min_frequency={min_frequency} http://webservices.in-silico.ch/algorithm/fminer/last feature_uri specifies the dependent variable from the dataset. +Adding -d nr_hits=true produces frequency counts per pattern and molecule. Please click [here](http://last-pm.maunz.de#usage) for more guidance on usage.
* * * ### Create lazar model - curl -X POST -d dataset_uri={datset_uri} -d prediction_feature={feature_uri} -d feature_generation_uri=http://webservices.in-silico.ch/algorithm/fminer http://webservices.in-silico.ch/test/algorithm/lazar +Creates a standard Lazar model. -feature_uri specifies the dependent variable from the dataset + curl -X POST -d dataset_uri={datset_uri} -d prediction_feature={feature_uri} -d feature_generation_uri=http://webservices.in-silico.ch/algorithm/fminer/bbrc http://webservices.in-silico.ch/test/algorithm/lazar [API documentation](http://rdoc.info/github/opentox/algorithm) -------------------------------------------------------------- diff --git a/application.rb b/application.rb index 55a8ea4..32fea95 100644 --- a/application.rb +++ b/application.rb @@ -22,6 +22,13 @@ end # # @return [text/uri-list] algorithm URIs get '/?' do - response['Content-Type'] = 'text/uri-list' - [ url_for('/lazar', :full), url_for('/fminer/bbrc', :full), url_for('/fminer/last', :full) ].join("\n") + "\n" + list = [ url_for('/lazar', :full), url_for('/fminer/bbrc', :full), url_for('/fminer/last', :full) ].join("\n") + "\n" + case request.env['HTTP_ACCEPT'] + when /text\/html/ + content_type "text/html" + OpenTox.text_to_html list + else + content_type 'text/uri-list' + list + end end @@ -2,6 +2,7 @@ ENV['FMINER_SMARTS'] = 'true' ENV['FMINER_NO_AROMATIC'] = 'true' ENV['FMINER_PVALUES'] = 'true' ENV['FMINER_SILENT'] = 'true' +ENV['FMINER_NR_HITS'] = 'true' @@bbrc = Bbrc::Bbrc.new @@last = Last::Last.new @@ -10,20 +11,26 @@ ENV['FMINER_SILENT'] = 'true' # # @return [text/uri-list] URIs of fminer algorithms get '/fminer/?' 
do - response['Content-Type'] = 'text/uri-list' - [ url_for('/fminer/bbrc', :full), url_for('/fminer/last', :full) ].join("\n") + "\n" + list = [ url_for('/fminer/bbrc', :full), url_for('/fminer/last', :full) ].join("\n") + "\n" + case request.env['HTTP_ACCEPT'] + when /text\/html/ + content_type "text/html" + OpenTox.text_to_html list + else + content_type 'text/uri-list' + list + end end # Get RDF/XML representation of fminer bbrc algorithm # @return [application/rdf+xml] OWL-DL representation of fminer bbrc algorithm get "/fminer/bbrc/?" do - response['Content-Type'] = 'application/rdf+xml' algorithm = OpenTox::Algorithm::Generic.new(url_for('/fminer/bbrc',:full)) algorithm.metadata = { DC.title => 'fminer backbone refinement class representatives', DC.creator => "andreas@maunz.de, helma@in-silico.ch", DC.contributor => "vorgrimmlerdavid@gmx.de", - RDF.type => [OTA.PatternMiningSupervised], + RDF.type => [OT.Algorithm,OTA.PatternMiningSupervised], OT.parameters => [ { DC.description => "Dataset URI", OT.paramScope => "mandatory", DC.title => "dataset_uri" }, { DC.description => "Feature URI for dependent variable", OT.paramScope => "mandatory", DC.title => "prediction_feature" }, @@ -33,7 +40,17 @@ get "/fminer/bbrc/?" do { DC.description => "Significance threshold (between 0 and 1)", OT.paramScope => "optional", DC.title => "min_chisq_significance" }, ] } - algorithm.to_rdfxml + case request.env['HTTP_ACCEPT'] + when /text\/html/ + content_type "text/html" + OpenTox.text_to_html algorithm.to_yaml + when /application\/x-yaml/ + content_type "application/x-yaml" + algorithm.to_yaml + else + response['Content-Type'] = 'application/rdf+xml' + algorithm.to_rdfxml + end end # Get RDF/XML representation of fminer last algorithm @@ -44,7 +61,7 @@ get "/fminer/last/?" 
do DC.title => 'fminer latent structure class representatives', DC.creator => "andreas@maunz.de, helma@in-silico.ch", DC.contributor => "vorgrimmlerdavid@gmx.de", - RDF.type => [OTA.PatternMiningSupervised], + RDF.type => [OT.Algorithm,OTA.PatternMiningSupervised], OT.parameters => [ { DC.description => "Dataset URI", OT.paramScope => "mandatory", DC.title => "dataset_uri" }, { DC.description => "Feature URI for dependent variable", OT.paramScope => "mandatory", DC.title => "prediction_feature" }, @@ -53,7 +70,17 @@ get "/fminer/last/?" do { DC.description => "Maximum number of hops", OT.paramScope => "optional", DC.title => "hops" }, ] } - algorithm.to_rdfxml + case request.env['HTTP_ACCEPT'] + when /text\/html/ + content_type "text/html" + OpenTox.text_to_html algorithm.to_yaml + when /application\/x-yaml/ + content_type "application/x-yaml" + algorithm.to_yaml + else + response['Content-Type'] = 'application/rdf+xml' + algorithm.to_rdfxml + end end # Run bbrc algorithm on dataset @@ -61,36 +88,29 @@ end # @param [String] dataset_uri URI of the training dataset # @param [String] prediction_feature URI of the prediction feature (i.e. dependent variable) # @param [optional] parameters BBRC parameters, accepted parameters are -# - minfreq Minimum frequency (default 5) +# - min_frequency Minimum frequency (default 5) # - feature_type Feature type, can be 'paths' or 'trees' (default "trees") # - backbone BBRC classes, pass 'false' to switch off mining for BBRC representatives. (default "true") # - min_chisq_significance Significance threshold (between 0 and 1) +# - nr_hits Set to "true" to get hit count instead of presence # @return [text/uri-list] Task URI post '/fminer/bbrc/?' do - halt 404, "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil? - halt 404, "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil? 
- prediction_feature = OpenTox::Feature.find params[:prediction_feature], @subjectid - training_dataset = OpenTox::Dataset.find "#{params[:dataset_uri]}", @subjectid - halt 404, "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless training_dataset.features and training_dataset.features.include?(params[:prediction_feature]) - - unless params[:min_frequency].nil? - minfreq=params[:min_frequency].to_i - raise "Minimum frequency must be a number >0!" unless minfreq>0 - else - minfreq = 5*training_dataset.compounds.size/1000 # AM sugg. 8-10 per mil - minfreq = 2 unless minfreq > 2 - end - - task = OpenTox::Task.create("Mining BBRC features", url_for('/fminer',:full)) do + fminer=OpenTox::Algorithm::Fminer.new + fminer.check_params(params,5,@subjectid) + task = OpenTox::Task.create("Mining BBRC features", url_for('/fminer',:full)) do |task| @@bbrc.Reset - if prediction_feature.feature_type == "regression" + if fminer.prediction_feature.feature_type == "regression" @@bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations! 
else - @training_classes = training_dataset.feature_classes(prediction_feature.uri, @subjectid) + raise "no accept values for dataset '"+fminer.training_dataset.uri.to_s+"' and feature '"+fminer.prediction_feature.uri.to_s+ + "'" unless fminer.training_dataset.accept_values(fminer.prediction_feature.uri) + @training_classes = fminer.training_dataset.accept_values(fminer.prediction_feature.uri).sort + @value_map=Hash.new + @training_classes.each_with_index { |c,i| @value_map[i+1] = c } end - @@bbrc.SetMinfreq(minfreq) + @@bbrc.SetMinfreq(fminer.minfreq) @@bbrc.SetType(1) if params[:feature_type] == "paths" @@bbrc.SetBackbone(eval params[:backbone]) if params[:backbone] and ( params[:backbone] == "true" or params[:backbone] == "false" ) # convert string to boolean @@bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance] @@ -98,7 +118,7 @@ post '/fminer/bbrc/?' do feature_dataset = OpenTox::Dataset.new(nil, @subjectid) feature_dataset.add_metadata({ - DC.title => "BBRC representatives for " + training_dataset.metadata[DC.title].to_s, + DC.title => "BBRC representatives for " + fminer.training_dataset.metadata[DC.title].to_s, DC.creator => url_for('/fminer/bbrc',:full), OT.hasSource => url_for('/fminer/bbrc', :full), OT.parameters => [ @@ -108,106 +128,44 @@ post '/fminer/bbrc/?' do }) feature_dataset.save(@subjectid) - id = 1 # fminer start id is not 0 - compounds = [] - nr_active=0 - nr_inactive=0 - all_activities = Hash.new# DV: for effect calculation in regression part + fminer.compounds = [] + fminer.db_class_sizes = Array.new # AM: effect + fminer.all_activities = Hash.new # DV: for effect calculation in regression part + fminer.smi = [] # AM LAST: needed for matching the patterns back - training_dataset.data_entries.each do |compound,entry| - begin - smiles = OpenTox::Compound.new(compound.to_s).to_smiles - rescue - LOGGER.warn "No resource for #{compound.to_s}" - next - end - if smiles == '' or smiles.nil? 
- LOGGER.warn "Cannot find smiles for #{compound.to_s}." - next - end + # Add data to fminer + fminer.add_fminer_data(@@bbrc, params, @value_map) - # AM: take log if appropriate - take_logs=true - entry.each do |feature,values| - values.each do |value| - if prediction_feature.feature_type == "regression" - if (! value.nil?) && (value.to_f < 1) - take_logs=false - end - end - end - end - entry.each do |feature,values| - if feature == prediction_feature.uri - values.each do |value| - if value.nil? - LOGGER.warn "No #{feature} activiity for #{compound.to_s}." - else - if prediction_feature.feature_type == "classification" - case value.to_s - when "true" - nr_active += 1 - activity = 1 - when "false" - nr_inactive += 1 - activity = 0 - when /#{@training_classes.last}/ - nr_active += 1 - activity = 1 - when /#{@training_classes.first}/ - nr_inactive += 1 - activity = 0 - else - LOGGER.warn "Unknown class \"#{value.to_s}\"." - end - elsif prediction_feature.feature_type == "regression" - activity= take_logs ? Math.log10(value.to_f) : value.to_f - end - begin - @@bbrc.AddCompound(smiles,id) - @@bbrc.AddActivity(activity, id) - all_activities[id]=activity # DV: insert global information - compounds[id] = compound - id += 1 - rescue - LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer" - end - end - end - end - end - end - - g_array=all_activities.values # DV: calculation of global median for effect calculation - g_median=OpenTox::Algorithm.median(g_array) + g_array=fminer.all_activities.values # DV: calculation of global median for effect calculation + g_median=g_array.to_scale.median - raise "No compounds in dataset #{training_dataset.uri}" if compounds.size==0 - + raise "No compounds in dataset #{fminer.training_dataset.uri}" if fminer.compounds.size==0 + task.progress 10 + step_width = 80 / @@bbrc.GetNoRootNodes().to_f features = Set.new + # run @@bbrc (0 .. 
@@bbrc.GetNoRootNodes()-1).each do |j| - results = @@bbrc.MineRoot(j) + task.progress 10+step_width*(j+1) results.each do |result| f = YAML.load(result)[0] smarts = f[0] p_value = f[1] if (!@@bbrc.GetRegression) - ids = f[2] + f[3] - if f[2].size.to_f/ids.size > nr_active.to_f/(nr_active+nr_inactive) - effect = 'activating' - else - effect = 'deactivating' - end + id_arrs = f[2..-1].flatten + max = OpenTox::Algorithm.effect(f[2..-1], fminer.db_class_sizes) + effect = f[2..-1].size-max else #regression part - ids = f[2] + id_arrs = f[2] # DV: effect calculation f_arr=Array.new f[2].each do |id| - f_arr.push(all_activities[id]) + id=id.keys[0] # extract id from hit count hash + f_arr.push(fminer.all_activities[id]) end - f_median=OpenTox::Algorithm.median(f_arr) + f_median=f_arr.to_scale.median if g_median >= f_median effect = 'activating' else @@ -220,7 +178,7 @@ post '/fminer/bbrc/?' do features << smarts metadata = { OT.hasSource => url_for('/fminer/bbrc', :full), - RDF.type => [OT.Substructure], + RDF.type => [OT.Feature, OT.Substructure], OT.smarts => smarts, OT.pValue => p_value.to_f, OT.effect => effect, @@ -232,14 +190,22 @@ post '/fminer/bbrc/?' 
do feature_dataset.add_feature feature_uri, metadata #feature_dataset.add_feature_parameters feature_uri, feature_dataset.parameters end - ids.each { |id| feature_dataset.add(compounds[id], feature_uri, true)} + id_arrs.each { |id_count_hash| + id=id_count_hash.keys[0].to_i + count=id_count_hash.values[0].to_i + if params[:nr_hits] == "true" + feature_dataset.add(fminer.compounds[id], feature_uri, count) + else + feature_dataset.add(fminer.compounds[id], feature_uri, 1) + end + } end end feature_dataset.save(@subjectid) feature_dataset.uri end response['Content-Type'] = 'text/uri-list' - halt 503,task.uri+"\n" if task.status == "Cancelled" + raise OpenTox::ServiceUnavailableError.newtask.uri+"\n" if task.status == "Cancelled" halt 202,task.uri.to_s+"\n" end #end @@ -249,36 +215,28 @@ end # @param [String] dataset_uri URI of the training dataset # @param [String] prediction_feature URI of the prediction feature (i.e. dependent variable) # @param [optional] parameters LAST parameters, accepted parameters are -# - minfreq Minimum frequency (default 5) +# - min_frequency freq Minimum frequency (default 5) # - feature_type Feature type, can be 'paths' or 'trees' (default "trees") # - hops Maximum number of hops +# - nr_hits Set to "true" to get hit count instead of presence # @return [text/uri-list] Task URI post '/fminer/last/?' do - halt 404, "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil? - halt 404, "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil? 
- prediction_feature = OpenTox::Feature.find params[:prediction_feature], @subjectid - training_dataset = OpenTox::Dataset.new "#{params[:dataset_uri]}", @subjectid - training_dataset.load_all(@subjectid) - halt 404, "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless training_dataset.features and training_dataset.features.include?(params[:prediction_feature]) - - unless params[:min_frequency].nil? - minfreq=params[:min_frequency].to_i - raise "Minimum frequency must be a number >0!" unless minfreq>0 - else - minfreq = 8*training_dataset.compounds.size/100 # AM sugg. 5-10% - minfreq = 2 unless minfreq > 2 - end - - task = OpenTox::Task.create("Mining LAST features", url_for('/fminer',:full)) do + fminer=OpenTox::Algorithm::Fminer.new + fminer.check_params(params,80,@subjectid) + task = OpenTox::Task.create("Mining LAST features", url_for('/fminer',:full)) do |task| @@last.Reset - if prediction_feature.feature_type == "regression" + if fminer.prediction_feature.feature_type == "regression" @@last.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations! else - @training_classes = training_dataset.feature_classes(prediction_feature.uri) + raise "no accept values for dataset '"+fminer.training_dataset.uri.to_s+"' and feature '"+fminer.prediction_feature.uri.to_s+ + "'" unless fminer.training_dataset.accept_values(fminer.prediction_feature.uri) + @training_classes = fminer.training_dataset.accept_values(fminer.prediction_feature.uri).sort + @value_map=Hash.new + @training_classes.each_with_index { |c,i| @value_map[i+1] = c } end - @@last.SetMinfreq(minfreq) + @@last.SetMinfreq(fminer.minfreq) @@last.SetType(1) if params[:feature_type] == "paths" @@last.SetMaxHops(params[:hops]) if params[:hops] @@last.SetConsoleOut(false) @@ -286,7 +244,7 @@ post '/fminer/last/?' 
do feature_dataset = OpenTox::Dataset.new(nil, @subjectid) feature_dataset.add_metadata({ - DC.title => "LAST representatives for " + training_dataset.metadata[DC.title].to_s, + DC.title => "LAST representatives for " + fminer.training_dataset.metadata[DC.title].to_s, DC.creator => url_for('/fminer/last',:full), OT.hasSource => url_for('/fminer/last', :full), OT.parameters => [ @@ -296,74 +254,25 @@ post '/fminer/last/?' do }) feature_dataset.save(@subjectid) - id = 1 # fminer start id is not 0 - compounds = [] - smi = [] # AM LAST: needed for matching the patterns back - nr_active=0 - nr_inactive=0 - all_activities = Hash.new #DV: for effect calculation (class and regr) - - training_dataset.data_entries.each do |compound,entry| - begin - smiles = OpenTox::Compound.new(compound.to_s).to_smiles - rescue - LOGGER.warn "No resource for #{compound.to_s}" - next - end - if smiles == '' or smiles.nil? - LOGGER.warn "Cannot find smiles for #{compound.to_s}." - next - end - entry.each do |feature,values| - if feature == prediction_feature.uri - values.each do |value| - if value.nil? - LOGGER.warn "No #{feature} activiity for #{compound.to_s}." - else - if prediction_feature.feature_type == "classification" - case value.to_s - when "true" - nr_active += 1 - activity = 1 - when "false" - nr_inactive += 1 - activity = 0 - when /#{@training_classes.last}/ - nr_active += 1 - activity = 1 - when /#{@training_classes.first}/ - nr_inactive += 1 - activity = 0 - else - LOGGER.warn "Unknown class \"#{value.to_s}." - end - elsif prediction_feature.feature_type == "regression" - activity = value.to_f - end - begin - @@last.AddCompound(smiles,id) - @@last.AddActivity(activity, id) - all_activities[id]=activity # DV: insert global information - compounds[id] = compound - smi[id] = smiles # AM LAST: changed this to store SMILES. 
- id += 1 - rescue - LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer" - end - end - end - end - end - end + fminer.compounds = [] + fminer.db_class_sizes = Array.new # AM: effect + fminer.all_activities = Hash.new # DV: for effect calculation (class and regr) + fminer.smi = [] # AM LAST: needed for matching the patterns back - raise "No compounds in dataset #{training_dataset.uri}" if compounds.size==0 + # Add data to fminer + fminer.add_fminer_data(@@last, params, @value_map) + + raise "No compounds in dataset #{fminer.training_dataset.uri}" if fminer.compounds.size==0 # run @@last features = Set.new xml = "" + task.progress 10 + step_width = 80 / @@last.GetNoRootNodes().to_f (0 .. @@last.GetNoRootNodes()-1).each do |j| results = @@last.MineRoot(j) + task.progress 10+step_width*(j+1) results.each do |result| xml << result end @@ -372,21 +281,30 @@ post '/fminer/last/?' do lu = LU.new # AM LAST: uses last-utils here dom=lu.read(xml) # AM LAST: parse GraphML smarts=lu.smarts_rb(dom,'nls') # AM LAST: converts patterns to LAST-SMARTS using msa variant (see last-pm.maunz.de) - instances=lu.match_rb(smi,smarts) # AM LAST: creates instantiations - instances.each do |smarts, ids| - feat_hash = Hash[*(all_activities.select { |k,v| ids.include?(k) }.flatten)] # AM LAST: get activities of feature occurrences; see http://www.softiesonrails.com/2007/9/18/ruby-201-weird-hash-syntax - @@last.GetRegression() ? p_value = @@last.KSTest(all_activities.values, feat_hash.values).to_f : p_value = @@last.ChisqTest(all_activities.values, feat_hash.values).to_f # AM LAST: use internal function for test - - - effect = (p_value > 0) ? "activating" : "deactivating" + params[:nr_hits] != "true" ? 
hit_count=false: hit_count=true + matches, counts = lu.match_rb(fminer.smi,smarts,hit_count) # AM LAST: creates instantiations + + matches.each do |smarts, ids| + feat_hash = Hash[*(fminer.all_activities.select { |k,v| ids.include?(k) }.flatten)] # AM LAST: get activities of feature occurrences; see http://www.softiesonrails.com/2007/9/18/ruby-201-weird-hash-syntax + if @@last.GetRegression() + p_value = @@last.KSTest(fminer.all_activities.values, feat_hash.values).to_f # AM LAST: use internal function for test + effect = (p_value > 0) ? "activating" : "deactivating" + else + p_value = @@last.ChisqTest(fminer.all_activities.values, feat_hash.values).to_f + g=Array.new + @value_map.each { |y,act| g[y-1]=Array.new } + feat_hash.each { |x,y| g[y-1].push(x) } + max = OpenTox::Algorithm.effect(g, fminer.db_class_sizes) + effect = g.size-max + end feature_uri = File.join feature_dataset.uri,"feature","last", features.size.to_s unless features.include? smarts features << smarts metadata = { - RDF.type => [OT.Substructure], + RDF.type => [OT.Feature, OT.Substructure], OT.hasSource => feature_dataset.uri, OT.smarts => smarts, - OT.pValue => p_value.to_f.abs, + OT.pValue => p_value.abs, OT.effect => effect, OT.parameters => [ { DC.title => "dataset_uri", OT.paramValue => params[:dataset_uri] }, @@ -395,12 +313,16 @@ post '/fminer/last/?' do } feature_dataset.add_feature feature_uri, metadata end - ids.each { |id| feature_dataset.add(compounds[id], feature_uri, true)} + if !hit_count + ids.each { |id| feature_dataset.add(fminer.compounds[id], feature_uri, true)} + else + ids.each_with_index { |id,i| feature_dataset.add(fminer.compounds[id], feature_uri, counts[smarts][i])} + end end feature_dataset.save(@subjectid) feature_dataset.uri end response['Content-Type'] = 'text/uri-list' - halt 503,task.uri+"\n" if task.status == "Cancelled" + raise OpenTox::ServiceUnavailableError.newtask.uri+"\n" if task.status == "Cancelled" halt 202,task.uri.to_s+"\n" -end +end
\ No newline at end of file diff --git a/last-utils b/last-utils -Subproject 75bea7645601fd296aa68c6678ee9b0a49a7b91 +Subproject 04bd1b73f54bb7422d3c08bb5a81bc02af04f6f @@ -3,7 +3,6 @@ # Get RDF/XML representation of the lazar algorithm # @return [application/rdf+xml] OWL-DL representation of the lazar algorithm get '/lazar/?' do - response['Content-Type'] = 'application/rdf+xml' algorithm = OpenTox::Algorithm::Generic.new(url_for('/lazar',:full)) algorithm.metadata = { DC.title => 'lazar', @@ -17,7 +16,17 @@ get '/lazar/?' do { DC.description => "Further parameters for the feaature generation service", OT.paramScope => "optional" } ] } - algorithm.to_rdfxml + case request.env['HTTP_ACCEPT'] + when /text\/html/ + content_type "text/html" + OpenTox.text_to_html algorithm.to_yaml + when /application\/x-yaml/ + content_type "application/x-yaml" + algorithm.to_yaml + else + response['Content-Type'] = 'application/rdf+xml' + algorithm.to_rdfxml + end end # Create a lazar prediction model @@ -28,29 +37,42 @@ end # @return [text/uri-list] Task URI post '/lazar/?' do + LOGGER.debug "building lazar model with params: "+params.inspect params[:subjectid] = @subjectid - halt 404, "No dataset_uri parameter." unless params[:dataset_uri] + raise OpenTox::NotFoundError.new "No dataset_uri parameter." unless params[:dataset_uri] dataset_uri = params[:dataset_uri] - halt 404, "Dataset #{dataset_uri} not found." unless training_activities = OpenTox::Dataset.new(dataset_uri) - training_activities.load_all(@subjectid) - - prediction_feature = OpenTox::Feature.find(params[:prediction_feature],@subjectid) - unless params[:prediction_feature] # try to read prediction_feature from dataset - halt 404, "#{training_activities.features.size} features in dataset #{dataset_uri}. Please provide a prediction_feature parameter." 
unless training_activities.features.size == 1 - prediction_feature = OpenTox::Feature.find(training_activities.features.keys.first,@subjectid) - params[:prediction_feature] = prediction_feature.uri # pass to feature mining service - end + task = OpenTox::Task.create("Create lazar model",url_for('/lazar',:full)) do |task| - feature_generation_uri = @@feature_generation_default unless feature_generation_uri = params[:feature_generation_uri] + raise OpenTox::NotFoundError.new "Dataset #{dataset_uri} not found." unless training_activities = OpenTox::Dataset.new(dataset_uri) + training_activities.load_all(@subjectid) - halt 404, "No feature #{prediction_feature.uri} in dataset #{params[:dataset_uri]}. (features: "+ - training_activities.features.inspect+")" unless training_activities.features and training_activities.features.include?(prediction_feature.uri) + prediction_feature = OpenTox::Feature.find(params[:prediction_feature],@subjectid) + unless params[:prediction_feature] # try to read prediction_feature from dataset + raise OpenTox::NotFoundError.new "#{training_activities.features.size} features in dataset #{dataset_uri}. Please provide a prediction_feature parameter." unless training_activities.features.size == 1 + prediction_feature = OpenTox::Feature.find(training_activities.features.keys.first,@subjectid) + params[:prediction_feature] = prediction_feature.uri # pass to feature mining service + end + + feature_generation_uri = @@feature_generation_default unless feature_generation_uri = params[:feature_generation_uri] - task = OpenTox::Task.create("Create lazar model",url_for('/lazar',:full)) do |task| + raise OpenTox::NotFoundError.new "No feature #{prediction_feature.uri} in dataset #{params[:dataset_uri]}. 
(features: "+ + training_activities.features.inspect+")" unless training_activities.features and training_activities.features.include?(prediction_feature.uri) lazar = OpenTox::Model::Lazar.new - lazar.min_sim = params[:min_sim] if params[:min_sim] + lazar.min_sim = params[:min_sim].to_f if params[:min_sim] + lazar.nr_hits = true if params[:nr_hits] == "true" + + if prediction_feature.feature_type == "classification" + @training_classes = training_activities.accept_values(prediction_feature.uri).sort + @training_classes.each_with_index { |c,i| + lazar.value_map[i+1] = c # don't use '0': we must take the weighted mean later. + params[:value_map] = lazar.value_map + } + elsif prediction_feature.feature_type == "regression" + lazar.prediction_algorithm = "Neighbors.local_svm_regression" + end + task.progress 10 if params[:feature_dataset_uri] feature_dataset_uri = params[:feature_dataset_uri] @@ -66,31 +88,36 @@ post '/lazar/?' do if feature_generation_uri.match(/fminer/) lazar.feature_calculation_algorithm = "Substructure.match" else - halt 404, "External feature generation services not yet supported" + raise OpenTox::NotFoundError.new "External feature generation services not yet supported" end params[:subjectid] = @subjectid prediction_feature = OpenTox::Feature.find params[:prediction_feature], @subjectid if prediction_feature.feature_type == "regression" && feature_generation_uri.match(/fminer/) params[:feature_type] = "paths" end - feature_dataset_uri = OpenTox::Algorithm::Generic.new(feature_generation_uri).run(params).to_s + feature_dataset_uri = OpenTox::Algorithm::Generic.new(feature_generation_uri).run(params, OpenTox::SubTask.new(task,10,70)).to_s training_features = OpenTox::Dataset.new(feature_dataset_uri) end training_features.load_all(@subjectid) - halt 404, "Dataset #{feature_dataset_uri} not found." if training_features.nil? + raise OpenTox::NotFoundError.new "Dataset #{feature_dataset_uri} not found." if training_features.nil? 
# sorted features for index lookups lazar.features = training_features.features.sort if prediction_feature.feature_type == "regression" and lazar.feature_calculation_algorithm != "Substructure.match" training_features.data_entries.each do |compound,entry| - lazar.fingerprints[compound] = [] unless lazar.fingerprints[compound] + lazar.fingerprints[compound] = {} unless lazar.fingerprints[compound] entry.keys.each do |feature| if lazar.feature_calculation_algorithm == "Substructure.match" if training_features.features[feature] smarts = training_features.features[feature][OT.smarts] - lazar.fingerprints[compound] << smarts + #lazar.fingerprints[compound] << smarts + if params[:nr_hits] + lazar.fingerprints[compound][smarts] = entry[feature].flatten.first + else + lazar.fingerprints[compound][smarts] = 1 + end unless lazar.features.include? smarts lazar.features << smarts lazar.p_values[smarts] = training_features.features[feature][OT.pValue] @@ -102,7 +129,8 @@ post '/lazar/?' do when "classification" # fingerprints are sets if entry[feature].flatten.size == 1 - lazar.fingerprints[compound] << feature if entry[feature].flatten.first.to_s.match(TRUE_REGEXP) + #lazar.fingerprints[compound] << feature if entry[feature].flatten.first.to_s.match(TRUE_REGEXP) + lazar.fingerprints[compound][feature] = entry[feature].flatten.first if entry[feature].flatten.first.to_s.match(TRUE_REGEXP) lazar.features << feature unless lazar.features.include? feature else LOGGER.warn "More than one entry (#{entry[feature].inspect}) for compound #{compound}, feature #{feature}" @@ -111,6 +139,7 @@ post '/lazar/?' do # fingerprints are arrays if entry[feature].flatten.size == 1 lazar.fingerprints[compound][lazar.features.index(feature)] = entry[feature].flatten.first + #lazar.fingerprints[compound][feature] = entry[feature].flatten.first else LOGGER.warn "More than one entry (#{entry[feature].inspect}) for compound #{compound}, feature #{feature}" end @@ -118,45 +147,58 @@ post '/lazar/?' 
do end end end - - @training_classes = training_activities.feature_classes(prediction_feature.uri, @subjectid) if prediction_feature.feature_type == "classification" - lazar.prediction_algorithm = "Neighbors.local_svm_regression" if prediction_feature.feature_type == "regression" - - training_activities.data_entries.each do |compound,entry| - lazar.activities[compound] = [] unless lazar.activities[compound] - unless entry[prediction_feature.uri].empty? - entry[prediction_feature.uri].each do |value| - if prediction_feature.feature_type == "classification" - case value.to_s - when "true" - lazar.activities[compound] << true - when "false" - lazar.activities[compound] << false - when /#{@training_classes.last}/ - lazar.activities[compound] << true - when /#{@training_classes.first}/ - lazar.activities[compound] << false - else - LOGGER.warn "Unknown class \"#{value.to_s}\"." - end - elsif prediction_feature.feature_type == "regression" - halt 404, "0 values not allowed in training dataset. log10 is calculated internally." if value.to_f == 0 - lazar.activities[compound] << value.to_f + task.progress 80 + + # AM: allow settings override by user + lazar.prediction_algorithm = "Neighbors.#{params[:prediction_algorithm]}" unless params[:prediction_algorithm].nil? + if prediction_feature.feature_type == "regression" + lazar.transform["class"] = "Log10" if lazar.transform["class"] == "NOP" + end + lazar.transform["class"] = params[:activity_transform] unless params[:activity_transform].nil? 
+ lazar.prop_kernel = true if (params[:local_svm_kernel] == "propositionalized" || params[:prediction_algorithm] == "local_mlr_prop") + lazar.conf_stdev = false + lazar.conf_stdev = true if params[:conf_stdev] == "true" + + # AM: Feed Data using Transformations + if prediction_feature.feature_type == "regression" + transformed_acts = [] + training_activities.data_entries.each do |compound,entry| + transformed_acts.concat entry[prediction_feature.uri] unless entry[prediction_feature.uri].empty? + end + transformer = eval "OpenTox::Algorithm::Transform::#{lazar.transform["class"]}.new(transformed_acts)" + transformed_acts = transformer.values + lazar.transform["offset"] = transformer.offset + t_count=0 + training_activities.data_entries.each do |compound,entry| + lazar.activities[compound] = [] unless lazar.activities[compound] + unless entry[prediction_feature.uri].empty? + entry[prediction_feature.uri].each do |value| + lazar.activities[compound] << transformed_acts[t_count].to_s + t_count+=1 + end + end + end + elsif prediction_feature.feature_type == "classification" + training_activities.data_entries.each do |compound,entry| + lazar.activities[compound] = [] unless lazar.activities[compound] + unless entry[prediction_feature.uri].empty? 
+ entry[prediction_feature.uri].each do |value| + lazar.activities[compound] << lazar.value_map.invert[value] # insert mapped values, not originals end end end end + task.progress 90 lazar.metadata[DC.title] = "lazar model for #{URI.decode(File.basename(prediction_feature.uri))}" - # TODO: fix dependentVariable lazar.metadata[OT.dependentVariables] = prediction_feature.uri lazar.metadata[OT.trainingDataset] = dataset_uri lazar.metadata[OT.featureDataset] = feature_dataset_uri - - if prediction_feature.feature_type == "classification" - lazar.metadata[RDF.type] = [OTA.ClassificationLazySingleTarget] - elsif prediction_feature.feature_type == "regression" - lazar.metadata[RDF.type] = [OTA.RegressionLazySingleTarget] + case training_activities.feature_type(@subjectid) + when "classification" + lazar.metadata[RDF.type] = [OT.Model, OTA.ClassificationLazySingleTarget] + when "regression" + lazar.metadata[RDF.type] = [OT.Model, OTA.RegressionLazySingleTarget] end lazar.metadata[OT.parameters] = [ @@ -170,7 +212,7 @@ post '/lazar/?' do model_uri end response['Content-Type'] = 'text/uri-list' - halt 503,task.uri+"\n" if task.status == "Cancelled" + raise OpenTox::ServiceUnavailableError.newtask.uri+"\n" if task.status == "Cancelled" halt 202,task.uri end diff --git a/libfminer b/libfminer -Subproject d51f5e784ce0f5b7ef1c47c52ea55d1c874ec2e +Subproject 07679a625a7acad864fd3abd80654a1a0a61e69 diff --git a/openbabel.rb b/openbabel.rb index 1644455..463663e 100644 --- a/openbabel.rb +++ b/openbabel.rb @@ -49,7 +49,7 @@ get '/openbabel/:property' do response['Content-Type'] = 'application/rdf+xml' algorithm.to_rdfxml else - halt 404, "Unknown OpenBabel descriptor #{params[:property]}." + raise OpenTox::NotFoundError.new "Unknown OpenBabel descriptor #{params[:property]}." 
end end @@ -89,7 +89,7 @@ post '/openbabel/:property' do descriptor = OpenBabel::OBDescriptor.find_type(params[:property]) descriptor.predict(obmol).to_s else - halt 404, "Cannot calculate property #{params[:property]} with OpenBabel" + raise OpenTox::NotFoundError.new "Cannot calculate property #{params[:property]} with OpenBabel" end end @@ -143,6 +143,6 @@ post '/openbabel' do result_dataset.uri end response['Content-Type'] = 'text/uri-list' - halt 503,task.uri+"\n" if task.status == "Cancelled" + raise OpenTox::ServiceUnavailableError.newtask.uri+"\n" if task.status == "Cancelled" halt 202,task.uri.to_s+"\n" end diff --git a/similarity.rb b/similarity.rb index 060bd2b..faf43f9 100644 --- a/similarity.rb +++ b/similarity.rb @@ -2,25 +2,25 @@ require File.join(File.dirname(__FILE__),'dataset.rb') helpers do def find -# + charges are dropped -uri = uri(params[:splat].first.gsub(/(InChI.*) (.*)/,'\1+\2')) # reinsert dropped '+' signs in InChIs -halt 404, "Dataset \"#{uri}\" not found." unless @set = Dataset.find(uri) + # + charges are dropped + uri = uri(params[:splat].first.gsub(/(InChI.*) (.*)/,'\1+\2')) # reinsert dropped '+' signs in InChIs + raise OpenTox::NotFoundError.new "Dataset \"#{uri}\" not found." unless @set = Dataset.find(uri) end def uri(name) -name = URI.encode(name) -uri = File.join Dataset.base_uri, name -end + name = URI.encode(name) + uri = File.join Dataset.base_uri, name + end end get '/tanimoto/dataset/*/dataset/*/?' do -find -@set.tanimoto(uri(params[:splat][1])) + find + @set.tanimoto(uri(params[:splat][1])) end get '/weighted_tanimoto/dataset/*/dataset/*/?' do -find -@set.weighted_tanimoto(uri(params[:splat][1])) + find + @set.weighted_tanimoto(uri(params[:splat][1])) end |