diff options
author | Christoph Helma <helma@in-silico.ch> | 2010-11-04 11:15:59 +0100 |
---|---|---|
committer | Christoph Helma <helma@in-silico.ch> | 2010-11-04 11:15:59 +0100 |
commit | e72bba4cdaa6fd68d62b567e21be730a49963207 (patch) | |
tree | 22c088db8e266b420bd7055edad778bd2b8ce375 | |
parent | 19dd7247be22e637419d79406041a4548b169c2c (diff) |
fminer with annotations; commit before merging Andreas' new version
-rw-r--r-- | fminer.rb | 194 | ||||
-rw-r--r-- | lazar.rb | 143 | ||||
m--------- | libfminer | 0 | ||||
-rw-r--r-- | smarts.rb | 2 |
4 files changed, 147 insertions, 192 deletions
@@ -2,159 +2,157 @@ ENV['FMINER_SMARTS'] = 'true' ENV['FMINER_NO_AROMATIC'] = 'true' ENV['FMINER_PVALUES'] = 'true' @@fminer = Bbrc::Bbrc.new +@@fminer.SetMinfreq(5) get '/fminer/?' do - owl = OpenTox::OwlSerializer.create 'Algorithm', url_for('/fminer',:full) - owl.annotate 'title',"fminer" - owl.annotate 'creator',"http://github.com/amaunz/fminer2" -# owl.set_data( { -# "parameters" => [ -# { "title" => "Dataset URI", "paramScope" => "mandatory", "paramValue" => "dataset_uri" }, -# { "title" => "Feature URI for dependent variable", "paramScope" => "mandatory", "paramValue" => "feature_uri" } -# ] -# } ) - -# owl.parameters = { -# "Dataset URI" => { :scope => "mandatory", :value => "dataset_uri" }, -# "Feature URI for dependent variable" => { :scope => "mandatory", :value => "feature_uri" } -# } - rdf = owl.rdf - #File.open('public/fminer.owl', 'w') {|f| f.print rdf} + + metadata = { + DC.title => 'fminer', + DC.identifier => url_for("",:full), + DC.creator => "andreas@maunz.de, helma@in-silico.ch", + DC.contributor => "vorgrimmlerdavid@gmx.de", + OT.isA => OTA.PatternMiningSupervised + } + + parameters = [ + { DC.description => "Dataset URI", OT.paramScope => "mandatory", OT.title => "dataset_uri" }, + { DC.description => "Feature URI for dependent variable", OT.paramScope => "mandatory", OT.title => "prediction_feature" } + ] + + s = OpenTox::Serializer::Owl.new + s.add_algorithm(url_for('/fminer',:full),metadata,parameters) response['Content-Type'] = 'application/rdf+xml' - rdf + s.to_rdfxml + end post '/fminer/?' do halt 404, "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil? - halt 404, "Please submit a feature_uri." unless params[:feature_uri] and !params[:feature_uri].nil? 
- LOGGER.debug "Dataset: " + params[:dataset_uri] - LOGGER.debug "Endpoint: " + params[:feature_uri] - feature_uri = params[:feature_uri] - begin - LOGGER.debug "Retrieving #{params[:dataset_uri]}" - training_dataset = OpenTox::Dataset.find "#{params[:dataset_uri]}" - rescue - LOGGER.error "Dataset #{params[:dataset_uri]} not found" - halt 404, "Dataset #{params[:dataset_uri]} not found." if training_dataset.nil? - end - halt 404, "No feature #{params[:feature_uri]} in dataset #{params[:dataset_uri]}" unless training_dataset.features and training_dataset.features.include?(params[:feature_uri]) + halt 404, "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil? + prediction_feature = params[:prediction_feature] - task_uri = OpenTox::Task.as_task("Mine features", url_for('/fminer',:full)) do + training_dataset = OpenTox::Dataset.new "#{params[:dataset_uri]}" + training_dataset.load_all + halt 404, "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless training_dataset.features and training_dataset.features.include?(params[:prediction_feature]) + + task_uri = OpenTox::Task.as_task("Mining BBRC features", url_for('/fminer',:full)) do feature_dataset = OpenTox::Dataset.new - title = "BBRC representatives for " + training_dataset.title - feature_dataset.title = title - feature_dataset.creator = url_for('/fminer',:full) - bbrc_uri = url_for("/fminer#BBRC_representative",:full) - feature_dataset.features << bbrc_uri + feature_dataset.add_metadata({ + DC.title => "BBRC representatives for " + training_dataset.metadata[DC.title], + DC.creator => url_for('/fminer',:full), + OT.hasSource => url_for('/fminer', :full), + }) + feature_dataset.add_parameters({ + "dataset_uri" => params[:dataset_uri], + "prediction_feature" => params[:prediction_feature] + }) + feature_dataset.save id = 1 # fminer start id is not 0 compounds = [] - + nr_active=0 + nr_inactive=0 g_hash = Hash.new# DV: for effect 
calculation in regression part + @@fminer.Reset - #@@fminer.SetChisqSig(0.99) - LOGGER.debug "Fminer: initialising ..." - training_dataset.data.each do |c,features| + training_dataset.data_entries.each do |compound,entry| begin - smiles = OpenTox::Compound.new(:uri => c.to_s).smiles + smiles = OpenTox::Compound.new(compound.to_s).smiles rescue - LOGGER.warn "No resource for #{c.to_s}" + LOGGER.warn "No resource for #{compound.to_s}" next end if smiles == '' or smiles.nil? - LOGGER.warn "Cannot find smiles for #{c.to_s}." - else - feature_dataset.compounds << c.to_s - features.each do |feature| - act = feature[feature_uri] - if act.nil? - LOGGER.warn "No #{feature_uri} activiity for #{c.to_s}." + LOGGER.warn "Cannot find smiles for #{compound.to_s}." + next + end + entry.each do |feature,values| + values.each do |value| + if value.nil? + LOGGER.warn "No #{feature} activiity for #{compound.to_s}." else - case act.to_s + case value.to_s when "true" - #LOGGER.debug id.to_s + ' "' + smiles +'"' + "\t" + true.to_s + nr_active += 1 activity = 1 when "false" - #LOGGER.debug id.to_s + ' "' + smiles +'"' + "\t" + false.to_s + nr_inactive += 1 activity = 0 else - # AM: add quantitative activity - activity = act.to_f + activity = value.to_f @@fminer.SetRegression(true) end - compounds[id] = c.to_s begin @@fminer.AddCompound(smiles,id) @@fminer.AddActivity(activity, id) g_hash[id]=activity # DV: insert global information + compounds[id] = compound + id += 1 rescue - LOGGER.warn "Could not add " + smiles + "\t" + act.to_s + " to fminer" + LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer" end - end - end - id += 1 - end - end - g_array=g_hash.values # DV: calculation of global median for effect calculation - g_median=OpenTox::Utils.median(g_array) - minfreq = (0.02*id).round - @@fminer.SetMinfreq(minfreq) - LOGGER.debug "Fminer: initialised with #{id} compounds, minimum frequency #{minfreq}" + end + end + end + end - raise "no compounds" if 
compounds.size==0 + g_array=g_hash.values # DV: calculation of global median for effect calculation + g_median=OpenTox::Algorithm.median(g_array) + + # TODO read from params + raise "No compounds in dataset #{training_dataset.uri}" if compounds.size==0 - values = {} + features = Set.new # run @@fminer - LOGGER.debug "Fminer: mining ..." (0 .. @@fminer.GetNoRootNodes()-1).each do |j| + results = @@fminer.MineRoot(j) results.each do |result| f = YAML.load(result)[0] smarts = f[0] p_value = f[1] - # AM: f[3] missing on regression + if (!@@fminer.GetRegression) ids = f[2] + f[3] - if f[2].size > f[3].size + if f[2].size.to_f/ids.size > nr_active.to_f/(nr_active+nr_inactive) effect = 'activating' else effect = 'deactivating' end else #regression part ids = f[2] - # DV: effect calculation - f_arr=Array.new - f[2].each do |id| - f_arr.push(g_hash[id]) - end - f_median=OpenTox::Utils.median(f_arr) - if g_median >= f_median - effect = 'activating' - else - effect = 'deactivating' - end - end - - tuple = { - url_for('/fminer#smarts',:full) => smarts, - url_for('/fminer#p_value',:full) => p_value.to_f, - url_for('/fminer#effect',:full) => effect - } - #LOGGER.debug "#{f[0]}\t#{f[1]}\t#{effect}" - ids.each do |id| - feature_dataset.data[compounds[id]] = [] unless feature_dataset.data[compounds[id]] - feature_dataset.data[compounds[id]] << {bbrc_uri => tuple} - end + # DV: effect calculation + f_arr=Array.new + f[2].each do |id| + f_arr.push(g_hash[id]) + end + f_median=OpenTox::Algorithm.median(f_arr) + if g_median >= f_median + effect = 'activating' + else + effect = 'deactivating' + end + end + + feature_uri = File.join feature_dataset.uri,"feature","bbrc", features.size.to_s + unless features.include? 
smarts + features << smarts + # TODO insert correct ontology entries + metadata = { + OT.hasSource => feature_dataset.uri, + OT.smarts => smarts, + OT.p_value => p_value.to_f, + OT.effect => effect } + feature_dataset.add_feature feature_uri, metadata + end + ids.each { |id| feature_dataset.add(compounds[id], feature_uri, true)} end end - - uri = feature_dataset.save - LOGGER.debug "Fminer finished, dataset #{uri} created." - uri + feature_dataset.save + feature_dataset.uri end - LOGGER.debug "Fimer task started: "+task_uri.to_s response['Content-Type'] = 'text/uri-list' halt 202,task_uri.to_s+"\n" end @@ -1,55 +1,35 @@ get '/lazar/?' do - uri = url_for('/lazar',:full) - owl = OpenTox::OwlSerializer.create 'Algorithm', uri - owl.annotation_property uri, DC.creator, "helma@in-silico.ch", XSD.string - owl.annotation_property uri, DC.contributor, "andreas@maunz.de", XSD.string - owl.annotation_property uri, DC.title, "lazar", XSD.string - owl.annotation_property uri, DC.source, "http://github.com/helma/opentox-algorithm", XSD.anyUri - owl.object_property uri, OT.parameters, File.join(uri,"dataset_uri"), XSD.anyUri - owl.object_property uri, OT.parameters, File.join(uri,"prediction_feature"), XSD.anyUri - owl.object_property uri, OT.parameters, File.join(uri,"feature_generation_uri"), XSD.anyUri - response['Content-Type'] = 'application/rdf+xml' - owl.rdf -end - -get '/lazar/prediction_feature?' do - uri = url_for('/lazar/prediction_feature',:full) - owl = OpenTox::OwlSerializer.create 'Parameter', uri - owl.annotation_property uri, DC.description, "URI of the feature to be predicted", XSD.string - owl.annotation_property uri, OT.paramScope, "mandatory", XSD.string - response['Content-Type'] = 'application/rdf+xml' - owl.rdf -end -get '/lazar/feature_generation_uri?' 
do - uri = url_for('/lazar/feature_generation_uri',:full) - owl = OpenTox::OwlSerializer.create 'Parameter', uri - owl.annotation_property uri, DC.description, "URI of the feature_generation_algorithm", XSD.string - owl.annotation_property uri, OT.paramScope, "mandatory", XSD.string + metadata = { + DC.title => 'lazar', + DC.identifier => url_for("",:full), + DC.creator => "helma@in-silico.ch, andreas@maunz.de", + DC.contributor => "vorgrimmlerdavid@gmx.de", + OT.isA => OTA.ClassificationLazySingleTarget + } + + parameters = [ + { DC.description => "Dataset URI", OT.paramScope => "mandatory", OT.title => "dataset_uri" }, + { DC.description => "Feature URI for dependent variable", OT.paramScope => "mandatory", OT.title => "prediction_feature" }, + { DC.description => "URI of feature genration service", OT.paramScope => "mandatory", OT.title => "feature_generation_uri" } + ] + + s = OpenTox::Serializer::Owl.new + s.add_algorithm(url_for('/lazar',:full),metadata,parameters) response['Content-Type'] = 'application/rdf+xml' - owl.rdf -end + s.to_rdfxml -get '/lazar/dataset_uri?' do - uri = url_for('/lazar/dataset_uri',:full) - owl = OpenTox::OwlSerializer.create 'Parameter', uri - owl.annotation_property uri, DC.description, "URI of the training dataset", XSD.string - owl.annotation_property uri, OT.paramScope, "mandatory", XSD.string - response['Content-Type'] = 'application/rdf+xml' - owl.rdf end post '/lazar/?' do # create a model - LOGGER.debug "Dataset: '" + params[:dataset_uri].to_s + "'" - LOGGER.debug "Endpoint: '" + params[:prediction_feature].to_s + "'" - LOGGER.debug "Feature generation: '" + params[:feature_generation_uri].to_s + "'" dataset_uri = "#{params[:dataset_uri]}" begin - training_activities = OpenTox::Dataset.find(dataset_uri) - rescue - halt 404, "Dataset #{dataset_uri} not found" + training_activities = OpenTox::Dataset.new(dataset_uri) + training_activities.load_all + rescue => e + halt 404, "Dataset #{dataset_uri} not found (#{e.inspect})." 
end halt 404, "No prediction_feature parameter." unless params[:prediction_feature] @@ -61,69 +41,46 @@ post '/lazar/?' do # create a model task_uri = OpenTox::Task.as_task("Create lazar model",url_for('/lazar',:full)) do |task| # create features - LOGGER.debug "Starting fminer" - params[:feature_uri] = params[:prediction_feature] - fminer_task_uri = OpenTox::Algorithm::Fminer.create_feature_dataset(params) - fminer_task = OpenTox::Task.find(fminer_task_uri) - fminer_task.wait_for_completion - raise "fminer failed" unless fminer_task.completed? - - LOGGER.debug "Fminer finished #{Time.now}" - feature_dataset_uri = fminer_task.resultURI.to_s - training_features = OpenTox::Dataset.find(feature_dataset_uri) + feature_dataset_uri = OpenTox::Algorithm::Fminer.new.run(params).to_s + + training_features = OpenTox::Dataset.new(feature_dataset_uri) + training_features.load_all halt 404, "Dataset #{feature_dataset_uri} not found." if training_features.nil? + lazar = OpenTox::Model::Lazar.new - lazar.trainingDataset = dataset_uri - lazar.feature_dataset_uri = feature_dataset_uri - halt 404, "More than one descriptor type" unless training_features.features.size == 1 - bbrc = training_features.features.first - training_features.data.each do |compound,features| - lazar.fingerprints[compound] = [] unless lazar.fingerprints[compound] - features.each do |feature| - tuple = feature[bbrc] - if tuple - smarts =nil; p_value = nil; effect = nil - tuple.each do |k,v| - case k - when /fminer#smarts/ - smarts = v - lazar.features << smarts - lazar.fingerprints[compound] << smarts - when /fminer#p_value/ - p_value = v - when /fminer#effect/ - effect = v - end - end - lazar.p_values[smarts] = p_value - lazar.effects[smarts] = effect - end - end - end - activities = {} - classification = true - training_activities.data.each do |compound,features| + # TODO: dataset method for iterating over data entries + training_features.data_entries.each do |compound,entry| + lazar.fingerprints[compound] = 
[] unless lazar.fingerprints[compound] + entry.keys.each do |feature| + # TODO fix URI + fminer_uri = File.join CONFIG[:services]["opentox-algorithm"], "fminer" + smarts = training_features.features[feature]["#{fminer_uri}#smarts"] + lazar.fingerprints[compound] << smarts + unless lazar.features.include? smarts + lazar.features << smarts + lazar.p_values[smarts] = training_features.features[feature]["#{fminer_uri}#p_value"] + lazar.effects[smarts] = training_features.features[feature]["#{fminer_uri}#effect"] + end + end + lazar.activities[compound] = [] unless lazar.activities[compound] - features.each do |feature| - case feature[params[:prediction_feature]].to_s + training_activities.data_entries[compound][params[:prediction_feature]].each do |value| + case value.to_s when "true" lazar.activities[compound] << true when "false" lazar.activities[compound] << false - # AM: handle quantitative activity values of features else - lazar.activities[compound] << feature[params[:prediction_feature]].to_f - classification = false + lazar.activities[compound] << value.to_f + lazar.type = "regression" end end - end - # TODO: insert regression - if classification - lazar.dependentVariables = params[:prediction_feature]+"_lazar_classification" - else - lazar.dependentVariables = params[:prediction_feature]+"_lazar_regression" - end + end + + lazar.metadata[OT.dependentVariables] = params[:prediction_feature] + lazar.metadata[OT.trainingDataset] = dataset_uri + lazar.metadata[OT.featureDataset] = feature_dataset_uri model_uri = lazar.save LOGGER.info model_uri + " created #{Time.now}" diff --git a/libfminer b/libfminer -Subproject 5a97d006e0ccfc48e53d5f24842a898ec9e912e +Subproject e955cc6b24d577d7187e5660716ee69d12174a8 @@ -1,3 +1,3 @@ get '/match/compound/*/smarts/*/?' do - "#{OpenTox::Compound.new(:inchi => params[:splat][0]).match?(params[:splat][1])}" + "#{OpenTox::Compound.from_inchi(params[:splat][0]).match?(params[:splat][1])}" end |