From f2e90040c0c39370d2ba227ce086c58f47dd0d67 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 21 Jul 2015 10:51:45 +0200 Subject: intermediary commit before switching to generalised storage model --- lib/bbrc.rb | 284 ++++++++++++++++++++++---------------------------- lib/classification.rb | 56 ++++++++++ lib/descriptor.rb | 39 ++++--- lib/fminer.rb | 12 +-- lib/last.rb | 2 +- lib/lazar.rb | 16 ++- 6 files changed, 222 insertions(+), 187 deletions(-) create mode 100644 lib/classification.rb diff --git a/lib/bbrc.rb b/lib/bbrc.rb index 40de186..2c2b8a2 100644 --- a/lib/bbrc.rb +++ b/lib/bbrc.rb @@ -18,178 +18,148 @@ module OpenTox @fminer=OpenTox::Algorithm::Fminer.new @fminer.check_params(params,5) - - # TODO introduce task again - #task = OpenTox::Task.run("Mining BBRC features", __FILE__ ) do |task| - time = Time.now + time = Time.now - @bbrc = Bbrc::Bbrc.new - @bbrc.Reset - if @fminer.prediction_feature.feature_type == "regression" - @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations! - else - bad_request_error "No accept values for "\ - "dataset '#{@fminer.training_dataset.id}' and "\ - "feature '#{@fminer.prediction_feature.id}'" unless - @fminer.prediction_feature.accept_values - value_map=@fminer.prediction_feature.value_map - end - @bbrc.SetMinfreq(@fminer.minfreq) - @bbrc.SetType(1) if params[:feature_type] == "paths" - @bbrc.SetBackbone(false) if params[:backbone] == "false" - @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance] - @bbrc.SetConsoleOut(false) + @bbrc = Bbrc::Bbrc.new + @bbrc.Reset + if @fminer.prediction_feature.feature_type == "regression" + @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations! + else + bad_request_error "No accept values for "\ + "dataset '#{@fminer.training_dataset.id}' and "\ + "feature '#{@fminer.prediction_feature.id}'" unless + @fminer.prediction_feature.accept_values + value_map=@fminer.prediction_feature.value_map + end + @bbrc.SetMinfreq(@fminer.minfreq) + @bbrc.SetType(1) if params[:feature_type] == "paths" + @bbrc.SetBackbone(false) if params[:backbone] == "false" + @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance] + @bbrc.SetConsoleOut(false) - feature_dataset = OpenTox::Dataset.new - feature_dataset.title = "BBRC representatives" - feature_dataset.creator = __FILE__ - feature_dataset.parameters = [ - { "title" => "dataset_id", "paramValue" => params[:dataset].id }, - { "title" => "prediction_feature_id", "paramValue" => params[:prediction_feature].id }, - { "title" => "min_frequency", "paramValue" => @fminer.minfreq }, - { "title" => "nr_hits", "paramValue" => (params[:nr_hits] == "true" ? "true" : "false") }, - { "title" => "backbone", "paramValue" => (params[:backbone] == "false" ? "false" : "true") } - ] + feature_dataset = OpenTox::CalculatedDataset.new + feature_dataset.title = "BBRC representatives" + feature_dataset.creator = __FILE__ + feature_dataset.parameters = [ + { "title" => "dataset_id", "paramValue" => params[:dataset].id }, + { "title" => "prediction_feature_id", "paramValue" => params[:prediction_feature].id }, + { "title" => "min_frequency", "paramValue" => @fminer.minfreq }, + { "title" => "nr_hits", "paramValue" => (params[:nr_hits] == "true" ? "true" : "false") }, + { "title" => "backbone", "paramValue" => (params[:backbone] == "false" ? "false" : "true") } + ] - @fminer.compounds = [] - @fminer.db_class_sizes = Array.new # AM: effect - @fminer.all_activities = Hash.new # DV: for effect calculation in regression part - @fminer.smi = [] # AM LAST: needed for matching the patterns back - - # Add data to fminer - @fminer.add_fminer_data(@bbrc, value_map) - g_median=@fminer.all_activities.values.to_scale.median + @fminer.compounds = [] + @fminer.db_class_sizes = Array.new # AM: effect + @fminer.all_activities = Hash.new # DV: for effect calculation in regression part + @fminer.smi = [] # AM LAST: needed for matching the patterns back + + # Add data to fminer + @fminer.add_fminer_data(@bbrc, value_map) + g_median=@fminer.all_activities.values.to_scale.median - #task.progress 10 - step_width = 80 / @bbrc.GetNoRootNodes().to_f - #features_smarts = Set.new - features = [] - data_entries = Array.new(params[:dataset].compounds.size) {[]} + #task.progress 10 + step_width = 80 / @bbrc.GetNoRootNodes().to_f + #features_smarts = Set.new + features = [] + data_entries = Array.new(params[:dataset].compounds.size) {[]} - puts "Setup: #{Time.now-time}" - time = Time.now - ftime = 0 - - # run @bbrc - - fminer_results = {} + $logger.debug "Setup: #{Time.now-time}" + time = Time.now + ftime = 0 + + # run @bbrc + + fminer_results = {} - (0 .. @bbrc.GetNoRootNodes()-1).each do |j| - results = @bbrc.MineRoot(j) - #task.progress 10+step_width*(j+1) - results.each do |result| - f = YAML.load(result)[0] - smarts = f[0] - p_value = f[1] - - if (!@bbrc.GetRegression) - id_arrs = f[2..-1].flatten - max = OpenTox::Algorithm::Fminer.effect(f[2..-1].reverse, @fminer.db_class_sizes) # f needs reversal for bbrc - effect = max+1 - else #regression part - id_arrs = f[2] - # DV: effect calculation - f_arr=Array.new - f[2].each do |id| - id=id.keys[0] # extract id from hit count hash - f_arr.push(@fminer.all_activities[id]) - end - f_median=f_arr.to_scale.median - if g_median >= f_median - effect = 'activating' - else - effect = 'deactivating' - end + (0 .. @bbrc.GetNoRootNodes()-1).each do |j| + results = @bbrc.MineRoot(j) + #task.progress 10+step_width*(j+1) + results.each do |result| + f = YAML.load(result)[0] + smarts = f[0] + p_value = f[1] + + if (!@bbrc.GetRegression) + id_arrs = f[2..-1].flatten + max = OpenTox::Algorithm::Fminer.effect(f[2..-1].reverse, @fminer.db_class_sizes) # f needs reversal for bbrc + effect = max+1 + else #regression part + id_arrs = f[2] + # DV: effect calculation + f_arr=Array.new + f[2].each do |id| + id=id.keys[0] # extract id from hit count hash + f_arr.push(@fminer.all_activities[id]) end - - ft = Time.now - feature = OpenTox::Feature.find_or_create_by({ - "title" => smarts.dup, - "numeric" => true, - "substructure" => true, - "smarts" => smarts.dup, - "pValue" => p_value.to_f.abs.round(5), - "effect" => effect, - "parameters" => [ - { "title" => "dataset_id", "paramValue" => params[:dataset].id }, - { "title" => "prediction_feature_id", "paramValue" => params[:prediction_feature].id } - ] - }) - features << feature - ftime += Time.now - ft + f_median=f_arr.to_scale.median + if g_median >= f_median + effect = 'activating' + else + effect = 'deactivating' + end + end + + ft = Time.now + feature = OpenTox::Feature.find_or_create_by({ + "title" => smarts.dup, + "numeric" => true, + "substructure" => true, + "smarts" => smarts.dup, + "pValue" => p_value.to_f.abs.round(5), + "effect" => effect, + "parameters" => [ + { "title" => "dataset_id", "paramValue" => params[:dataset].id }, + { "title" => "prediction_feature_id", "paramValue" => params[:prediction_feature].id } + ] + }) + features << feature + ftime += Time.now - ft - id_arrs.each { |id_count_hash| - id=id_count_hash.keys[0].to_i - count=id_count_hash.values[0].to_i - fminer_results[@fminer.compounds[id]] || fminer_results[@fminer.compounds[id]] = {} - compound_idx = params[:dataset].compounds.index @fminer.compounds[id] - feature_idx = features.index feature - data_entries[compound_idx] ||= [] - if params[:nr_hits] == "true" - fminer_results[@fminer.compounds[id]][feature] = count - data_entries[compound_idx][feature_idx] = count - else - fminer_results[@fminer.compounds[id]][feature] = 1 - data_entries[compound_idx][feature_idx] = 1 - end - } - - end # end of - end # feature parsing + id_arrs.each { |id_count_hash| + id=id_count_hash.keys[0].to_i + count=id_count_hash.values[0].to_i + fminer_results[@fminer.compounds[id]] || fminer_results[@fminer.compounds[id]] = {} + compound_idx = params[:dataset].compounds.index @fminer.compounds[id] + feature_idx = features.index feature + data_entries[compound_idx] ||= [] + if params[:nr_hits] == "true" + fminer_results[@fminer.compounds[id]][feature] = count + data_entries[compound_idx][feature_idx] = count + else + fminer_results[@fminer.compounds[id]][feature] = 1 + data_entries[compound_idx][feature_idx] = 1 + end + } + + end # end of + end # feature parsing - puts "Fminer: #{Time.now-time} (find/create Features: #{ftime})" - time = Time.now + $logger.debug "Fminer: #{Time.now-time} (find/create Features: #{ftime})" + time = Time.now - # convert nil entries to 0 - data_entries.collect! do |r| - if r.empty? - Array.new(features.size,0) - else - r[features.size-1] = 0 if r.size < features.size # grow array to match feature size - r.collect!{|c| c.nil? ? 0 : c} # remove nils - end + # convert nil entries to 0 + data_entries.collect! do |r| + if r.empty? + Array.new(features.size,0) + else + r[features.size-1] = 0 if r.size < features.size # grow array to match feature size + r.collect!{|c| c.nil? ? 0 : c} # remove nils end + end -=begin - # This part increases runtime by a factor of ~65 - # TODO: check if any information is lost due to simplification - fminer_compounds = @fminer.training_dataset.compounds - prediction_feature_idx = @fminer.training_dataset.features.index @fminer.prediction_feature - prediction_feature_all_acts = fminer_compounds.each_with_index.collect { |c,idx| - @fminer.training_dataset.data_entries[idx][prediction_feature_idx] - } - fminer_noact_compounds = fminer_compounds - @fminer.compounds - - feature_dataset.features = features - feature_dataset.features = [ @fminer.prediction_feature ] + feature_dataset.features if params[:get_target] == "true" - feature_dataset.compounds = fminer_compounds - fminer_compounds.each_with_index { |c,idx| - # TODO: reenable option - #if (params[:get_target] == "true") - #row = row + [ prediction_feature_all_acts[idx] ] - #end - features.each { |f| - v = fminer_results[c][f] if fminer_results[c] - unless fminer_noact_compounds.include? c - v = 0 if v.nil? - end - feature_dataset.add_data_entry c, f, v.to_i - } - } -=end - feature_dataset.compounds = params[:dataset].compounds - feature_dataset.features = features - feature_dataset.data_entries = data_entries + feature_dataset.compounds = params[:dataset].compounds + feature_dataset.features = features + feature_dataset.data_entries = data_entries - puts "Prepare save: #{Time.now-time}" - time = Time.now - feature_dataset.save + $logger.debug "Prepare save: #{Time.now-time}" + time = Time.now + #File.open("kazius.json","w+"){|f| f.puts feature_dataset.inspect} + feature_dataset.save - puts "Save: #{Time.now-time}" - feature_dataset - - #end + $logger.debug "Save: #{Time.now-time}" + feature_dataset + end end end diff --git a/lib/classification.rb b/lib/classification.rb new file mode 100644 index 0000000..f6c9b11 --- /dev/null +++ b/lib/classification.rb @@ -0,0 +1,56 @@ +module OpenTox + module Algorithm + + class Classification + + # Classification with majority vote from neighbors weighted by similarity + # @param [Hash] params Keys `:activities, :sims, :value_map` are required + # @return [Numeric] A prediction value. + def self.weighted_majority_vote(neighbors) + + return {:prediction => nil, :confidence => nil} if neighbors.empty? + + neighbor_contribution = 0.0 + confidence_sum = 0.0 + confidence = 0.0 + prediction = nil + + $logger.debug "Weighted Majority Vote Classification." + + values = neighbors.collect{|n| n[1]}.uniq + neighbors.each do |neighbor| + neighbor_weight = neighbor[2] + activity = values.index(neighbor[1]) + 1 # map values to integers > 1 + neighbor_contribution += activity * neighbor_weight + if values.size == 2 # AM: provide compat to binary classification: 1=>false 2=>true + case activity + when 1 + confidence_sum -= neighbor_weight + when 2 + confidence_sum += neighbor_weight + end + else + confidence_sum += neighbor_weight + end + end + if values.size == 2 + if confidence_sum >= 0.0 + prediction = values[1] + elsif confidence_sum < 0.0 + prediction = values[0] + end + else + prediction = (neighbor_contribution/confidence_sum).round # AM: new multinomial prediction + end + + $logger.debug "Prediction: '" + prediction.to_s + "'." unless prediction.nil? + confidence = (confidence_sum/neighbors.size).abs + $logger.debug "Confidence: '" + confidence.to_s + "'." unless prediction.nil? + return {:prediction => prediction, :confidence => confidence.abs} + end + + end + + end +end + diff --git a/lib/descriptor.rb b/lib/descriptor.rb index d862a41..f556df7 100644 --- a/lib/descriptor.rb +++ b/lib/descriptor.rb @@ -63,7 +63,6 @@ module OpenTox obmol = OpenBabel::OBMol.new obconversion.set_in_format('inchi') smarts_pattern = OpenBabel::OBSmartsPattern.new - #fingerprint = {} smarts = [smarts] unless smarts.is_a? Array fingerprint = Array.new(compounds.size){Array.new(smarts.size,false)} compounds.each_with_index do |compound,c| @@ -87,6 +86,8 @@ module OpenTox def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS compounds = parse compounds + dataset = OpenTox::CalculatedDataset.new + dataset.compounds = compounds des = {} descriptors.each do |d| lib, descriptor = d.split(".",2) @@ -95,13 +96,27 @@ module OpenTox des[lib] << descriptor end result = {} - des.each do |lib,d| - send(lib, compounds, d).each do |compound,values| - result[compound] ||= {} - result[compound].merge! values - end + features = [] + data_entries = Array.new(compounds.size){Array.new(des.size)} + n = 0 + des.each do |lib,descriptors| + features += descriptors.collect do |d| + OpenTox::Feature.find_or_create_by( + :title => "#{lib}.#{d}", + :creator => __FILE__ + ) + end + r = send(lib, compounds, descriptors) + #p r + r.each_with_index do |values,i| + data_entries[i][n] = values + end + n += 1 end - result + #dataset.features = features + #dataset.data_entries = data_entries + #dataset + data_entries end def self.openbabel compounds, descriptors @@ -111,12 +126,11 @@ module OpenTox obmol = OpenBabel::OBMol.new obconversion = OpenBabel::OBConversion.new obconversion.set_in_format 'inchi' - fingerprint = {} - compounds.each do |compound| + fingerprint = Array.new(compounds.size){Array.new(obdescriptors.size)} + compounds.each_with_index do |compound,c| obconversion.read_string obmol, compound.inchi - fingerprint[compound] = {} - obdescriptors.each_with_index do |descriptor,i| - fingerprint[compound]["Openbabel."+descriptors[i]] = fix_value(descriptor.predict(obmol)) + obdescriptors.each_with_index do |descriptor,d| + fingerprint[c][d] = fix_value(descriptor.predict(obmol)) end end fingerprint @@ -238,6 +252,7 @@ module OpenTox end def self.parse compounds + p compounds case compounds.class.to_s when "OpenTox::Compound" compounds = [compounds] diff --git a/lib/fminer.rb b/lib/fminer.rb index 59ee224..c26fe2f 100644 --- a/lib/fminer.rb +++ b/lib/fminer.rb @@ -33,8 +33,12 @@ module OpenTox resource_not_found_error "No feature '#{params[:prediction_feature]}' in dataset '#{params[:dataset]}'" unless @training_dataset.features.include?( params[:prediction_feature] ) unless params[:min_frequency].nil? + # set minfreq directly + if params[:min_frequency].numeric? + @minfreq=params[:min_frequency].to_i + $logger.debug "min_frequency #{@minfreq}" # check for percentage - if params[:min_frequency].include? "pc" + elsif params[:min_frequency].include? "pc" per_mil=params[:min_frequency].gsub(/pc/,"") if per_mil.numeric? per_mil = per_mil.to_i * 10 @@ -49,14 +53,8 @@ module OpenTox else bad_request=true end - # set minfreq directly else - if params[:min_frequency].numeric? - @minfreq=params[:min_frequency].to_i - $logger.debug "min_frequency #{@minfreq}" - else bad_request=true - end end bad_request_error "Minimum frequency must be integer [n], or a percentage [n]pc, or a per-mil [n]pm , with n greater 0" if bad_request end diff --git a/lib/last.rb b/lib/last.rb index 944d95e..3828c82 100644 --- a/lib/last.rb +++ b/lib/last.rb @@ -36,7 +36,7 @@ module OpenTox @last.SetConsoleOut(false) - feature_dataset = OpenTox::Dataset.new + feature_dataset = OpenTox::CalculatedDataset.new feature_dataset["title"] = "LAST representatives for #{@fminer.training_dataset.title}", feature_dataset.creator = __FILE__ feature_dataset.parameters = [ diff --git a/lib/lazar.rb b/lib/lazar.rb index 4a59c01..d6a6f47 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -46,13 +46,14 @@ module OpenTox # Prepare lazar object (includes graph mining) # @param[Array] lazar parameters as strings # @param[Hash] REST parameters, as input by user - def self.create feature_dataset, prediction_feature=nil, params={} + def self.create training_dataset, feature_dataset, prediction_feature=nil, params={} lazar = OpenTox::Model::Lazar.new bad_request_error "No features found in feature dataset #{feature_dataset.id}." if feature_dataset.features.empty? lazar.feature_dataset_id = feature_dataset.id - @training_dataset = OpenTox::Dataset.find(feature_dataset.parameters.select{|p| p["title"] == "dataset_id"}.first["paramValue"]) + @training_dataset = training_dataset + #@training_dataset = OpenTox::Dataset.find(feature_dataset.parameters.select{|p| p["title"] == "dataset_id"}.first["paramValue"]) bad_request_error "Training dataset compounds do not match feature dataset compounds. Please ensure that they are in the same order." unless @training_dataset.compounds == feature_dataset.compounds lazar.training_dataset_id = @training_dataset.id @@ -141,19 +142,17 @@ module OpenTox bad_request_error "Please provide one of the parameters: :compound, :compounds, :dataset" end - puts "Setup: #{Time.now-time}" + $logger.debug "Setup: #{Time.now-time}" time = Time.now @query_fingerprint = OpenTox::Algorithm::Descriptor.send( feature_calculation_algorithm, compounds, @feature_dataset.features.collect{|f| f["title"]} ) - puts "Fingerprint calculation: #{Time.now-time}" + $logger.debug "Fingerprint calculation: #{Time.now-time}" time = Time.now # AM: transform to cosine space min_sim = (min_sim.to_f*2.0-1.0).to_s if similarity_algorithm =~ /cosine/ - p compounds.size - i = 0 compounds.each_with_index do |compound,c| $logger.debug "predict compound #{c+1}/#{compounds.size} #{compound.inchi}" @@ -172,9 +171,6 @@ module OpenTox #mtf.transform # - puts "Transform: #{Time.now-time}" - time = Time.now - # find neighbors neighbors = [] @feature_dataset.data_entries.each_with_index do |fingerprint, i| @@ -186,7 +182,7 @@ module OpenTox prediction = OpenTox::Algorithm::Classification.send(prediction_algorithm, neighbors) - puts "Prediction: #{Time.now-time}" + $logger.debug "Prediction: #{Time.now-time}" time = Time.now # AM: transform to original space (TODO) -- cgit v1.2.3