From 57dca303f3c936c60e8113b1cfddac5f1436dbef Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 27 Jul 2015 20:56:22 +0200 Subject: reasonable query performace for data_entries --- lib/algorithm.rb | 12 +++++ lib/bbrc.rb | 125 +++++++++++++++++++++++------------------------ lib/fminer.rb | 24 ++++----- lib/lazar.rb | 46 ++++++++++------- lib/opentox-algorithm.rb | 1 + lib/similarity.rb | 4 +- 6 files changed, 119 insertions(+), 93 deletions(-) create mode 100644 lib/algorithm.rb diff --git a/lib/algorithm.rb b/lib/algorithm.rb new file mode 100644 index 0000000..1b97584 --- /dev/null +++ b/lib/algorithm.rb @@ -0,0 +1,12 @@ +module OpenTox + + module Algorithm + + def self.run algorithm, arg1, arg2 #parameters + klass,method = algorithm.split('.') + Object.const_get(klass).send(method, arg1,arg2) + end + + end +end + diff --git a/lib/bbrc.rb b/lib/bbrc.rb index 2c2b8a2..1c04a6d 100644 --- a/lib/bbrc.rb +++ b/lib/bbrc.rb @@ -15,6 +15,9 @@ module OpenTox # - get_target Set to "true" to obtain target variable as feature # @return [text/uri-list] Task URI def self.bbrc params + + table_of_elements = [ +"H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo"] @fminer=OpenTox::Algorithm::Fminer.new @fminer.check_params(params,5) @@ -23,14 +26,13 @@ module OpenTox @bbrc = Bbrc::Bbrc.new @bbrc.Reset - if @fminer.prediction_feature.feature_type == "regression" + if @fminer.prediction_feature.numeric @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations! else bad_request_error "No accept values for "\ "dataset '#{@fminer.training_dataset.id}' and "\ - "feature '#{@fminer.prediction_feature.id}'" unless - @fminer.prediction_feature.accept_values - value_map=@fminer.prediction_feature.value_map + "feature '#{@fminer.prediction_feature.id}'" unless @fminer.prediction_feature.accept_values + value_map = @fminer.prediction_feature.accept_values.each_index.inject({}) { |h,idx| h[idx+1]=@fminer.prediction_feature.accept_values[idx]; h } end @bbrc.SetMinfreq(@fminer.minfreq) @bbrc.SetType(1) if params[:feature_type] == "paths" @@ -38,16 +40,18 @@ module OpenTox @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance] @bbrc.SetConsoleOut(false) - feature_dataset = OpenTox::CalculatedDataset.new - feature_dataset.title = "BBRC representatives" - feature_dataset.creator = __FILE__ - feature_dataset.parameters = [ - { "title" => "dataset_id", "paramValue" => params[:dataset].id }, - { "title" => "prediction_feature_id", "paramValue" => params[:prediction_feature].id }, - { "title" => "min_frequency", "paramValue" => @fminer.minfreq }, - { "title" => "nr_hits", "paramValue" => (params[:nr_hits] == "true" ? "true" : "false") }, - { "title" => "backbone", "paramValue" => (params[:backbone] == "false" ? "false" : "true") } - ] + feature_dataset = FminerDataset.new( + :training_dataset_id => params[:dataset].id, + :training_algorithm => "#{self.to_s}.bbrc", + :training_feature_id => params[:prediction_feature].id , + :training_parameters => { + :min_frequency => @fminer.minfreq, + :nr_hits => (params[:nr_hits] == "true" ? "true" : "false"), + :backbone => (params[:backbone] == "false" ? "false" : "true") + } + + ) + feature_dataset.compounds = params[:dataset].compounds @fminer.compounds = [] @fminer.db_class_sizes = Array.new # AM: effect @@ -59,27 +63,32 @@ module OpenTox g_median=@fminer.all_activities.values.to_scale.median #task.progress 10 - step_width = 80 / @bbrc.GetNoRootNodes().to_f - #features_smarts = Set.new + #step_width = 80 / @bbrc.GetNoRootNodes().to_f features = [] - data_entries = Array.new(params[:dataset].compounds.size) {[]} + feature_ids = [] + matches = {} $logger.debug "Setup: #{Time.now-time}" time = Time.now ftime = 0 + itime = 0 + rtime = 0 # run @bbrc - - fminer_results = {} - (0 .. @bbrc.GetNoRootNodes()-1).each do |j| results = @bbrc.MineRoot(j) - #task.progress 10+step_width*(j+1) results.each do |result| + rt = Time.now f = YAML.load(result)[0] - smarts = f[0] - p_value = f[1] + smarts = f.shift + # convert fminer representation into a more human readable format + smarts.gsub!(%r{\[#(\d+)&(\w)\]}) do + element = table_of_elements[$1.to_i-1] + $2 == "a" ? element.downcase : element + end + p_value = f.shift +=begin if (!@bbrc.GetRegression) id_arrs = f[2..-1].flatten max = OpenTox::Algorithm::Fminer.effect(f[2..-1].reverse, @fminer.db_class_sizes) # f needs reversal for bbrc @@ -99,62 +108,52 @@ module OpenTox effect = 'deactivating' end end +=end + rtime += Time.now - rt ft = Time.now - feature = OpenTox::Feature.find_or_create_by({ - "title" => smarts.dup, - "numeric" => true, - "substructure" => true, - "smarts" => smarts.dup, + feature = OpenTox::FminerSmarts.find_or_create_by({ + "smarts" => smarts, "pValue" => p_value.to_f.abs.round(5), - "effect" => effect, - "parameters" => [ - { "title" => "dataset_id", "paramValue" => params[:dataset].id }, - { "title" => "prediction_feature_id", "paramValue" => params[:prediction_feature].id } - ] + #"effect" => effect, + "dataset_id" => feature_dataset.id }) - features << feature + feature_dataset.add_feature feature + feature_ids << feature.id.to_s ftime += Time.now - ft - id_arrs.each { |id_count_hash| - id=id_count_hash.keys[0].to_i - count=id_count_hash.values[0].to_i - fminer_results[@fminer.compounds[id]] || fminer_results[@fminer.compounds[id]] = {} - compound_idx = params[:dataset].compounds.index @fminer.compounds[id] - feature_idx = features.index feature - data_entries[compound_idx] ||= [] - if params[:nr_hits] == "true" - fminer_results[@fminer.compounds[id]][feature] = count - data_entries[compound_idx][feature_idx] = count - else - fminer_results[@fminer.compounds[id]][feature] = 1 - data_entries[compound_idx][feature_idx] = 1 + it = Time.now + f.first.each do |id_count_hash| + id_count_hash.each do |id,count| + matches[@fminer.compounds[id].id.to_s] = {feature.id.to_s => count} end - } + end + itime += Time.now - it - end # end of - end # feature parsing + end + end - $logger.debug "Fminer: #{Time.now-time} (find/create Features: #{ftime})" + $logger.debug "Fminer: #{Time.now-time} (read: #{rtime}, iterate: #{itime}, find/create Features: #{ftime})" time = Time.now - # convert nil entries to 0 - data_entries.collect! do |r| - if r.empty? - Array.new(features.size,0) - else - r[features.size-1] = 0 if r.size < features.size # grow array to match feature size - r.collect!{|c| c.nil? ? 0 : c} # remove nils + n = 0 + feature_dataset.compound_ids.each do |cid| + cid = cid.to_s + feature_dataset.feature_ids.each_with_index do |fid,i| + fid = fid.to_s + unless matches[cid] and matches[cid][fid]# fminer returns only matches + count = 0 + else + count = matches[cid][fid] + end + feature_dataset.bulk << [cid,fid,count] + n +=1 end end - feature_dataset.compounds = params[:dataset].compounds - feature_dataset.features = features - feature_dataset.data_entries = data_entries - $logger.debug "Prepare save: #{Time.now-time}" time = Time.now - #File.open("kazius.json","w+"){|f| f.puts feature_dataset.inspect} + feature_dataset.bulk_write feature_dataset.save $logger.debug "Save: #{Time.now-time}" diff --git a/lib/fminer.rb b/lib/fminer.rb index c26fe2f..d708d5f 100644 --- a/lib/fminer.rb +++ b/lib/fminer.rb @@ -97,11 +97,12 @@ module OpenTox # @param[Hash] Maps dependent variable values to Integers def add_fminer_data(fminer_instance, value_map) + # TODO store warnings in dataset id=1 @training_dataset.compounds.each do |compound| compound_activities = @training_dataset.values(compound, @prediction_feature) begin - if @prediction_feature.feature_type == "classification" + if @prediction_feature.nominal compound_activities = compound_activities.to_scale.mode else compound_activities = compound_activities.to_scale.median @@ -113,7 +114,7 @@ module OpenTox if compound_activities.nil? $logger.warn "No activity for '#{compound.inchi}' and feature '#{@prediction_feature.title}'" else - if @prediction_feature.feature_type == "classification" + if @prediction_feature.nominal activity= value_map.invert[compound_activities].to_i # activities are mapped to 1..n bad_request_error "activity could not be mapped, is #{compound_activities} (#{compound_activities.class}), available: #{value_map.values} (#{value_map.values.collect{|k| k.class}})" if activity<1 @db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect @@ -180,15 +181,16 @@ module OpenTox # @param [Integer] per-mil value # return [Integer] min-frequency def min_frequency(training_dataset,prediction_feature,per_mil) - nr_labeled_cmpds=0 - f_idx=training_dataset.features.index prediction_feature - training_dataset.compounds.each_with_index { |cmpd, c_idx| - if ( training_dataset.data_entries[c_idx] ) - unless training_dataset.data_entries[c_idx][f_idx].nil? - nr_labeled_cmpds += 1 - end - end - } + nr_labeled_cmpds = DataEntry.where(dataset_id: training_dataset.id, feature_id: prediction_feature.id).in(compound_id: training_dataset.compound_ids).count + #nr_labeled_cmpds=0 + #f_idx=training_dataset.features.index prediction_feature + #training_dataset.compounds.each_with_index { |cmpd, c_idx| + #if ( training_dataset.data_entries[c_idx] ) + #unless training_dataset.data_entries[c_idx][f_idx].nil? + #nr_labeled_cmpds += 1 + #end + #end + #} minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST minfreq = 2 unless minfreq > 2 Integer (minfreq) diff --git a/lib/lazar.rb b/lib/lazar.rb index d6a6f47..2c83f38 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -25,13 +25,14 @@ module OpenTox # algorithms field :feature_generation, type: String field :feature_calculation_algorithm, type: String - field :prediction_algorithm, type: Symbol - field :similarity_algorithm, type: Symbol + field :prediction_algorithm, type: String + field :similarity_algorithm, type: String # prediction features field :prediction_feature_id, type: BSON::ObjectId field :predicted_value_id, type: BSON::ObjectId field :predicted_variables, type: Array # parameters + field :nr_hits, type: Boolean field :min_sim, type: Float field :propositionalized, type:Boolean field :min_train_performance, type: Float @@ -46,7 +47,7 @@ module OpenTox # Prepare lazar object (includes graph mining) # @param[Array] lazar parameters as strings # @param[Hash] REST parameters, as input by user - def self.create training_dataset, feature_dataset, prediction_feature=nil, params={} + def self.create training_dataset, feature_dataset, prediction_feature=nil, nr_hits=false, params={} lazar = OpenTox::Model::Lazar.new @@ -79,22 +80,20 @@ module OpenTox lazar.prediction_algorithm =~ /majority_vote/ ? lazar.propositionalized = false : lazar.propositionalized = true lazar.min_sim = params[:min_sim].to_f if params[:min_sim] and params[:min_sim].numeric? - lazar.nr_hits = params[:nr_hits] if params[:nr_hits] - lazar.feature_generation = feature_dataset.creator + lazar.nr_hits = nr_hits + lazar.feature_generation = feature_dataset.training_algorithm #lazar.parameters << {"title" => "feature_generation_uri", "paramValue" => params[:feature_generation_uri]} - # TODO insert algorithm into feature dataset - # TODO store algorithms in mongodb? if lazar.feature_generation =~ /fminer|bbrc|last/ - if (lazar[:nr_hits] == "true") - lazar.feature_calculation_algorithm = "smarts_count" + if lazar[:nr_hits] + lazar.feature_calculation_algorithm = "OpenTox::Algorithm::Descriptor.smarts_count" else - lazar.feature_calculation_algorithm = "smarts_match" + lazar.feature_calculation_algorithm = "OpenTox::Algorithm::Descriptor.smarts_match" end - lazar.similarity_algorithm = "tanimoto" + lazar.similarity_algorithm = "OpenTox::Algorithm::Similarity.tanimoto" lazar.min_sim = 0.3 unless lazar.min_sim elsif lazar.feature_generation =~/descriptor/ or lazar.feature_generation.nil? # cosine similartiy is default (e.g. used when no fetature_generation_uri is given and a feature_dataset_uri is provided instead) - lazar.similarity_algorithm = "cosine" + lazar.similarity_algorithm = "OpenTox::Algorithm::Similarity.cosine" lazar.min_sim = 0.7 unless lazar.min_sim else bad_request_error "unkown feature generation method #{lazar.feature_generation}" @@ -116,7 +115,7 @@ module OpenTox time = Time.now # prepare prediction dataset - prediction_dataset = OpenTox::Dataset.new + prediction_dataset = LazarPrediction.new prediction_feature = OpenTox::Feature.find prediction_feature_id prediction_dataset.title = "Lazar prediction for #{prediction_feature.title}", prediction_dataset.creator = __FILE__, @@ -145,7 +144,11 @@ module OpenTox $logger.debug "Setup: #{Time.now-time}" time = Time.now - @query_fingerprint = OpenTox::Algorithm::Descriptor.send( feature_calculation_algorithm, compounds, @feature_dataset.features.collect{|f| f["title"]} ) + # TODO: remove eval + #p ("#{feature_calculation_algorithm}(#{compounds}, #{@feature_dataset.features.collect{|f| f.smarts}})") + #@query_fingerprint = eval("#{feature_calculation_algorithm}(#{compounds}, #{@feature_dataset.features.collect{|f| f.smarts}})") + @query_fingerprint = Algorithm.run(feature_calculation_algorithm, compounds, @feature_dataset.features.collect{|f| f.smarts} ) + #p @query_fingerprint $logger.debug "Fingerprint calculation: #{Time.now-time}" time = Time.now @@ -173,14 +176,21 @@ module OpenTox # find neighbors neighbors = [] - @feature_dataset.data_entries.each_with_index do |fingerprint, i| - - sim = OpenTox::Algorithm::Similarity.send(similarity_algorithm,fingerprint, @query_fingerprint[c]) + #@feature_dataset.data_entries.each_with_index do |fingerprint, i| + @feature_dataset.compounds.each_with_index do |compound, i| + #p compound + #p @feature_dataset.features.size + fingerprint = @feature_dataset.feature_values(compound) + #fingerprint = @feature_dataset.features(compound) + #p fingerprint + + sim = Algorithm.run(similarity_algorithm,[fingerprint, @query_fingerprint[c]]) # TODO fix for multi feature datasets neighbors << [@feature_dataset.compounds[i],@training_dataset.data_entries[i].first,sim] if sim > self.min_sim end + #p neighbors - prediction = OpenTox::Algorithm::Classification.send(prediction_algorithm, neighbors) + prediction = Algorithm.run(prediction_algorithm, neighbors) $logger.debug "Prediction: #{Time.now-time}" time = Time.now diff --git a/lib/opentox-algorithm.rb b/lib/opentox-algorithm.rb index 46cd474..d768cfd 100644 --- a/lib/opentox-algorithm.rb +++ b/lib/opentox-algorithm.rb @@ -13,6 +13,7 @@ require_relative '../libfminer/liblast/last' # require_relative '../last-utils/lu.rb' #Dir[File.join(File.dirname(__FILE__),"*.rb")].each{ |f| require_relative f} +require_relative "algorithm.rb" require_relative "descriptor.rb" require_relative "fminer.rb" require_relative "lazar.rb" diff --git a/lib/similarity.rb b/lib/similarity.rb index 5f02577..59c86ff 100644 --- a/lib/similarity.rb +++ b/lib/similarity.rb @@ -14,7 +14,9 @@ module OpenTox # @param [Array] a fingerprints of first compound # @param [Array] b fingerprints of second compound # @return [Float] Tanimoto similarity - def self.tanimoto(a,b) + def self.tanimoto(fingerprints) + a = fingerprints.first + b = fingerprints.last common_p_sum = 0.0 all_p_sum = 0.0 size = [ a.size, b.size ].min -- cgit v1.2.3