summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2015-07-27 20:56:22 +0200
committerChristoph Helma <helma@in-silico.ch>2015-07-27 20:56:22 +0200
commit57dca303f3c936c60e8113b1cfddac5f1436dbef (patch)
tree4533f715b284589c6f6126fe71173c753f55c0f3
parentf2e90040c0c39370d2ba227ce086c58f47dd0d67 (diff)
reasonable query performance for data_entries
-rw-r--r--lib/algorithm.rb12
-rw-r--r--lib/bbrc.rb125
-rw-r--r--lib/fminer.rb24
-rw-r--r--lib/lazar.rb46
-rw-r--r--lib/opentox-algorithm.rb1
-rw-r--r--lib/similarity.rb4
6 files changed, 119 insertions, 93 deletions
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
new file mode 100644
index 0000000..1b97584
--- /dev/null
+++ b/lib/algorithm.rb
@@ -0,0 +1,12 @@
+module OpenTox
+ # Generic dispatcher for algorithms that are referenced by "Namespace::Class.method" strings.
+ module Algorithm
+ # Calls the class method named in algorithm (e.g. "OpenTox::Algorithm::Similarity.tanimoto") and returns its result.
+ def self.run algorithm, *args # variadic: callers in lazar.rb pass one or two arguments after the algorithm name
+ klass, method = algorithm.split('.') # "A::B.meth" => ["A::B", "meth"]
+ Object.const_get(klass).send(method, *args) # NOTE(review): unbounded send - algorithm must never come from untrusted input
+ end
+
+ end
+end
+
diff --git a/lib/bbrc.rb b/lib/bbrc.rb
index 2c2b8a2..1c04a6d 100644
--- a/lib/bbrc.rb
+++ b/lib/bbrc.rb
@@ -15,6 +15,9 @@ module OpenTox
# - get_target Set to "true" to obtain target variable as feature
# @return [text/uri-list] Task URI
def self.bbrc params
+
+ table_of_elements = [
+"H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo"]
@fminer=OpenTox::Algorithm::Fminer.new
@fminer.check_params(params,5)
@@ -23,14 +26,13 @@ module OpenTox
@bbrc = Bbrc::Bbrc.new
@bbrc.Reset
- if @fminer.prediction_feature.feature_type == "regression"
+ if @fminer.prediction_feature.numeric
@bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations!
else
bad_request_error "No accept values for "\
"dataset '#{@fminer.training_dataset.id}' and "\
- "feature '#{@fminer.prediction_feature.id}'" unless
- @fminer.prediction_feature.accept_values
- value_map=@fminer.prediction_feature.value_map
+ "feature '#{@fminer.prediction_feature.id}'" unless @fminer.prediction_feature.accept_values
+ value_map = @fminer.prediction_feature.accept_values.each_index.inject({}) { |h,idx| h[idx+1]=@fminer.prediction_feature.accept_values[idx]; h }
end
@bbrc.SetMinfreq(@fminer.minfreq)
@bbrc.SetType(1) if params[:feature_type] == "paths"
@@ -38,16 +40,18 @@ module OpenTox
@bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance]
@bbrc.SetConsoleOut(false)
- feature_dataset = OpenTox::CalculatedDataset.new
- feature_dataset.title = "BBRC representatives"
- feature_dataset.creator = __FILE__
- feature_dataset.parameters = [
- { "title" => "dataset_id", "paramValue" => params[:dataset].id },
- { "title" => "prediction_feature_id", "paramValue" => params[:prediction_feature].id },
- { "title" => "min_frequency", "paramValue" => @fminer.minfreq },
- { "title" => "nr_hits", "paramValue" => (params[:nr_hits] == "true" ? "true" : "false") },
- { "title" => "backbone", "paramValue" => (params[:backbone] == "false" ? "false" : "true") }
- ]
+ feature_dataset = FminerDataset.new(
+ :training_dataset_id => params[:dataset].id,
+ :training_algorithm => "#{self.to_s}.bbrc",
+ :training_feature_id => params[:prediction_feature].id ,
+ :training_parameters => {
+ :min_frequency => @fminer.minfreq,
+ :nr_hits => (params[:nr_hits] == "true" ? "true" : "false"),
+ :backbone => (params[:backbone] == "false" ? "false" : "true")
+ }
+
+ )
+ feature_dataset.compounds = params[:dataset].compounds
@fminer.compounds = []
@fminer.db_class_sizes = Array.new # AM: effect
@@ -59,27 +63,32 @@ module OpenTox
g_median=@fminer.all_activities.values.to_scale.median
#task.progress 10
- step_width = 80 / @bbrc.GetNoRootNodes().to_f
- #features_smarts = Set.new
+ #step_width = 80 / @bbrc.GetNoRootNodes().to_f
features = []
- data_entries = Array.new(params[:dataset].compounds.size) {[]}
+ feature_ids = []
+ matches = {}
$logger.debug "Setup: #{Time.now-time}"
time = Time.now
ftime = 0
+ itime = 0
+ rtime = 0
# run @bbrc
-
- fminer_results = {}
-
(0 .. @bbrc.GetNoRootNodes()-1).each do |j|
results = @bbrc.MineRoot(j)
- #task.progress 10+step_width*(j+1)
results.each do |result|
+ rt = Time.now
f = YAML.load(result)[0]
- smarts = f[0]
- p_value = f[1]
+ smarts = f.shift
+ # convert fminer representation into a more human readable format
+ smarts.gsub!(%r{\[#(\d+)&(\w)\]}) do
+ element = table_of_elements[$1.to_i-1]
+ $2 == "a" ? element.downcase : element
+ end
+ p_value = f.shift
+=begin
if (!@bbrc.GetRegression)
id_arrs = f[2..-1].flatten
max = OpenTox::Algorithm::Fminer.effect(f[2..-1].reverse, @fminer.db_class_sizes) # f needs reversal for bbrc
@@ -99,62 +108,52 @@ module OpenTox
effect = 'deactivating'
end
end
+=end
+ rtime += Time.now - rt
ft = Time.now
- feature = OpenTox::Feature.find_or_create_by({
- "title" => smarts.dup,
- "numeric" => true,
- "substructure" => true,
- "smarts" => smarts.dup,
+ feature = OpenTox::FminerSmarts.find_or_create_by({
+ "smarts" => smarts,
"pValue" => p_value.to_f.abs.round(5),
- "effect" => effect,
- "parameters" => [
- { "title" => "dataset_id", "paramValue" => params[:dataset].id },
- { "title" => "prediction_feature_id", "paramValue" => params[:prediction_feature].id }
- ]
+ #"effect" => effect,
+ "dataset_id" => feature_dataset.id
})
- features << feature
+ feature_dataset.add_feature feature
+ feature_ids << feature.id.to_s
ftime += Time.now - ft
- id_arrs.each { |id_count_hash|
- id=id_count_hash.keys[0].to_i
- count=id_count_hash.values[0].to_i
- fminer_results[@fminer.compounds[id]] || fminer_results[@fminer.compounds[id]] = {}
- compound_idx = params[:dataset].compounds.index @fminer.compounds[id]
- feature_idx = features.index feature
- data_entries[compound_idx] ||= []
- if params[:nr_hits] == "true"
- fminer_results[@fminer.compounds[id]][feature] = count
- data_entries[compound_idx][feature_idx] = count
- else
- fminer_results[@fminer.compounds[id]][feature] = 1
- data_entries[compound_idx][feature_idx] = 1
+ it = Time.now
+ f.first.each do |id_count_hash|
+ id_count_hash.each do |id,count|
+ matches[@fminer.compounds[id].id.to_s] = {feature.id.to_s => count}
end
- }
+ end
+ itime += Time.now - it
- end # end of
- end # feature parsing
+ end
+ end
- $logger.debug "Fminer: #{Time.now-time} (find/create Features: #{ftime})"
+ $logger.debug "Fminer: #{Time.now-time} (read: #{rtime}, iterate: #{itime}, find/create Features: #{ftime})"
time = Time.now
- # convert nil entries to 0
- data_entries.collect! do |r|
- if r.empty?
- Array.new(features.size,0)
- else
- r[features.size-1] = 0 if r.size < features.size # grow array to match feature size
- r.collect!{|c| c.nil? ? 0 : c} # remove nils
+ n = 0
+ feature_dataset.compound_ids.each do |cid|
+ cid = cid.to_s
+ feature_dataset.feature_ids.each_with_index do |fid,i|
+ fid = fid.to_s
+ unless matches[cid] and matches[cid][fid]# fminer returns only matches
+ count = 0
+ else
+ count = matches[cid][fid]
+ end
+ feature_dataset.bulk << [cid,fid,count]
+ n +=1
end
end
- feature_dataset.compounds = params[:dataset].compounds
- feature_dataset.features = features
- feature_dataset.data_entries = data_entries
-
$logger.debug "Prepare save: #{Time.now-time}"
time = Time.now
- #File.open("kazius.json","w+"){|f| f.puts feature_dataset.inspect}
+ feature_dataset.bulk_write
feature_dataset.save
$logger.debug "Save: #{Time.now-time}"
diff --git a/lib/fminer.rb b/lib/fminer.rb
index c26fe2f..d708d5f 100644
--- a/lib/fminer.rb
+++ b/lib/fminer.rb
@@ -97,11 +97,12 @@ module OpenTox
# @param[Hash] Maps dependent variable values to Integers
def add_fminer_data(fminer_instance, value_map)
+ # TODO store warnings in dataset
id=1
@training_dataset.compounds.each do |compound|
compound_activities = @training_dataset.values(compound, @prediction_feature)
begin
- if @prediction_feature.feature_type == "classification"
+ if @prediction_feature.nominal
compound_activities = compound_activities.to_scale.mode
else
compound_activities = compound_activities.to_scale.median
@@ -113,7 +114,7 @@ module OpenTox
if compound_activities.nil?
$logger.warn "No activity for '#{compound.inchi}' and feature '#{@prediction_feature.title}'"
else
- if @prediction_feature.feature_type == "classification"
+ if @prediction_feature.nominal
activity= value_map.invert[compound_activities].to_i # activities are mapped to 1..n
bad_request_error "activity could not be mapped, is #{compound_activities} (#{compound_activities.class}), available: #{value_map.values} (#{value_map.values.collect{|k| k.class}})" if activity<1
@db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
@@ -180,15 +181,16 @@ module OpenTox
# @param [Integer] per-mil value
# return [Integer] min-frequency
def min_frequency(training_dataset,prediction_feature,per_mil)
- nr_labeled_cmpds=0
- f_idx=training_dataset.features.index prediction_feature
- training_dataset.compounds.each_with_index { |cmpd, c_idx|
- if ( training_dataset.data_entries[c_idx] )
- unless training_dataset.data_entries[c_idx][f_idx].nil?
- nr_labeled_cmpds += 1
- end
- end
- }
+ nr_labeled_cmpds = DataEntry.where(dataset_id: training_dataset.id, feature_id: prediction_feature.id).in(compound_id: training_dataset.compound_ids).count
+ #nr_labeled_cmpds=0
+ #f_idx=training_dataset.features.index prediction_feature
+ #training_dataset.compounds.each_with_index { |cmpd, c_idx|
+ #if ( training_dataset.data_entries[c_idx] )
+ #unless training_dataset.data_entries[c_idx][f_idx].nil?
+ #nr_labeled_cmpds += 1
+ #end
+ #end
+ #}
minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
minfreq = 2 unless minfreq > 2
Integer (minfreq)
diff --git a/lib/lazar.rb b/lib/lazar.rb
index d6a6f47..2c83f38 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -25,13 +25,14 @@ module OpenTox
# algorithms
field :feature_generation, type: String
field :feature_calculation_algorithm, type: String
- field :prediction_algorithm, type: Symbol
- field :similarity_algorithm, type: Symbol
+ field :prediction_algorithm, type: String
+ field :similarity_algorithm, type: String
# prediction features
field :prediction_feature_id, type: BSON::ObjectId
field :predicted_value_id, type: BSON::ObjectId
field :predicted_variables, type: Array
# parameters
+ field :nr_hits, type: Boolean
field :min_sim, type: Float
field :propositionalized, type:Boolean
field :min_train_performance, type: Float
@@ -46,7 +47,7 @@ module OpenTox
# Prepare lazar object (includes graph mining)
# @param[Array] lazar parameters as strings
# @param[Hash] REST parameters, as input by user
- def self.create training_dataset, feature_dataset, prediction_feature=nil, params={}
+ def self.create training_dataset, feature_dataset, prediction_feature=nil, nr_hits=false, params={}
lazar = OpenTox::Model::Lazar.new
@@ -79,22 +80,20 @@ module OpenTox
lazar.prediction_algorithm =~ /majority_vote/ ? lazar.propositionalized = false : lazar.propositionalized = true
lazar.min_sim = params[:min_sim].to_f if params[:min_sim] and params[:min_sim].numeric?
- lazar.nr_hits = params[:nr_hits] if params[:nr_hits]
- lazar.feature_generation = feature_dataset.creator
+ lazar.nr_hits = nr_hits
+ lazar.feature_generation = feature_dataset.training_algorithm
#lazar.parameters << {"title" => "feature_generation_uri", "paramValue" => params[:feature_generation_uri]}
- # TODO insert algorithm into feature dataset
- # TODO store algorithms in mongodb?
if lazar.feature_generation =~ /fminer|bbrc|last/
- if (lazar[:nr_hits] == "true")
- lazar.feature_calculation_algorithm = "smarts_count"
+ if lazar[:nr_hits]
+ lazar.feature_calculation_algorithm = "OpenTox::Algorithm::Descriptor.smarts_count"
else
- lazar.feature_calculation_algorithm = "smarts_match"
+ lazar.feature_calculation_algorithm = "OpenTox::Algorithm::Descriptor.smarts_match"
end
- lazar.similarity_algorithm = "tanimoto"
+ lazar.similarity_algorithm = "OpenTox::Algorithm::Similarity.tanimoto"
lazar.min_sim = 0.3 unless lazar.min_sim
elsif lazar.feature_generation =~/descriptor/ or lazar.feature_generation.nil?
# cosine similartiy is default (e.g. used when no fetature_generation_uri is given and a feature_dataset_uri is provided instead)
- lazar.similarity_algorithm = "cosine"
+ lazar.similarity_algorithm = "OpenTox::Algorithm::Similarity.cosine"
lazar.min_sim = 0.7 unless lazar.min_sim
else
bad_request_error "unkown feature generation method #{lazar.feature_generation}"
@@ -116,7 +115,7 @@ module OpenTox
time = Time.now
# prepare prediction dataset
- prediction_dataset = OpenTox::Dataset.new
+ prediction_dataset = LazarPrediction.new
prediction_feature = OpenTox::Feature.find prediction_feature_id
prediction_dataset.title = "Lazar prediction for #{prediction_feature.title}",
prediction_dataset.creator = __FILE__,
@@ -145,7 +144,11 @@ module OpenTox
$logger.debug "Setup: #{Time.now-time}"
time = Time.now
- @query_fingerprint = OpenTox::Algorithm::Descriptor.send( feature_calculation_algorithm, compounds, @feature_dataset.features.collect{|f| f["title"]} )
+ # TODO: remove eval
+ #p ("#{feature_calculation_algorithm}(#{compounds}, #{@feature_dataset.features.collect{|f| f.smarts}})")
+ #@query_fingerprint = eval("#{feature_calculation_algorithm}(#{compounds}, #{@feature_dataset.features.collect{|f| f.smarts}})")
+ @query_fingerprint = Algorithm.run(feature_calculation_algorithm, compounds, @feature_dataset.features.collect{|f| f.smarts} )
+ #p @query_fingerprint
$logger.debug "Fingerprint calculation: #{Time.now-time}"
time = Time.now
@@ -173,14 +176,21 @@ module OpenTox
# find neighbors
neighbors = []
- @feature_dataset.data_entries.each_with_index do |fingerprint, i|
-
- sim = OpenTox::Algorithm::Similarity.send(similarity_algorithm,fingerprint, @query_fingerprint[c])
+ #@feature_dataset.data_entries.each_with_index do |fingerprint, i|
+ @feature_dataset.compounds.each_with_index do |compound, i|
+ #p compound
+ #p @feature_dataset.features.size
+ fingerprint = @feature_dataset.feature_values(compound)
+ #fingerprint = @feature_dataset.features(compound)
+ #p fingerprint
+
+ sim = Algorithm.run(similarity_algorithm,[fingerprint, @query_fingerprint[c]])
# TODO fix for multi feature datasets
neighbors << [@feature_dataset.compounds[i],@training_dataset.data_entries[i].first,sim] if sim > self.min_sim
end
+ #p neighbors
- prediction = OpenTox::Algorithm::Classification.send(prediction_algorithm, neighbors)
+ prediction = Algorithm.run(prediction_algorithm, neighbors)
$logger.debug "Prediction: #{Time.now-time}"
time = Time.now
diff --git a/lib/opentox-algorithm.rb b/lib/opentox-algorithm.rb
index 46cd474..d768cfd 100644
--- a/lib/opentox-algorithm.rb
+++ b/lib/opentox-algorithm.rb
@@ -13,6 +13,7 @@ require_relative '../libfminer/liblast/last' #
require_relative '../last-utils/lu.rb'
#Dir[File.join(File.dirname(__FILE__),"*.rb")].each{ |f| require_relative f}
+require_relative "algorithm.rb"
require_relative "descriptor.rb"
require_relative "fminer.rb"
require_relative "lazar.rb"
diff --git a/lib/similarity.rb b/lib/similarity.rb
index 5f02577..59c86ff 100644
--- a/lib/similarity.rb
+++ b/lib/similarity.rb
@@ -14,7 +14,9 @@ module OpenTox
# @param [Array] a fingerprints of first compound
# @param [Array] b fingerprints of second compound
# @return [Float] Tanimoto similarity
- def self.tanimoto(a,b)
+ def self.tanimoto(fingerprints)
+ a = fingerprints.first
+ b = fingerprints.last
common_p_sum = 0.0
all_p_sum = 0.0
size = [ a.size, b.size ].min