summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2015-07-21 10:51:45 +0200
committerChristoph Helma <helma@in-silico.ch>2015-07-21 10:51:45 +0200
commitf2e90040c0c39370d2ba227ce086c58f47dd0d67 (patch)
tree024ec62155dc7048de27b8c12f0ec6ece1d54e19
parent4da6a50d9f73e12783e6baa1362c2f137deaebd9 (diff)
intermediary commit before switching to generalised storage model
-rw-r--r--lib/bbrc.rb284
-rw-r--r--lib/classification.rb56
-rw-r--r--lib/descriptor.rb39
-rw-r--r--lib/fminer.rb12
-rw-r--r--lib/last.rb2
-rw-r--r--lib/lazar.rb16
6 files changed, 222 insertions, 187 deletions
diff --git a/lib/bbrc.rb b/lib/bbrc.rb
index 40de186..2c2b8a2 100644
--- a/lib/bbrc.rb
+++ b/lib/bbrc.rb
@@ -18,178 +18,148 @@ module OpenTox
@fminer=OpenTox::Algorithm::Fminer.new
@fminer.check_params(params,5)
-
- # TODO introduce task again
- #task = OpenTox::Task.run("Mining BBRC features", __FILE__ ) do |task|
- time = Time.now
+ time = Time.now
- @bbrc = Bbrc::Bbrc.new
- @bbrc.Reset
- if @fminer.prediction_feature.feature_type == "regression"
- @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations!
- else
- bad_request_error "No accept values for "\
- "dataset '#{@fminer.training_dataset.id}' and "\
- "feature '#{@fminer.prediction_feature.id}'" unless
- @fminer.prediction_feature.accept_values
- value_map=@fminer.prediction_feature.value_map
- end
- @bbrc.SetMinfreq(@fminer.minfreq)
- @bbrc.SetType(1) if params[:feature_type] == "paths"
- @bbrc.SetBackbone(false) if params[:backbone] == "false"
- @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance]
- @bbrc.SetConsoleOut(false)
+ @bbrc = Bbrc::Bbrc.new
+ @bbrc.Reset
+ if @fminer.prediction_feature.feature_type == "regression"
+ @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations!
+ else
+ bad_request_error "No accept values for "\
+ "dataset '#{@fminer.training_dataset.id}' and "\
+ "feature '#{@fminer.prediction_feature.id}'" unless
+ @fminer.prediction_feature.accept_values
+ value_map=@fminer.prediction_feature.value_map
+ end
+ @bbrc.SetMinfreq(@fminer.minfreq)
+ @bbrc.SetType(1) if params[:feature_type] == "paths"
+ @bbrc.SetBackbone(false) if params[:backbone] == "false"
+ @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance]
+ @bbrc.SetConsoleOut(false)
- feature_dataset = OpenTox::Dataset.new
- feature_dataset.title = "BBRC representatives"
- feature_dataset.creator = __FILE__
- feature_dataset.parameters = [
- { "title" => "dataset_id", "paramValue" => params[:dataset].id },
- { "title" => "prediction_feature_id", "paramValue" => params[:prediction_feature].id },
- { "title" => "min_frequency", "paramValue" => @fminer.minfreq },
- { "title" => "nr_hits", "paramValue" => (params[:nr_hits] == "true" ? "true" : "false") },
- { "title" => "backbone", "paramValue" => (params[:backbone] == "false" ? "false" : "true") }
- ]
+ feature_dataset = OpenTox::CalculatedDataset.new
+ feature_dataset.title = "BBRC representatives"
+ feature_dataset.creator = __FILE__
+ feature_dataset.parameters = [
+ { "title" => "dataset_id", "paramValue" => params[:dataset].id },
+ { "title" => "prediction_feature_id", "paramValue" => params[:prediction_feature].id },
+ { "title" => "min_frequency", "paramValue" => @fminer.minfreq },
+ { "title" => "nr_hits", "paramValue" => (params[:nr_hits] == "true" ? "true" : "false") },
+ { "title" => "backbone", "paramValue" => (params[:backbone] == "false" ? "false" : "true") }
+ ]
- @fminer.compounds = []
- @fminer.db_class_sizes = Array.new # AM: effect
- @fminer.all_activities = Hash.new # DV: for effect calculation in regression part
- @fminer.smi = [] # AM LAST: needed for matching the patterns back
-
- # Add data to fminer
- @fminer.add_fminer_data(@bbrc, value_map)
- g_median=@fminer.all_activities.values.to_scale.median
+ @fminer.compounds = []
+ @fminer.db_class_sizes = Array.new # AM: effect
+ @fminer.all_activities = Hash.new # DV: for effect calculation in regression part
+ @fminer.smi = [] # AM LAST: needed for matching the patterns back
+
+ # Add data to fminer
+ @fminer.add_fminer_data(@bbrc, value_map)
+ g_median=@fminer.all_activities.values.to_scale.median
- #task.progress 10
- step_width = 80 / @bbrc.GetNoRootNodes().to_f
- #features_smarts = Set.new
- features = []
- data_entries = Array.new(params[:dataset].compounds.size) {[]}
+ #task.progress 10
+ step_width = 80 / @bbrc.GetNoRootNodes().to_f
+ #features_smarts = Set.new
+ features = []
+ data_entries = Array.new(params[:dataset].compounds.size) {[]}
- puts "Setup: #{Time.now-time}"
- time = Time.now
- ftime = 0
-
- # run @bbrc
-
- fminer_results = {}
+ $logger.debug "Setup: #{Time.now-time}"
+ time = Time.now
+ ftime = 0
+
+ # run @bbrc
+
+ fminer_results = {}
- (0 .. @bbrc.GetNoRootNodes()-1).each do |j|
- results = @bbrc.MineRoot(j)
- #task.progress 10+step_width*(j+1)
- results.each do |result|
- f = YAML.load(result)[0]
- smarts = f[0]
- p_value = f[1]
-
- if (!@bbrc.GetRegression)
- id_arrs = f[2..-1].flatten
- max = OpenTox::Algorithm::Fminer.effect(f[2..-1].reverse, @fminer.db_class_sizes) # f needs reversal for bbrc
- effect = max+1
- else #regression part
- id_arrs = f[2]
- # DV: effect calculation
- f_arr=Array.new
- f[2].each do |id|
- id=id.keys[0] # extract id from hit count hash
- f_arr.push(@fminer.all_activities[id])
- end
- f_median=f_arr.to_scale.median
- if g_median >= f_median
- effect = 'activating'
- else
- effect = 'deactivating'
- end
+ (0 .. @bbrc.GetNoRootNodes()-1).each do |j|
+ results = @bbrc.MineRoot(j)
+ #task.progress 10+step_width*(j+1)
+ results.each do |result|
+ f = YAML.load(result)[0]
+ smarts = f[0]
+ p_value = f[1]
+
+ if (!@bbrc.GetRegression)
+ id_arrs = f[2..-1].flatten
+ max = OpenTox::Algorithm::Fminer.effect(f[2..-1].reverse, @fminer.db_class_sizes) # f needs reversal for bbrc
+ effect = max+1
+ else #regression part
+ id_arrs = f[2]
+ # DV: effect calculation
+ f_arr=Array.new
+ f[2].each do |id|
+ id=id.keys[0] # extract id from hit count hash
+ f_arr.push(@fminer.all_activities[id])
end
-
- ft = Time.now
- feature = OpenTox::Feature.find_or_create_by({
- "title" => smarts.dup,
- "numeric" => true,
- "substructure" => true,
- "smarts" => smarts.dup,
- "pValue" => p_value.to_f.abs.round(5),
- "effect" => effect,
- "parameters" => [
- { "title" => "dataset_id", "paramValue" => params[:dataset].id },
- { "title" => "prediction_feature_id", "paramValue" => params[:prediction_feature].id }
- ]
- })
- features << feature
- ftime += Time.now - ft
+ f_median=f_arr.to_scale.median
+ if g_median >= f_median
+ effect = 'activating'
+ else
+ effect = 'deactivating'
+ end
+ end
+
+ ft = Time.now
+ feature = OpenTox::Feature.find_or_create_by({
+ "title" => smarts.dup,
+ "numeric" => true,
+ "substructure" => true,
+ "smarts" => smarts.dup,
+ "pValue" => p_value.to_f.abs.round(5),
+ "effect" => effect,
+ "parameters" => [
+ { "title" => "dataset_id", "paramValue" => params[:dataset].id },
+ { "title" => "prediction_feature_id", "paramValue" => params[:prediction_feature].id }
+ ]
+ })
+ features << feature
+ ftime += Time.now - ft
- id_arrs.each { |id_count_hash|
- id=id_count_hash.keys[0].to_i
- count=id_count_hash.values[0].to_i
- fminer_results[@fminer.compounds[id]] || fminer_results[@fminer.compounds[id]] = {}
- compound_idx = params[:dataset].compounds.index @fminer.compounds[id]
- feature_idx = features.index feature
- data_entries[compound_idx] ||= []
- if params[:nr_hits] == "true"
- fminer_results[@fminer.compounds[id]][feature] = count
- data_entries[compound_idx][feature_idx] = count
- else
- fminer_results[@fminer.compounds[id]][feature] = 1
- data_entries[compound_idx][feature_idx] = 1
- end
- }
-
- end # end of
- end # feature parsing
+ id_arrs.each { |id_count_hash|
+ id=id_count_hash.keys[0].to_i
+ count=id_count_hash.values[0].to_i
+ fminer_results[@fminer.compounds[id]] || fminer_results[@fminer.compounds[id]] = {}
+ compound_idx = params[:dataset].compounds.index @fminer.compounds[id]
+ feature_idx = features.index feature
+ data_entries[compound_idx] ||= []
+ if params[:nr_hits] == "true"
+ fminer_results[@fminer.compounds[id]][feature] = count
+ data_entries[compound_idx][feature_idx] = count
+ else
+ fminer_results[@fminer.compounds[id]][feature] = 1
+ data_entries[compound_idx][feature_idx] = 1
+ end
+ }
+
+ end # end of
+ end # feature parsing
- puts "Fminer: #{Time.now-time} (find/create Features: #{ftime})"
- time = Time.now
+ $logger.debug "Fminer: #{Time.now-time} (find/create Features: #{ftime})"
+ time = Time.now
- # convert nil entries to 0
- data_entries.collect! do |r|
- if r.empty?
- Array.new(features.size,0)
- else
- r[features.size-1] = 0 if r.size < features.size # grow array to match feature size
- r.collect!{|c| c.nil? ? 0 : c} # remove nils
- end
+ # convert nil entries to 0
+ data_entries.collect! do |r|
+ if r.empty?
+ Array.new(features.size,0)
+ else
+ r[features.size-1] = 0 if r.size < features.size # grow array to match feature size
+ r.collect!{|c| c.nil? ? 0 : c} # remove nils
end
+ end
-=begin
- # This part increases runtime by a factor of ~65
- # TODO: check if any information is lost due to simplification
- fminer_compounds = @fminer.training_dataset.compounds
- prediction_feature_idx = @fminer.training_dataset.features.index @fminer.prediction_feature
- prediction_feature_all_acts = fminer_compounds.each_with_index.collect { |c,idx|
- @fminer.training_dataset.data_entries[idx][prediction_feature_idx]
- }
- fminer_noact_compounds = fminer_compounds - @fminer.compounds
-
- feature_dataset.features = features
- feature_dataset.features = [ @fminer.prediction_feature ] + feature_dataset.features if params[:get_target] == "true"
- feature_dataset.compounds = fminer_compounds
- fminer_compounds.each_with_index { |c,idx|
- # TODO: reenable option
- #if (params[:get_target] == "true")
- #row = row + [ prediction_feature_all_acts[idx] ]
- #end
- features.each { |f|
- v = fminer_results[c][f] if fminer_results[c]
- unless fminer_noact_compounds.include? c
- v = 0 if v.nil?
- end
- feature_dataset.add_data_entry c, f, v.to_i
- }
- }
-=end
- feature_dataset.compounds = params[:dataset].compounds
- feature_dataset.features = features
- feature_dataset.data_entries = data_entries
+ feature_dataset.compounds = params[:dataset].compounds
+ feature_dataset.features = features
+ feature_dataset.data_entries = data_entries
- puts "Prepare save: #{Time.now-time}"
- time = Time.now
- feature_dataset.save
+ $logger.debug "Prepare save: #{Time.now-time}"
+ time = Time.now
+ #File.open("kazius.json","w+"){|f| f.puts feature_dataset.inspect}
+ feature_dataset.save
- puts "Save: #{Time.now-time}"
- feature_dataset
-
- #end
+ $logger.debug "Save: #{Time.now-time}"
+ feature_dataset
+
end
end
end
diff --git a/lib/classification.rb b/lib/classification.rb
new file mode 100644
index 0000000..f6c9b11
--- /dev/null
+++ b/lib/classification.rb
@@ -0,0 +1,56 @@
+module OpenTox
+ module Algorithm
+
+ class Classification
+
+ # Classification with majority vote from neighbors weighted by similarity
+ # @param [Hash] params Keys `:activities, :sims, :value_map` are required
+ # @return [Numeric] A prediction value.
+ def self.weighted_majority_vote(neighbors)
+
+ return {:prediction => nil, :confidence => nil} if neighbors.empty?
+
+ neighbor_contribution = 0.0
+ confidence_sum = 0.0
+ confidence = 0.0
+ prediction = nil
+
+ $logger.debug "Weighted Majority Vote Classification."
+
+ values = neighbors.collect{|n| n[1]}.uniq
+ neighbors.each do |neighbor|
+ neighbor_weight = neighbor[2]
+ activity = values.index(neighbor[1]) + 1 # map values to integers > 1
+ neighbor_contribution += activity * neighbor_weight
+ if values.size == 2 # AM: provide compat to binary classification: 1=>false 2=>true
+ case activity
+ when 1
+ confidence_sum -= neighbor_weight
+ when 2
+ confidence_sum += neighbor_weight
+ end
+ else
+ confidence_sum += neighbor_weight
+ end
+ end
+ if values.size == 2
+ if confidence_sum >= 0.0
+ prediction = values[1]
+ elsif confidence_sum < 0.0
+ prediction = values[0]
+ end
+ else
+ prediction = (neighbor_contribution/confidence_sum).round # AM: new multinomial prediction
+ end
+
+ $logger.debug "Prediction: '" + prediction.to_s + "'." unless prediction.nil?
+ confidence = (confidence_sum/neighbors.size).abs
+ $logger.debug "Confidence: '" + confidence.to_s + "'." unless prediction.nil?
+ return {:prediction => prediction, :confidence => confidence.abs}
+ end
+
+ end
+
+ end
+end
+
diff --git a/lib/descriptor.rb b/lib/descriptor.rb
index d862a41..f556df7 100644
--- a/lib/descriptor.rb
+++ b/lib/descriptor.rb
@@ -63,7 +63,6 @@ module OpenTox
obmol = OpenBabel::OBMol.new
obconversion.set_in_format('inchi')
smarts_pattern = OpenBabel::OBSmartsPattern.new
- #fingerprint = {}
smarts = [smarts] unless smarts.is_a? Array
fingerprint = Array.new(compounds.size){Array.new(smarts.size,false)}
compounds.each_with_index do |compound,c|
@@ -87,6 +86,8 @@ module OpenTox
def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS
compounds = parse compounds
+ dataset = OpenTox::CalculatedDataset.new
+ dataset.compounds = compounds
des = {}
descriptors.each do |d|
lib, descriptor = d.split(".",2)
@@ -95,13 +96,27 @@ module OpenTox
des[lib] << descriptor
end
result = {}
- des.each do |lib,d|
- send(lib, compounds, d).each do |compound,values|
- result[compound] ||= {}
- result[compound].merge! values
- end
+ features = []
+ data_entries = Array.new(compounds.size){Array.new(des.size)}
+ n = 0
+ des.each do |lib,descriptors|
+ features += descriptors.collect do |d|
+ OpenTox::Feature.find_or_create_by(
+ :title => "#{lib}.#{d}",
+ :creator => __FILE__
+ )
+ end
+ r = send(lib, compounds, descriptors)
+ #p r
+ r.each_with_index do |values,i|
+ data_entries[i][n] = values
+ end
+ n += 1
end
- result
+ #dataset.features = features
+ #dataset.data_entries = data_entries
+ #dataset
+ data_entries
end
def self.openbabel compounds, descriptors
@@ -111,12 +126,11 @@ module OpenTox
obmol = OpenBabel::OBMol.new
obconversion = OpenBabel::OBConversion.new
obconversion.set_in_format 'inchi'
- fingerprint = {}
- compounds.each do |compound|
+ fingerprint = Array.new(compounds.size){Array.new(obdescriptors.size)}
+ compounds.each_with_index do |compound,c|
obconversion.read_string obmol, compound.inchi
- fingerprint[compound] = {}
- obdescriptors.each_with_index do |descriptor,i|
- fingerprint[compound]["Openbabel."+descriptors[i]] = fix_value(descriptor.predict(obmol))
+ obdescriptors.each_with_index do |descriptor,d|
+ fingerprint[c][d] = fix_value(descriptor.predict(obmol))
end
end
fingerprint
@@ -238,6 +252,7 @@ module OpenTox
end
def self.parse compounds
+ p compounds
case compounds.class.to_s
when "OpenTox::Compound"
compounds = [compounds]
diff --git a/lib/fminer.rb b/lib/fminer.rb
index 59ee224..c26fe2f 100644
--- a/lib/fminer.rb
+++ b/lib/fminer.rb
@@ -33,8 +33,12 @@ module OpenTox
resource_not_found_error "No feature '#{params[:prediction_feature]}' in dataset '#{params[:dataset]}'" unless
@training_dataset.features.include?( params[:prediction_feature] )
unless params[:min_frequency].nil?
+ # set minfreq directly
+ if params[:min_frequency].numeric?
+ @minfreq=params[:min_frequency].to_i
+ $logger.debug "min_frequency #{@minfreq}"
# check for percentage
- if params[:min_frequency].include? "pc"
+ elsif params[:min_frequency].include? "pc"
per_mil=params[:min_frequency].gsub(/pc/,"")
if per_mil.numeric?
per_mil = per_mil.to_i * 10
@@ -49,14 +53,8 @@ module OpenTox
else
bad_request=true
end
- # set minfreq directly
else
- if params[:min_frequency].numeric?
- @minfreq=params[:min_frequency].to_i
- $logger.debug "min_frequency #{@minfreq}"
- else
bad_request=true
- end
end
bad_request_error "Minimum frequency must be integer [n], or a percentage [n]pc, or a per-mil [n]pm , with n greater 0" if bad_request
end
diff --git a/lib/last.rb b/lib/last.rb
index 944d95e..3828c82 100644
--- a/lib/last.rb
+++ b/lib/last.rb
@@ -36,7 +36,7 @@ module OpenTox
@last.SetConsoleOut(false)
- feature_dataset = OpenTox::Dataset.new
+ feature_dataset = OpenTox::CalculatedDataset.new
feature_dataset["title"] = "LAST representatives for #{@fminer.training_dataset.title}",
feature_dataset.creator = __FILE__
feature_dataset.parameters = [
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 4a59c01..d6a6f47 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -46,13 +46,14 @@ module OpenTox
# Prepare lazar object (includes graph mining)
# @param[Array] lazar parameters as strings
# @param[Hash] REST parameters, as input by user
- def self.create feature_dataset, prediction_feature=nil, params={}
+ def self.create training_dataset, feature_dataset, prediction_feature=nil, params={}
lazar = OpenTox::Model::Lazar.new
bad_request_error "No features found in feature dataset #{feature_dataset.id}." if feature_dataset.features.empty?
lazar.feature_dataset_id = feature_dataset.id
- @training_dataset = OpenTox::Dataset.find(feature_dataset.parameters.select{|p| p["title"] == "dataset_id"}.first["paramValue"])
+ @training_dataset = training_dataset
+ #@training_dataset = OpenTox::Dataset.find(feature_dataset.parameters.select{|p| p["title"] == "dataset_id"}.first["paramValue"])
bad_request_error "Training dataset compounds do not match feature dataset compounds. Please ensure that they are in the same order." unless @training_dataset.compounds == feature_dataset.compounds
lazar.training_dataset_id = @training_dataset.id
@@ -141,19 +142,17 @@ module OpenTox
bad_request_error "Please provide one of the parameters: :compound, :compounds, :dataset"
end
- puts "Setup: #{Time.now-time}"
+ $logger.debug "Setup: #{Time.now-time}"
time = Time.now
@query_fingerprint = OpenTox::Algorithm::Descriptor.send( feature_calculation_algorithm, compounds, @feature_dataset.features.collect{|f| f["title"]} )
- puts "Fingerprint calculation: #{Time.now-time}"
+ $logger.debug "Fingerprint calculation: #{Time.now-time}"
time = Time.now
# AM: transform to cosine space
min_sim = (min_sim.to_f*2.0-1.0).to_s if similarity_algorithm =~ /cosine/
- p compounds.size
- i = 0
compounds.each_with_index do |compound,c|
$logger.debug "predict compound #{c+1}/#{compounds.size} #{compound.inchi}"
@@ -172,9 +171,6 @@ module OpenTox
#mtf.transform
#
- puts "Transform: #{Time.now-time}"
- time = Time.now
-
# find neighbors
neighbors = []
@feature_dataset.data_entries.each_with_index do |fingerprint, i|
@@ -186,7 +182,7 @@ module OpenTox
prediction = OpenTox::Algorithm::Classification.send(prediction_algorithm, neighbors)
- puts "Prediction: #{Time.now-time}"
+ $logger.debug "Prediction: #{Time.now-time}"
time = Time.now
# AM: transform to original space (TODO)