summaryrefslogtreecommitdiff
path: root/balancer.rb
diff options
context:
space:
mode:
Diffstat (limited to 'balancer.rb')
-rw-r--r--balancer.rb98
1 files changed, 0 insertions, 98 deletions
diff --git a/balancer.rb b/balancer.rb
deleted file mode 100644
index 4ed2fd7..0000000
--- a/balancer.rb
+++ /dev/null
@@ -1,98 +0,0 @@
-# cuts a classification dataset into balanced pieces
-# let inact_act_ratio := majority_class.size/minority_class.size
-# then: nr pieces = ceil(inact_act_ratio) if inact_act_ratio > 1.5
-# each piece contains the complete minority class and ceil(inact_act_ratio) majority class compounds.
-
-class Balancer
-
- attr_accessor :inact_act_ratio, :act_hash, :inact_hash, :majority_splits, :nr_majority_splits, :errors, :datasets
-
- # Supply a OpenTox::Dataset here
- # Calculates inact_act_ratio, iff inact_act_ratio != +/-Infinity and no regression dataset is given
- def initialize(dataset, feature_uri, creator_url)
- @act_arr = []
- @inact_arr = []
- @inact_act_ratio = 1.0/0 # trick to define +infinity
- @nr_majority_splits = 1 # +/-1 means: no split
- @split = [] # splitted arrays with ids
- @datasets = [] # result datasets
- @errors = []
-
- classification = true
- if dataset.features.include?(feature_uri)
- dataset.data.each do |i,a|
- inchi = i
- acts = a
- acts.each do |act|
- value = act[feature_uri]
- if OpenTox::Utils.is_true?(value)
- @act_arr << inchi
- elsif OpenTox::Utils.classification?(value)
- @inact_arr << inchi
- else
- classification = false
- break;
- end
- end
- end
- @inact_act_ratio = @inact_arr.size.to_f / @act_arr.size.to_f unless (@act_arr.size == 0 or !classification) # leave alone for regression
- set_nr_majority_splits
- # perform majority split
- @split = @nr_majority_splits > 0 ? shuffle_split(@inact_arr) : shuffle_split(@act_arr) unless @nr_majority_splits.abs == 1
- @split.each do |s|
- new_c = @nr_majority_splits > 0 ? s.concat(@act_arr) : s.concat(@inac_arr)
- @datasets << dataset.create_new_dataset(new_c, [feature_uri], dataset.title, creator_url)
- end
-
- else
- errors << "Feature not present in dataset."
- end
- errors << "Can not split regression dataset." unless classification
- end
-
-
-
- # sets nr of splits for majority class ('+', if inact_cnt > act_cnt, or '-' else), or leaves unchanged for illegal values.
- def set_nr_majority_splits
- @nr_majority_splits = @inact_act_ratio >= 1.5 ? @inact_act_ratio.ceil : ( @inact_act_ratio <= (2.0/3.0) ? -(1.0/@inact_act_ratio).ceil : ( @inact_act_ratio>1.0 ? 1 : -1) ) unless OpenTox::Utils.infinity?(@inact_act_ratio) # leave alone for regression
- end
-
- # does the actual shuffle and split
- def shuffle_split (arr)
- arr = arr.shuffle
- arr.chunk(@nr_majority_splits.abs)
- end
-
- # turns a hash into a 2 col csv
- def hsh2csv (hsh)
- res=""
- hsh.each do |k,v|
- arr = [v,(@nr_majority_splits > 0 ? 0 : 1)]
- res += arr.join(", ") + "\n"
- end
- res
- end
-
-end
-
-class Array
-
- # cuts an array into <num-pieces> chunks - returns a two-dimensional array
- def chunk(pieces)
- q, r = length.divmod(pieces)
- (0..pieces).map { |i| i * q + [r, i].min }.enum_cons(2) \
- .map { |a, b| slice(a...b) }
- end
-
- # shuffles the elements of an array
- def shuffle( seed=nil )
- srand seed.to_i if seed
- sort_by { Kernel.rand }
- end
-
- # shuffels self
- def shuffle!( seed=nil )
- self.replace shuffle( seed )
- end
-
-end