diff options
-rw-r--r-- | balancer.rb | 98 | ||||
-rw-r--r-- | lazar.rb | 53 |
2 files changed, 151 insertions, 0 deletions
# ---- balancer.rb (new file in this patch) ----------------------------------

# Cuts a classification dataset into balanced pieces.
#
# Let inact_act_ratio := inactives.size / actives.size. Then:
# * ratio >= 1.5 : inactives are the majority class and are cut into
#                  ceil(ratio) chunks            (nr_majority_splits > 0)
# * ratio <= 2/3 : actives are the majority class and are cut into
#                  ceil(1/ratio) chunks          (nr_majority_splits < 0)
# * otherwise    : the dataset is treated as balanced; no split
#                  (nr_majority_splits is +1 or -1)
#
# Every resulting piece consists of one chunk of the shuffled majority class
# plus the complete minority class, i.e. roughly
# majority.size / nr_pieces + minority.size compounds.
class Balancer

  # NOTE(review): :act_hash, :inact_hash and :majority_splits are never
  # assigned in this class; the accessors are kept only so that existing
  # callers do not break.
  attr_accessor :inact_act_ratio, :act_hash, :inact_hash, :majority_splits, :nr_majority_splits, :errors, :datasets

  # Builds the balanced pieces for +dataset+ and stores them in #datasets.
  # Problems are collected as strings in #errors instead of being raised.
  #
  # dataset     - object responding to #features, #data, #title and
  #               #create_new_dataset (presumably an OpenTox::Dataset --
  #               confirm against caller).
  # feature_uri - the feature whose values decide active/inactive.
  # creator_url - passed through unchanged to dataset.create_new_dataset.
  def initialize(dataset, feature_uri, creator_url)
    @act_arr = []
    @inact_arr = []
    @inact_act_ratio = 1.0 / 0 # +Infinity marks "ratio unknown"
    @nr_majority_splits = 1    # +/-1 means: no split
    @split = []                # majority class cut into chunks of ids
    @datasets = []             # result datasets
    @errors = []

    unless dataset.features.include?(feature_uri)
      @errors << "Feature not present in dataset."
      return
    end

    unless collect_classes(dataset, feature_uri)
      @errors << "Can not split regression dataset."
      return
    end

    # Without any actives the ratio stays at +Infinity, which disables splitting.
    @inact_act_ratio = @inact_arr.size.to_f / @act_arr.size unless @act_arr.empty?
    set_nr_majority_splits
    return if @nr_majority_splits.abs == 1

    # A positive split count means the inactives are the majority class.
    majority, minority = @nr_majority_splits > 0 ? [@inact_arr, @act_arr] : [@act_arr, @inact_arr]
    @split = shuffle_split(majority)
    @split.each do |piece|
      @datasets << dataset.create_new_dataset(piece + minority, [feature_uri], dataset.title, creator_url)
    end
  end

  # Sets the number of splits for the majority class: positive if there are
  # more inactives than actives, negative otherwise. Leaves the current value
  # untouched when the ratio is unusable (infinite, NaN, zero or negative),
  # e.g. for regression data or when one of the classes is empty.
  def set_nr_majority_splits
    ratio = @inact_act_ratio
    return if OpenTox::Utils.infinity?(ratio)
    # Guards against 1.0/0.0 below (FloatDomainError on ceil); also false for NaN.
    return unless ratio > 0

    @nr_majority_splits =
      if ratio >= 1.5
        ratio.ceil
      elsif ratio <= 2.0 / 3.0
        # The reciprocal of a float ratio can overshoot an integer by one ulp
        # (1.0 / (1.0 / 49) == 49.00000000000001); round before taking ceil.
        -(1.0 / ratio).round(9).ceil
      elsif ratio > 1.0
        1
      else
        -1
      end
  end

  # Shuffles +arr+ and cuts it into @nr_majority_splits.abs chunks.
  # Relies on the Array#chunk(pieces) extension defined below.
  def shuffle_split(arr)
    arr.shuffle.chunk(@nr_majority_splits.abs)
  end

  # Turns a hash into two-column CSV text: one "<value>, <label>" line per
  # entry, where label is 0 if the inactives are the majority and 1 otherwise.
  # NOTE(review): the hash key is ignored -- confirm that this is intended.
  def hsh2csv(hsh)
    label = @nr_majority_splits > 0 ? 0 : 1
    hsh.map { |_key, value| [value, label].join(", ") + "\n" }.join
  end

  private

  # Sorts every compound of +dataset+ into @act_arr or @inact_arr according to
  # its value(s) for +feature_uri+. Returns false as soon as a value is not a
  # classification value (the dataset is then treated as regression data),
  # true otherwise.
  # NOTE(review): a compound with several activity values is appended once per
  # value, exactly as before -- verify that duplicates are wanted.
  def collect_classes(dataset, feature_uri)
    dataset.data.each do |inchi, acts|
      acts.each do |act|
        value = act[feature_uri]
        if OpenTox::Utils.is_true?(value)
          @act_arr << inchi
        elsif OpenTox::Utils.classification?(value)
          @inact_arr << inchi
        else
          return false
        end
      end
    end
    true
  end

end

class Array

  # Cuts the array into +pieces+ chunks whose sizes differ by at most one and
  # returns them as a two-dimensional array. Raises ZeroDivisionError for
  # pieces == 0.
  #
  # Called without an argument it defers to the built-in block form
  # (Enumerable#chunk), which this extension would otherwise shadow.
  def chunk(pieces = nil, &block)
    return super(&block) if pieces.nil?

    q, r = length.divmod(pieces)
    # The first r chunks receive one extra element each.
    bounds = (0..pieces).map { |i| i * q + [r, i].min }
    bounds.each_cons(2).map { |a, b| slice(a...b) }
  end

  # Returns a shuffled copy. If +seed+ is given, the global random number
  # generator is re-seeded with it first (a process-wide side effect).
  # Keyword options such as `random:` are handed to the built-in Array#shuffle.
  def shuffle(seed = nil, **opts)
    return super(**opts) unless opts.empty?

    srand seed.to_i if seed
    sort_by { Kernel.rand }
  end

  # Shuffles self in place; takes the same arguments as #shuffle.
  def shuffle!(seed = nil, **opts)
    replace shuffle(seed, **opts)
  end

end

# ---- lazar.rb (appended by this patch after the existing `post '/lazar/?'`
#      route, whose body is outside this excerpt) ----------------------------

# AM: Balancer wraps around /lazar
#
# Creates a balanced model: the training dataset is cut into balanced pieces,
# one lazar model is built per piece, and the resulting model URIs are stored
# in a single OpenTox::Model::Lazar. If the dataset cannot or need not be
# split, one model is built on the complete dataset instead.
# Responds with 202 and the URI of the task that builds the model.
post '/lazar-balanced/?' do # create a balanced model
  LOGGER.debug "Dataset: '#{params[:dataset_uri]}'"
  LOGGER.debug "Endpoint: '#{params[:prediction_feature]}'"
  LOGGER.debug "Feature generation: '#{params[:feature_generation_uri]}'"
  dataset_uri = params[:dataset_uri].to_s

  training_activities =
    begin
      OpenTox::Dataset.find(dataset_uri)
    rescue
      nil
    end
  # Covers both a failed lookup and a lookup that returns nothing.
  halt 404, "Dataset #{dataset_uri} not found" unless training_activities

  halt 404, "No prediction_feature parameter." unless params[:prediction_feature]
  # NOTE(review): feature_generation_uri is required here but is not passed on
  # to create_model below -- confirm whether it should be forwarded.
  halt 404, "No feature_generation_uri parameter." unless params[:feature_generation_uri]
  unless training_activities.features and training_activities.features.include?(params[:prediction_feature])
    halt 404, "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}. (features: #{training_activities.features.inspect})"
  end

  response['Content-Type'] = 'text/uri-list'
  task_uri = OpenTox::Task.as_task do |_task|

    # Split the dataset
    bal = Balancer.new(training_activities, params[:prediction_feature], training_activities.creator)
    bal.errors.each { |msg| LOGGER.debug "Balancer: #{msg}" }

    # Fall back to the complete dataset when no balanced pieces were produced.
    training_sets = bal.datasets.empty? ? [params[:dataset_uri]] : bal.datasets

    model_uris = training_sets.map do |training_set|
      model_task_uri = OpenTox::Algorithm::Lazar.create_model(:dataset_uri => training_set, :prediction_feature => params[:prediction_feature])
      model_task = OpenTox::Task.find(model_task_uri)
      model_task.wait_for_completion
      model_task.resultURI
    end

    lazar = OpenTox::Model::Lazar.new
    lazar.models = model_uris

    model_uri = lazar.save
    LOGGER.info "#{model_uri} created #{Time.now}"
    model_uri
  end
  halt 202, task_uri
end