diff options
Diffstat (limited to 'validation/validation_service.rb')
-rwxr-xr-x | validation/validation_service.rb | 217 |
1 files changed, 97 insertions, 120 deletions
diff --git a/validation/validation_service.rb b/validation/validation_service.rb index 686a287..c433161 100755 --- a/validation/validation_service.rb +++ b/validation/validation_service.rb @@ -111,7 +111,7 @@ module Validation end # validates an algorithm by building a model and validating this model - def validate_algorithm( algorithm_params=nil, task=nil ) + def validate_algorithm( task=nil ) raise "validation_type missing" unless self.validation_type raise OpenTox::BadRequestError.new "no algorithm uri: '"+self.algorithm_uri.to_s+"'" if self.algorithm_uri==nil or self.algorithm_uri.to_s.size<1 @@ -301,9 +301,9 @@ module Validation class Crossvalidation - def perform_cv ( prediction_feature, algorithm_params=nil, task=nil ) - create_cv_datasets( prediction_feature, OpenTox::SubTask.create(task, 0, 33) ) - perform_cv_validations( algorithm_params, OpenTox::SubTask.create(task, 33, 100) ) + def perform_cv ( task=nil ) + create_cv_datasets( OpenTox::SubTask.create(task, 0, 33) ) + perform_cv_validations( OpenTox::SubTask.create(task, 33, 100) ) end def clean_loo_files( delete_feature_datasets ) @@ -349,27 +349,27 @@ module Validation end # creates the cv folds - def create_cv_datasets( prediction_feature, task=nil ) + def create_cv_datasets( task=nil ) if self.loo=="true" orig_dataset = Lib::DatasetCache.find(self.dataset_uri,self.subjectid) self.num_folds = orig_dataset.compounds.size self.random_seed = 0 - self.stratified = false + self.stratified = "false" else self.random_seed = 1 unless self.random_seed self.num_folds = 10 unless self.num_folds - self.stratified = false unless self.stratified + self.stratified = "false" unless self.stratified end - if copy_cv_datasets( prediction_feature ) + if copy_cv_datasets() # dataset folds of a previous crossvalidaiton could be used task.progress(100) if task else - create_new_cv_datasets( prediction_feature, task ) + create_new_cv_datasets( task ) end end # executes the cross-validation (build models and validates them) - def perform_cv_validations( algorithm_params, task=nil ) + def perform_cv_validations( task=nil ) LOGGER.debug "perform cv validations "+algorithm_params.inspect i = 0 @@ -377,8 +377,7 @@ module Validation @tmp_validations.each do | val | validation = Validation.create val validation.subjectid = self.subjectid - validation.validate_algorithm( algorithm_params, - OpenTox::SubTask.create(task, i * task_step, ( i + 1 ) * task_step) ) + validation.validate_algorithm( OpenTox::SubTask.create(task, i * task_step, ( i + 1 ) * task_step) ) raise "validation '"+validation.validation_uri+"' for crossvaldation could not be finished" unless validation.finished i += 1 @@ -395,14 +394,17 @@ module Validation private # copies datasets from an older crossvalidation on the same dataset and the same folds # returns true if successfull, false otherwise - def copy_cv_datasets( prediction_feature ) + def copy_cv_datasets( ) + # for downwards compatibilty: search prediction_feature=nil is ok cvs = Crossvalidation.find( { :dataset_uri => self.dataset_uri, :num_folds => self.num_folds, :stratified => self.stratified, :random_seed => self.random_seed, :loo => self.loo, - :finished => true} ).reject{ |cv| cv.id == self.id } + :finished => true} ).reject{ |cv| (cv.id == self.id || + (cv.prediction_feature && + cv.prediction_feature != self.prediction_feature)) } cvs.each do |cv| next if AA_SERVER and !OpenTox::Authorization.authorized?(cv.crossvalidation_uri,"GET",self.subjectid) tmp_val = [] @@ -420,7 +422,8 @@ module Validation :crossvalidation_id => self.id, :crossvalidation_fold => v.crossvalidation_fold, :prediction_feature => prediction_feature, - :algorithm_uri => self.algorithm_uri } + :algorithm_uri => self.algorithm_uri, + :algorithm_params => self.algorithm_params } end if tmp_val.size == self.num_folds.to_i @tmp_validations = tmp_val @@ -433,111 +436,78 @@ module Validation # creates cv folds (training and testdatasets) # stores uris in validation objects - def create_new_cv_datasets( prediction_feature, task = nil ) + def create_new_cv_datasets( task = nil ) LOGGER.debug "creating datasets for crossvalidation" orig_dataset = Lib::DatasetCache.find(self.dataset_uri,self.subjectid) raise OpenTox::NotFoundError.new "Dataset not found: "+self.dataset_uri.to_s unless orig_dataset - if self.loo=="true" - shuffled_compounds = orig_dataset.compounds - else - shuffled_compounds = orig_dataset.compounds.shuffle( self.random_seed ) - end + train_dataset_uris = [] + test_dataset_uris = [] - unless self.stratified + meta = { DC.creator => self.crossvalidation_uri } + case stratified + when "false" + if self.loo=="true" + shuffled_compounds = orig_dataset.compounds + else + shuffled_compounds = orig_dataset.compounds.shuffle( self.random_seed ) + end split_compounds = shuffled_compounds.chunk( self.num_folds.to_i ) - else - class_compounds = {} # "inactive" => compounds[], "active" => compounds[] .. - accept_values = orig_dataset.accept_values(prediction_feature) - raise OpenTox::BadRequestError.new("cannot apply stratification (not implemented for regression), acceptValue missing for prediction-feature '"+ - prediction_feature.to_s+"' in dataset '"+dataset_uri.to_s+"'") unless accept_values and accept_values.size>0 - accept_values.each do |value| - class_compounds[value] = [] - shuffled_compounds.each do |c| - #PENDING accept values are type string, data_entries may be boolean - class_compounds[value] << c if orig_dataset.data_entries[c][prediction_feature].collect{|v| v.to_s}.include?(value) - end - end - LOGGER.debug "stratified cv: different class values: "+class_compounds.keys.join(", ") - LOGGER.debug "stratified cv: num instances for each class value: "+class_compounds.values.collect{|c| c.size}.join(", ") - - split_class_compounds = [] # inactive_compounds[fold_i][], active_compounds[fold_i][], .. - class_compounds.values.each do |compounds| - split_class_compounds << compounds.chunk( self.num_folds.to_i ) - end - LOGGER.debug "stratified cv: splits for class values: "+split_class_compounds.collect{ |c| c.collect{ |cc| cc.size }.join("/") }.join(", ") - - # we cannot just merge the splits of the different class_values of each fold - # this could lead to folds, which sizes differ for more than 1 compound - split_compounds = [] - split_class_compounds.each do |split_comp| - # step 1: sort current split in ascending order - split_comp.sort!{|x,y| x.size <=> y.size } - # step 2: add splits - (0..self.num_folds.to_i-1).each do |i| - unless split_compounds[i] - split_compounds[i] = split_comp[i] + LOGGER.debug "cv: num instances for each fold: "+split_compounds.collect{|c| c.size}.join(", ") + + self.num_folds.to_i.times do |n| + test_compounds = [] + train_compounds = [] + self.num_folds.to_i.times do |nn| + compounds = split_compounds[nn] + if n == nn + compounds.each{ |compound| test_compounds << compound} else - split_compounds[i] += split_comp[i] - end + compounds.each{ |compound| train_compounds << compound} + end end - # step 3: sort (total) split in descending order - split_compounds.sort!{|x,y| y.size <=> x.size } + raise "internal error, num test compounds not correct,"+ + " is '#{test_compounds.size}', should be '#{(shuffled_compounds.size/self.num_folds.to_i)}'" unless + (shuffled_compounds.size/self.num_folds.to_i - test_compounds.size).abs <= 1 + raise "internal error, num train compounds not correct, should be '"+(shuffled_compounds.size-test_compounds.size).to_s+ + "', is '"+train_compounds.size.to_s+"'" unless shuffled_compounds.size - test_compounds.size == train_compounds.size + datasetname = 'dataset fold '+(n+1).to_s+' of '+self.num_folds.to_s + meta[DC.title] = "training "+datasetname + LOGGER.debug "training set: "+datasetname+"_train, compounds: "+train_compounds.size.to_s + train_dataset_uri = orig_dataset.split( train_compounds, orig_dataset.features.keys, + meta, self.subjectid ).uri + train_dataset_uris << train_dataset_uri + meta[DC.title] = "test "+datasetname + LOGGER.debug "test set: "+datasetname+"_test, compounds: "+test_compounds.size.to_s + test_features = orig_dataset.features.keys.dclone - [self.prediction_feature] + test_dataset_uri = orig_dataset.split( test_compounds, test_features, + meta, self.subjectid ).uri + test_dataset_uris << test_dataset_uri + end + when /true|super/ + if stratified=="true" + features = [ self.prediction_feature ] + else + features = nil end + train_datasets, test_datasets = stratified_k_fold_split(orig_dataset,meta, + "NA",self.num_folds.to_i,@subjectid,self.random_seed, features) + train_dataset_uris = test_datasets.collect{|d| d.uri} + test_dataset_uris = test_datasets.collect{|d| d.uri} + else + raise OpenTox::BadRequestError.new end - LOGGER.debug "cv: num instances for each fold: "+split_compounds.collect{|c| c.size}.join(", ") - - test_features = orig_dataset.features.keys.dclone - [prediction_feature] @tmp_validations = [] - - (1..self.num_folds.to_i).each do |n| - - datasetname = 'cv'+self.id.to_s + - #'_d'+orig_dataset.name.to_s + - '_f'+n.to_s+'of'+self.num_folds.to_s+ - '_r'+self.random_seed.to_s+ - '_s'+self.stratified.to_s - source = self.crossvalidation_uri - - test_compounds = [] - train_compounds = [] - - (1..self.num_folds.to_i).each do |nn| - compounds = split_compounds.at(nn-1) - - if n == nn - compounds.each{ |compound| test_compounds.push(compound)} - else - compounds.each{ |compound| train_compounds.push(compound)} - end - end - - raise "internal error, num test compounds not correct" unless (shuffled_compounds.size/self.num_folds.to_i - test_compounds.size).abs <= 1 - raise "internal error, num train compounds not correct, should be '"+(shuffled_compounds.size-test_compounds.size).to_s+ - "', is '"+train_compounds.size.to_s+"'" unless shuffled_compounds.size - test_compounds.size == train_compounds.size - - LOGGER.debug "training set: "+datasetname+"_train, compounds: "+train_compounds.size.to_s - #train_dataset_uri = orig_dataset.create_new_dataset( train_compounds, orig_dataset.features, datasetname + '_train', source ) - train_dataset_uri = orig_dataset.split( train_compounds, orig_dataset.features.keys, - { DC.title => datasetname + '_train', DC.creator => source }, self.subjectid ).uri - - LOGGER.debug "test set: "+datasetname+"_test, compounds: "+test_compounds.size.to_s - #test_dataset_uri = orig_dataset.create_new_dataset( test_compounds, test_features, datasetname + '_test', source ) - test_dataset_uri = orig_dataset.split( test_compounds, test_features, - { DC.title => datasetname + '_test', DC.creator => source }, self.subjectid ).uri - - #make sure self.id is set - #self.save if self.new? + self.num_folds.to_i.times do |n| tmp_validation = { :validation_type => "crossvalidation", - :training_dataset_uri => train_dataset_uri, - :test_dataset_uri => test_dataset_uri, + :training_dataset_uri => train_dataset_uris[n], + :test_dataset_uri => test_dataset_uris[n], :test_target_dataset_uri => self.dataset_uri, - :crossvalidation_id => self.id, :crossvalidation_fold => n, - :prediction_feature => prediction_feature, + :crossvalidation_id => self.id, :crossvalidation_fold => (n+1), + :prediction_feature => self.prediction_feature, :algorithm_uri => self.algorithm_uri } @tmp_validations << tmp_validation - task.progress( n / self.num_folds.to_f * 100 ) if task end end @@ -636,7 +606,7 @@ module Validation # splits a dataset into test and training dataset # returns map with training_dataset_uri and test_dataset_uri - def self.train_test_dataset_split( orig_dataset_uri, prediction_feature, subjectid, stratified=false, split_ratio=nil, random_seed=nil, task=nil ) + def self.train_test_dataset_split( orig_dataset_uri, prediction_feature, subjectid, stratified="false", split_ratio=nil, random_seed=nil, task=nil ) split_ratio=0.67 unless split_ratio split_ratio = split_ratio.to_f random_seed=1 unless random_seed @@ -652,15 +622,25 @@ module Validation "' not found in dataset, features are: \n"+ orig_dataset.features.keys.inspect unless orig_dataset.features.include?(prediction_feature) else - LOGGER.warn "no prediciton feature given, all features included in test dataset" + LOGGER.warn "no prediciton feature given, all features will be included in test dataset" end - if stratified + meta = { DC.creator => $url_provider.url_for('/training_test_split',:full) } + + case stratified + when /true|super/ + if stratified=="true" + raise OpenTox::BadRequestError.new "prediction feature required for stratified splits" unless prediction_feature + features = [prediction_feature] + else + LOGGER.warn "prediction feature is ignored for super-stratified splits" if prediction_feature + features = nil + end r_util = OpenTox::RUtil.new - split_sets = r_util.stratified_split( orig_dataset, "NA", df, split_ratio, random_seed ) + train, test = r_util.stratified_split( orig_dataset, meta, "NA", split_ratio, @subjectid, random_seed, features ) r_util.quit_r - result = {:training_dataset_uri => split_sets[0], :test_dataset_uri => split_sets[1]} - else + result = {:training_dataset_uri => train.uri, :test_dataset_uri => test.uri} + when "false" compounds = orig_dataset.compounds raise OpenTox::BadRequestError.new "Cannot split datset, num compounds in dataset < 2 ("+compounds.size.to_s+")" if compounds.size<2 split = (compounds.size*split_ratio).to_i @@ -674,22 +654,18 @@ module Validation test_compounds = compounds[(split+1)..-1] task.progress(33) if task + meta[DC.title] = "Training dataset split of "+orig_dataset.uri result = {} result[:training_dataset_uri] = orig_dataset.split( training_compounds, - orig_dataset.features.keys, - { DC.title => "Training dataset split of "+orig_dataset.title.to_s, - DC.creator => $url_provider.url_for('/training_test_split',:full) }, - subjectid ).uri + orig_dataset.features.keys, meta, subjectid ).uri task.progress(66) if task + meta[DC.title] = "Test dataset split of "+orig_dataset.uri result[:test_dataset_uri] = orig_dataset.split( test_compounds, - orig_dataset.features.keys.dclone - [prediction_feature], - { DC.title => "Test dataset split of "+orig_dataset.title.to_s, - DC.creator => $url_provider.url_for('/training_test_split',:full) }, - subjectid ).uri + orig_dataset.features.keys.dclone - [prediction_feature], meta, subjectid ).uri task.progress(100) if task - if !stratified and ENV['RACK_ENV'] =~ /test|debug/ + if ENV['RACK_ENV'] =~ /test|debug/ raise OpenTox::NotFoundError.new "Training dataset not found: '"+result[:training_dataset_uri].to_s+"'" unless Lib::DatasetCache.find(result[:training_dataset_uri],subjectid) test_data = Lib::DatasetCache.find result[:test_dataset_uri],subjectid @@ -698,8 +674,9 @@ module Validation raise "Test dataset num coumpounds != "+(compounds.size-split-1).to_s+", instead: "+ test_data.compounds.size.to_s+"\n"+test_data.to_yaml unless test_data.compounds.size==(compounds.size-1-split) end - LOGGER.debug "split done, training dataset: '"+result[:training_dataset_uri].to_s+"', test dataset: '"+result[:test_dataset_uri].to_s+"'" + else + raise OpenTox::BadRequestError.new "stratified != false|true|super, is #{stratified}" end result end |