From 1e2d786a13a16abb14b2200f49cbdc7b0063c600 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Thu, 30 Oct 2014 16:05:55 +0100 Subject: add skip-fold for faster partial loo-cv and filtering of predictions --- report/report_factory.rb | 4 ++-- validation/validation_application.rb | 24 ++++++++++++++++++++---- validation/validation_service.rb | 30 ++++++++++++++++-------------- 3 files changed, 38 insertions(+), 20 deletions(-) diff --git a/report/report_factory.rb b/report/report_factory.rb index ac275a3..c830a97 100755 --- a/report/report_factory.rb +++ b/report/report_factory.rb @@ -136,8 +136,8 @@ module Reports::ReportFactory bad_request_error("crossvalidation-id not unique and != nil: "+ validation_set.get_values(:crossvalidation_id,false).inspect) if validation_set.unique_value(:crossvalidation_id)==nil validation_set.load_cv_attributes - bad_request_error("num validations ("+validation_set.size.to_s+") is not equal to num folds ("+ - validation_set.unique_value(:num_folds).to_s+")") unless validation_set.unique_value(:num_folds).to_i==validation_set.size + #bad_request_error("num validations ("+validation_set.size.to_s+") is not equal to num folds ("+ + # validation_set.unique_value(:num_folds).to_s+")") unless validation_set.unique_value(:num_folds).to_i==validation_set.size bad_request_error("num different folds is not equal to num validations") unless validation_set.num_different_values(:crossvalidation_fold)==validation_set.size bad_request_error("validations must have unique feature type, i.e. must be either all regression, "+ "or all classification validations") unless validation_set.unique_feature_type diff --git a/validation/validation_application.rb b/validation/validation_application.rb index 568e1f7..ed8083c 100755 --- a/validation/validation_application.rb +++ b/validation/validation_application.rb @@ -13,6 +13,15 @@ class Validation::Application < OpenTox::Application bad_request_error "stratified != true|false|super, is #{params[:stratified]}" unless params[:stratified]=~/true|false|super/ end + + def filter_validation(validation, params) + if (params[:min_confidence] or params[:min_num_predictions] or params[:max_num_predictions]) + min_confidence = params[:min_confidence] ? params[:min_confidence].to_f : nil + min_num_predictions = params[:min_num_predictions] ? params[:min_num_predictions].to_i : nil + max_num_predictions = params[:max_num_predictions] ? params[:max_num_predictions].to_i : nil + validation.filter_predictions(min_confidence,min_num_predictions,max_num_predictions) + end + end end before do @@ -105,18 +114,19 @@ class Validation::Application < OpenTox::Application bad_request_error "algorithm_uri missing" unless params[:algorithm_uri].to_s.size>0 bad_request_error "prediction_feature missing" unless params[:prediction_feature].to_s.size>0 bad_request_error "illegal param: num_folds, stratified, random_seed not allowed for loo-crossvalidation" if params[:num_folds] or - params[:stratified] or params[:random_seed] + params[:stratified] or (params[:random_seed] and !params[:skip_ratio]) task = OpenTox::Task.run( "Perform loo-crossvalidation", to("/validation/crossvalidation/loo", :full) ) do |task| #, params cv_params = { :dataset_uri => params[:dataset_uri], :algorithm_params => params[:algorithm_params], :prediction_feature => params[:prediction_feature], :algorithm_uri => params[:algorithm_uri], - :loo => (params[:loo]=="uniq" ? "uniq" : "true") } + :loo => (params[:loo]=="uniq" ? "uniq" : "true"), + :random_seed => params[:random_seed]} cv = Validation::Crossvalidation.create cv_params - cv.perform_cv( OpenTox::SubTask.create(task,0,95)) + cv.perform_cv( OpenTox::SubTask.create(task,0,95), (params[:skip_ratio] ? params[:skip_ratio].to_f : nil)) # computation of stats is cheap as dataset are already loaded into the memory Validation::Validation.from_cv_statistics( cv.id, OpenTox::SubTask.create(task,95,100) ) - cv.clean_loo_files( !(params[:algorithm_params] && params[:algorithm_params] =~ /feature_dataset_uri/) ) + #cv.clean_loo_files( !(params[:algorithm_params] && params[:algorithm_params] =~ /feature_dataset_uri/) ) cv.crossvalidation_uri end return_task(task) @@ -189,6 +199,7 @@ class Validation::Application < OpenTox::Application $logger.info "get crossvalidation statistics for crossvalidation with id "+params[:id].to_s v = Validation::Validation.from_cv_statistics( params[:id] ) + filter_validation(v,params) case request.env['HTTP_ACCEPT'].to_s when /text\/html/ related_links = @@ -219,6 +230,10 @@ class Validation::Application < OpenTox::Application content_type "text/x-yaml" props.to_yaml end + + get '/validation/crossvalidation/:id/prediction_data' do + Validation::Validation.from_cv_statistics( params[:id] ).prediction_data.to_yaml + end delete '/validation/crossvalidation/:id/?' do $logger.info "delete crossvalidation with id "+params[:id].to_s @@ -664,6 +679,7 @@ class Validation::Application < OpenTox::Application # end validation = Validation::Validation[params[:id]] resource_not_found_error "Validation '#{params[:id]}' not found." unless validation + filter_validation(validation,params) case request.env['HTTP_ACCEPT'].to_s when "application/rdf+xml" diff --git a/validation/validation_service.rb b/validation/validation_service.rb index 4fc4018..8b22713 100755 --- a/validation/validation_service.rb +++ b/validation/validation_service.rb @@ -269,8 +269,8 @@ module Validation end def filter_predictions( min_confidence, min_num_predictions, max_num_predictions, prediction=nil ) - self.prediction_data = nil - self.save + #self.prediction_data = nil + #self.save bad_request_error "only supported for classification" if prediction!=nil and classification_statistics==nil bad_request_error "illegal confidence value #{min_confidence}" unless @@ -315,9 +315,9 @@ module Validation class Crossvalidation - def perform_cv ( task=nil ) + def perform_cv ( task=nil, skip_ratio=nil ) create_cv_datasets( OpenTox::SubTask.create(task, 0, 33) ) - perform_cv_validations( OpenTox::SubTask.create(task, 33, 100) ) + perform_cv_validations( OpenTox::SubTask.create(task, 33, 100), skip_ratio ) end def clean_loo_files( delete_feature_datasets ) @@ -370,13 +370,12 @@ module Validation else self.num_folds = orig_dataset.compounds.size end - self.random_seed = 0 self.stratified = "false" else - self.random_seed = 1 unless self.random_seed self.num_folds = 10 unless self.num_folds self.stratified = "false" unless self.stratified end + self.random_seed = 1 unless self.random_seed if copy_cv_datasets() # dataset folds of a previous crossvalidaiton could be used task.progress(100) if task @@ -386,18 +385,20 @@ module Validation end # executes the cross-validation (build models and validates them) - def perform_cv_validations( task=nil ) + def perform_cv_validations( task=nil, skip_ratio=nil ) $logger.debug "perform cv validations" - i = 0 + i = -1 task_step = 100 / self.num_folds.to_f; + skip = @tmp_validations.size.times.collect{|i| i < (skip_ratio*@tmp_validations.size) }.shuffle(self.random_seed) if skip_ratio @tmp_validations.each do | val | + i += 1 + next if skip and skip[i] validation = Validation.create val validation.validate_algorithm( OpenTox::SubTask.create(task, i * task_step, ( i + 1 ) * task_step) ) internal_server_error "validation '"+validation.validation_uri+"' for crossvaldation could not be finished" unless - validation.finished - i += 1 - $logger.debug "fold "+i.to_s+" done: "+validation.validation_uri.to_s + validation.finished + $logger.debug "fold "+(i+1).to_s+" done: "+validation.validation_uri.to_s end # self.attributes = { :finished => true } @@ -412,13 +413,14 @@ module Validation # returns true if successfull, false otherwise def copy_cv_datasets( ) # for downwards compatibilty: search prediction_feature=nil is ok - cvs = Crossvalidation.find( { + p = { :dataset_uri => self.dataset_uri, :num_folds => self.num_folds, :stratified => self.stratified, - :random_seed => self.random_seed, :loo => self.loo, - :finished => true} ).reject{ |cv| (cv.id == self.id || + :finished => true} + p[:random_seed] = self.random_seed unless self.loo=="uniq" + cvs = Crossvalidation.find( p ).reject{ |cv| (cv.id == self.id || (cv.prediction_feature && cv.prediction_feature != self.prediction_feature)) } cvs.each do |cv| -- cgit v1.2.3