diff options
author | mguetlein <martin.guetlein@gmail.com> | 2014-10-21 11:22:05 +0200 |
---|---|---|
committer | mguetlein <martin.guetlein@gmail.com> | 2014-10-21 11:22:05 +0200 |
commit | be83f41db88057595680d9c02ae539a0e3e5599c (patch) | |
tree | dc0b1bbaa25fdb3692e4a31cccba4da3532aded5 | |
parent | d40a0528e7550f13053698c3ad80f931816cafb2 (diff) |
add loo 'uniq' version that puts duplicates into the same fold
-rwxr-xr-x | validation/validation_application.rb | 12 | ||||
-rwxr-xr-x | validation/validation_service.rb | 34 |
2 files changed, 29 insertions, 17 deletions
diff --git a/validation/validation_application.rb b/validation/validation_application.rb index 3deedc7..568e1f7 100755 --- a/validation/validation_application.rb +++ b/validation/validation_application.rb @@ -87,8 +87,8 @@ class Validation::Application < OpenTox::Application $logger.info "crossvalidation cleanup, starting..." content_type "text/uri-list" deleted = [] - Validation::Crossvalidation.all.collect.delete_if{|cv| cv.finished}.each do |cv| - if OpenTox::Authorization.authorized?(cv.crossvalidation_uri,"DELETE",OpenTox::RestClientWrapper.subjectid) + Validation::Crossvalidation.all.collect.select{|cv| !cv.finished}.each do |cv| + if OpenTox::Authorization.authorized?(cv.crossvalidation_uri,"DELETE") $logger.debug "delete cv with id:"+cv.id.to_s+", finished is false" deleted << cv.crossvalidation_uri cv.delete_crossvalidation @@ -111,7 +111,7 @@ class Validation::Application < OpenTox::Application :algorithm_params => params[:algorithm_params], :prediction_feature => params[:prediction_feature], :algorithm_uri => params[:algorithm_uri], - :loo => "true" } + :loo => (params[:loo]=="uniq" ? "uniq" : "true") } cv = Validation::Crossvalidation.create cv_params cv.perform_cv( OpenTox::SubTask.create(task,0,95)) # computation of stats is cheap as dataset are already loaded into the memory @@ -520,8 +520,8 @@ class Validation::Application < OpenTox::Application $logger.info "validation cleanup, starting..." content_type "text/uri-list" deleted = [] - Validation::Validation.all.collect.delete_if{|val| val.finished}.each do |val| - if OpenTox::Authorization.authorized?(val.validation_uri,"DELETE",OpenTox::RestClientWrapper.subjectid) + Validation::Validation.all.collect.select{|val| !val.finished}.each do |val| + if OpenTox::Authorization.authorized?(val.validation_uri,"DELETE") $logger.debug "delete val with id:"+val.id.to_s+", finished is false" deleted << val.validation_uri val.delete_validation @@ -546,7 +546,7 @@ class Validation::Application < OpenTox::Application end deleted = [] OpenTox::Dataset.all.each do |d| - if !used_datasets.include?(d.uri) and OpenTox::Authorization.authorized?(d.uri,"DELETE",OpenTox::RestClientWrapper.subjectid) + if !used_datasets.include?(d.uri) and OpenTox::Authorization.authorized?(d.uri,"DELETE") deleted << d.uri d.delete sleep 1 if $aa[:uri] diff --git a/validation/validation_service.rb b/validation/validation_service.rb index 35c9ff1..4fc4018 100755 --- a/validation/validation_service.rb +++ b/validation/validation_service.rb @@ -363,9 +363,13 @@ module Validation # creates the cv folds def create_cv_datasets( task=nil ) - if self.loo=="true" + if self.loo=="true" or self.loo=="uniq" orig_dataset = Lib::DatasetCache.find(self.dataset_uri) - self.num_folds = orig_dataset.compounds.size + if self.loo=="uniq" + self.num_folds = orig_dataset.compounds.collect{|c| c.uri}.uniq.size + else + self.num_folds = orig_dataset.compounds.size + end self.random_seed = 0 self.stratified = "false" else @@ -425,9 +429,6 @@ module Validation v.prediction_feature == prediction_feature and URI.accessible?(v.training_dataset_uri) and URI.accessible?(v.test_dataset_uri) - # CH: Dataset.exist? has been removed, URI.accessible? is cheaper because it needs only HEAD requests - #OpenTox::Dataset.exist?(v.training_dataset_uri,self.subjectid) and - #OpenTox::Dataset.exist?(v.test_dataset_uri,self.subjectid) #make sure self.id is set #self.save if self.new? tmp_val << { :validation_type => "crossvalidation", @@ -461,12 +462,23 @@ module Validation meta = { RDF::DC.creator => self.crossvalidation_uri } case stratified when "false" - if self.loo=="true" - shuffled_compound_indices = (0..(orig_dataset.compounds.size-1)).to_a + unless self.loo=="uniq" + if self.loo=="true" + shuffled_compound_indices = (0..(orig_dataset.compounds.size-1)).to_a + else + shuffled_compound_indices = (0..(orig_dataset.compounds.size-1)).to_a.shuffle( self.random_seed ) + end + split_compound_indices = shuffled_compound_indices.chunk( self.num_folds.to_i ) else - shuffled_compound_indices = (0..(orig_dataset.compounds.size-1)).to_a.shuffle( self.random_seed ) - end - split_compound_indices = shuffled_compound_indices.chunk( self.num_folds.to_i ) + #loo uniq + shuffled_compound_indices = [] + split_compound_indices = [] + orig_dataset.compounds.collect{|c| c.uri}.uniq.each do |c| + idx = orig_dataset.compound_indices(c) + shuffled_compound_indices += idx + split_compound_indices << idx + end + end $logger.debug "cv: num instances for each fold: "+split_compound_indices.collect{|c| c.size}.join(", ") self.num_folds.to_i.times do |n| @@ -482,7 +494,7 @@ module Validation end internal_server_error "internal error, num test compounds not correct,"+ " is '#{test_compound_indices.size}', should be '#{(shuffled_compound_indices.size/self.num_folds.to_i)}'" unless - (shuffled_compound_indices.size/self.num_folds.to_i - test_compound_indices.size).abs <= 1 + (shuffled_compound_indices.size/self.num_folds.to_i - test_compound_indices.size).abs <= 1 or self.loo="uniq" internal_server_error "internal error, num train compounds not correct, should be '"+(shuffled_compound_indices.size-test_compound_indices.size).to_s+ "', is '"+train_compound_indices.size.to_s+"'" unless shuffled_compound_indices.size - test_compound_indices.size == train_compound_indices.size datasetname = 'dataset fold '+(n+1).to_s+' of '+self.num_folds.to_s |