summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author mguetlein <martin.guetlein@gmail.com> 2014-10-21 11:22:05 +0200
committer mguetlein <martin.guetlein@gmail.com> 2014-10-21 11:22:05 +0200
commit be83f41db88057595680d9c02ae539a0e3e5599c (patch)
tree dc0b1bbaa25fdb3692e4a31cccba4da3532aded5
parent d40a0528e7550f13053698c3ad80f931816cafb2 (diff)
add loo 'uniq' version that puts duplicates into the same fold
-rwxr-xr-x validation/validation_application.rb 12
-rwxr-xr-x validation/validation_service.rb 34
2 files changed, 29 insertions, 17 deletions
diff --git a/validation/validation_application.rb b/validation/validation_application.rb
index 3deedc7..568e1f7 100755
--- a/validation/validation_application.rb
+++ b/validation/validation_application.rb
@@ -87,8 +87,8 @@ class Validation::Application < OpenTox::Application
$logger.info "crossvalidation cleanup, starting..."
content_type "text/uri-list"
deleted = []
- Validation::Crossvalidation.all.collect.delete_if{|cv| cv.finished}.each do |cv|
- if OpenTox::Authorization.authorized?(cv.crossvalidation_uri,"DELETE",OpenTox::RestClientWrapper.subjectid)
+ Validation::Crossvalidation.all.collect.select{|cv| !cv.finished}.each do |cv|
+ if OpenTox::Authorization.authorized?(cv.crossvalidation_uri,"DELETE")
$logger.debug "delete cv with id:"+cv.id.to_s+", finished is false"
deleted << cv.crossvalidation_uri
cv.delete_crossvalidation
@@ -111,7 +111,7 @@ class Validation::Application < OpenTox::Application
:algorithm_params => params[:algorithm_params],
:prediction_feature => params[:prediction_feature],
:algorithm_uri => params[:algorithm_uri],
- :loo => "true" }
+ :loo => (params[:loo]=="uniq" ? "uniq" : "true") }
cv = Validation::Crossvalidation.create cv_params
cv.perform_cv( OpenTox::SubTask.create(task,0,95))
# computation of stats is cheap as dataset are already loaded into the memory
@@ -520,8 +520,8 @@ class Validation::Application < OpenTox::Application
$logger.info "validation cleanup, starting..."
content_type "text/uri-list"
deleted = []
- Validation::Validation.all.collect.delete_if{|val| val.finished}.each do |val|
- if OpenTox::Authorization.authorized?(val.validation_uri,"DELETE",OpenTox::RestClientWrapper.subjectid)
+ Validation::Validation.all.collect.select{|val| !val.finished}.each do |val|
+ if OpenTox::Authorization.authorized?(val.validation_uri,"DELETE")
$logger.debug "delete val with id:"+val.id.to_s+", finished is false"
deleted << val.validation_uri
val.delete_validation
@@ -546,7 +546,7 @@ class Validation::Application < OpenTox::Application
end
deleted = []
OpenTox::Dataset.all.each do |d|
- if !used_datasets.include?(d.uri) and OpenTox::Authorization.authorized?(d.uri,"DELETE",OpenTox::RestClientWrapper.subjectid)
+ if !used_datasets.include?(d.uri) and OpenTox::Authorization.authorized?(d.uri,"DELETE")
deleted << d.uri
d.delete
sleep 1 if $aa[:uri]
diff --git a/validation/validation_service.rb b/validation/validation_service.rb
index 35c9ff1..4fc4018 100755
--- a/validation/validation_service.rb
+++ b/validation/validation_service.rb
@@ -363,9 +363,13 @@ module Validation
# creates the cv folds
def create_cv_datasets( task=nil )
- if self.loo=="true"
+ if self.loo=="true" or self.loo=="uniq"
orig_dataset = Lib::DatasetCache.find(self.dataset_uri)
- self.num_folds = orig_dataset.compounds.size
+ if self.loo=="uniq"
+ self.num_folds = orig_dataset.compounds.collect{|c| c.uri}.uniq.size
+ else
+ self.num_folds = orig_dataset.compounds.size
+ end
self.random_seed = 0
self.stratified = "false"
else
@@ -425,9 +429,6 @@ module Validation
v.prediction_feature == prediction_feature and
URI.accessible?(v.training_dataset_uri) and
URI.accessible?(v.test_dataset_uri)
- # CH: Dataset.exist? has been removed, URI.accessible? is cheaper because it needs only HEAD requests
- #OpenTox::Dataset.exist?(v.training_dataset_uri,self.subjectid) and
- #OpenTox::Dataset.exist?(v.test_dataset_uri,self.subjectid)
#make sure self.id is set
#self.save if self.new?
tmp_val << { :validation_type => "crossvalidation",
@@ -461,12 +462,23 @@ module Validation
meta = { RDF::DC.creator => self.crossvalidation_uri }
case stratified
when "false"
- if self.loo=="true"
- shuffled_compound_indices = (0..(orig_dataset.compounds.size-1)).to_a
+ unless self.loo=="uniq"
+ if self.loo=="true"
+ shuffled_compound_indices = (0..(orig_dataset.compounds.size-1)).to_a
+ else
+ shuffled_compound_indices = (0..(orig_dataset.compounds.size-1)).to_a.shuffle( self.random_seed )
+ end
+ split_compound_indices = shuffled_compound_indices.chunk( self.num_folds.to_i )
else
- shuffled_compound_indices = (0..(orig_dataset.compounds.size-1)).to_a.shuffle( self.random_seed )
- end
- split_compound_indices = shuffled_compound_indices.chunk( self.num_folds.to_i )
+ #loo uniq
+ shuffled_compound_indices = []
+ split_compound_indices = []
+ orig_dataset.compounds.collect{|c| c.uri}.uniq.each do |c|
+ idx = orig_dataset.compound_indices(c)
+ shuffled_compound_indices += idx
+ split_compound_indices << idx
+ end
+ end
$logger.debug "cv: num instances for each fold: "+split_compound_indices.collect{|c| c.size}.join(", ")
self.num_folds.to_i.times do |n|
@@ -482,7 +494,7 @@ module Validation
end
internal_server_error "internal error, num test compounds not correct,"+
" is '#{test_compound_indices.size}', should be '#{(shuffled_compound_indices.size/self.num_folds.to_i)}'" unless
- (shuffled_compound_indices.size/self.num_folds.to_i - test_compound_indices.size).abs <= 1
+ (shuffled_compound_indices.size/self.num_folds.to_i - test_compound_indices.size).abs <= 1 or self.loo=="uniq" # loo-uniq folds group all duplicates of one compound, so fold sizes may legitimately differ
internal_server_error "internal error, num train compounds not correct, should be '"+(shuffled_compound_indices.size-test_compound_indices.size).to_s+
"', is '"+train_compound_indices.size.to_s+"'" unless shuffled_compound_indices.size - test_compound_indices.size == train_compound_indices.size
datasetname = 'dataset fold '+(n+1).to_s+' of '+self.num_folds.to_s