summaryrefslogtreecommitdiff
path: root/validation/validation_service.rb
diff options
context:
space:
mode:
Diffstat (limited to 'validation/validation_service.rb')
-rwxr-xr-x[-rw-r--r--]validation/validation_service.rb549
1 files changed, 401 insertions, 148 deletions
diff --git a/validation/validation_service.rb b/validation/validation_service.rb
index cfbb681..a1efba5 100644..100755
--- a/validation/validation_service.rb
+++ b/validation/validation_service.rb
@@ -1,7 +1,5 @@
-require "rdf/redland"
-
require "lib/validation_db.rb"
require "lib/ot_predictions.rb"
@@ -31,171 +29,287 @@ class Array
end
module Validation
-
+
class Validation < Lib::Validation
# constructs a validation object, sets id and uri
- def initialize( params={} )
- $sinatra.halt 500,"do not set id manually" if params[:id]
- $sinatra.halt 500,"do not set uri manually" if params[:validation_uri]
- super params
- self.save!
- raise "internal error, validation-id not set "+to_yaml if self.id==nil
- self.attributes = { :validation_uri => $sinatra.url_for("/"+self.id.to_s, :full).to_s }
- self.save!
- end
+ #def initialize( params={} )
+ #raise "do not set id manually" if params[:id]
+ #params[:finished] = false
+ #super params
+ #self.save!
+ #raise "internal error, validation-id not set "+to_yaml if self.id==nil
+ #end
# deletes a validation
# PENDING: model and referenced datasets are deleted as well, keep it that way?
- def delete
-
- model = OpenTox::Model::PredictionModel.find(self.model_uri) if self.model_uri
- model.destroy if model
-
- #[@test_dataset_uri, @training_dataset_uri, @prediction_dataset_uri].each do |d|
- #dataset = OpenTox::Dataset.find(d) if d
- #dataset.delete if dataset
- #end
- destroy
+ # Deletes this validation and, optionally, resources referenced by it.
+ #
+ # delete_all - when true (default), the model/dataset uris selected by
+ # validation_type are deleted via OpenTox::RestClientWrapper; a failing
+ # delete is only logged and does not abort the deletion.
+ # Raises a RuntimeError if delete_all is true and validation_type matches none
+ # of the known types.
+ # If subjectid is set, the policies of validation_uri are deleted as well (errors are logged only).
+ # Returns a confirmation message.
+ def delete( delete_all=true )
+ if (delete_all)
+ to_delete = [:model_uri, :training_dataset_uri, :test_dataset_uri, :test_target_dataset_uri, :prediction_dataset_uri ]
+ # remove the attributes that must be kept for the respective validation type
+ case self.validation_type
+ when /test_set_validation/
+ to_delete -= [ :model_uri, :training_dataset_uri, :test_dataset_uri, :test_target_dataset_uri ]
+ when /bootstrapping/
+ to_delete -= [ :test_target_dataset_uri ]
+ when /training_test_validation/
+ to_delete -= [ :training_dataset_uri, :test_dataset_uri, :test_target_dataset_uri ]
+ when /training_test_split/
+ to_delete -= [ :test_target_dataset_uri ]
+ when /validate_dataset/
+ to_delete = []
+ when /crossvalidation/
+ to_delete -= [ :test_target_dataset_uri ]
+ else
+ # the switch is on validation_type, so report that (and the offending value)
+ raise "unknown validation type '"+self.validation_type.to_s+"'"
+ end
+ to_delete.each do |attr|
+ uri = self.send(attr)
+ LOGGER.debug "also deleting "+attr.to_s+" : "+uri.to_s if uri
+ begin
+ OpenTox::RestClientWrapper.delete(uri, :subjectid => subjectid) if uri
+ rescue => ex
+ LOGGER.warn "could not delete "+uri.to_s+" : "+ex.message.to_s
+ end
+ end
+ end
+ self.destroy
+ if (subjectid)
+ begin
+ res = OpenTox::Authorization.delete_policies_from_uri(validation_uri, subjectid)
+ LOGGER.debug "Deleted validation policy: #{res}"
+ rescue
+ LOGGER.warn "Policy delete error for validation: #{validation_uri}"
+ end
+ end
"Successfully deleted validation "+self.id.to_s+"."
end
# validates an algorithm by building a model and validating this model
- def validate_algorithm( algorithm_params=nil )
-
- $sinatra.halt 404, "no algorithm uri: '"+self.algorithm_uri.to_s+"'" if self.algorithm_uri==nil or self.algorithm_uri.to_s.size<1
+ # Builds a model with the algorithm at algorithm_uri on the training dataset,
+ # stores the resulting uri in model_uri and then validates that model.
+ #
+ # algorithm_params - optional string of the form "key1=value1;key2=value2";
+ # each pair is passed on to the algorithm as an additional parameter
+ # task - optional task; model building is reported as 0-33%, validation as 33-100%
+ # Raises OpenTox::BadRequestError if algorithm_uri is missing or a parameter is malformed,
+ # RuntimeError if validation_type is missing or no model uri is returned.
+ # Returns the result of validate_model.
+ def validate_algorithm( algorithm_params=nil, task=nil )
+ raise "validation_type missing" unless self.validation_type
+ raise OpenTox::BadRequestError.new "no algorithm uri: '"+self.algorithm_uri.to_s+"'" if self.algorithm_uri==nil or self.algorithm_uri.to_s.size<1
params = { :dataset_uri => self.training_dataset_uri, :prediction_feature => self.prediction_feature }
if (algorithm_params!=nil)
algorithm_params.split(";").each do |alg_params|
alg_param = alg_params.split("=")
- $sinatra.halt 404, "invalid algorithm param: '"+alg_params.to_s+"'" unless alg_param.size==2 or alg_param[0].to_s.size<1 or alg_param[1].to_s.size<1
+ # a parameter is valid only if it consists of exactly one non-empty key and one non-empty value
+ raise OpenTox::BadRequestError.new "invalid algorithm param: '"+alg_params.to_s+"'" unless alg_param.size==2 and alg_param[0].to_s.size>0 and alg_param[1].to_s.size>0
LOGGER.warn "algorihtm param contains empty space, encode? "+alg_param[1].to_s if alg_param[1] =~ /\s/
params[alg_param[0].to_sym] = alg_param[1]
end
end
LOGGER.debug "building model '"+algorithm_uri.to_s+"' "+params.inspect
- model = OpenTox::Model::PredictionModel.build(algorithm_uri, params)
- $sinatra.halt 500,"model building failed" unless model
- self.attributes = { :model_uri => model.uri }
- self.save!
+ algorithm = OpenTox::Algorithm::Generic.new(algorithm_uri)
+ params[:subjectid] = subjectid
+ self.model_uri = algorithm.run(params, OpenTox::SubTask.create(task, 0, 33))
+
+ #model = OpenTox::Model::PredictionModel.build(algorithm_uri, params,
+ # OpenTox::SubTask.create(task, 0, 33) )
- $sinatra.halt 500,"error after building model: model.dependent_variable != validation.prediciton_feature ("+
- model.dependentVariables.to_s+" != "+self.prediction_feature+")" if self.prediction_feature!=model.dependentVariables
+ raise "model building failed" unless model_uri
+ #self.attributes = { :model_uri => model_uri }
+ #self.save!
+
+# self.save if self.new?
+# self.update :model_uri => model_uri
+
+ #raise "error after building model: model.dependent_variable != validation.prediciton_feature ("+
+ # model.dependentVariables.to_s+" != "+self.prediction_feature+")" if self.prediction_feature!=model.dependentVariables
- validate_model
+ validate_model OpenTox::SubTask.create(task, 33, 100)
end
# validates a model
# PENDING: a new dataset is created to store the predictions, this should be optional: delete predictions afterwards yes/no
- def validate_model
+ # Validates the model at model_uri: predicts test_dataset_uri with the model,
+ # stores the prediction dataset uri and the measured runtime, then computes the statistics.
+ #
+ # task - optional task; the prediction is reported as 0-50%, the statistics as 50-100%
+ # Fills in algorithm_uri and prediction_feature from the model metadata when they are not set.
+ # Raises OpenTox::NotFoundError if the model cannot be found, if the model's dependent variable
+ # differs from prediction_feature, or if neither of the two is available;
+ # RuntimeError if validation_type is missing.
+ # Returns the result of compute_validation_stats_with_model.
+ def validate_model( task=nil )
+ raise "validation_type missing" unless self.validation_type
- LOGGER.debug "validating model '"+self.model_uri+"'"
+ # to_s: a missing model_uri must not turn into a TypeError while logging
+ LOGGER.debug "validating model '"+self.model_uri.to_s+"'"
- model = OpenTox::Model::PredictionModel.find(self.model_uri)
- $sinatra.halt 400, "model not found: "+self.model_uri.to_s unless model
+ #model = OpenTox::Model::PredictionModel.find(self.model_uri)
+ #raise OpenTox::NotFoundError.new "model not found: "+self.model_uri.to_s unless model
+ model = OpenTox::Model::Generic.find(self.model_uri, self.subjectid)
+ # fail with a meaningful error instead of a NoMethodError on model.metadata below
+ raise OpenTox::NotFoundError.new "model not found: "+self.model_uri.to_s unless model
unless self.algorithm_uri
- self.attributes = { :algorithm_uri => model.algorithm }
- self.save!
+# self.attributes = { :algorithm_uri => model.algorithm }
+# self.save!
+ #self.update :algorithm_uri => model.algorithm
+ self.algorithm_uri = model.metadata[OT.algorithm]
end
- if self.prediction_feature
- $sinatra.halt 400, "error validating model: model.dependent_variable != validation.prediciton_feature ("+
- model.dependentVariables+" != "+self.prediction_feature+")" if self.prediction_feature!=model.dependentVariables
+ if self.prediction_feature and model.uri=~/ambit2\/model/
+ LOGGER.warn "REMOVE AMBIT HACK TO __NOT__ RELY ON DEPENDENT VARIABLE"
else
- $sinatra.halt 400, "model has no dependentVariables specified, please give prediction feature for model validation" unless model.dependentVariables
- self.attributes = { :prediction_feature => model.dependentVariables }
- self.save!
+ dependentVariables = model.metadata[OT.dependentVariables]
+ if self.prediction_feature
+ raise OpenTox::NotFoundError.new "error validating model: model.dependent_variable != validation.prediction_feature ("+
+ dependentVariables.to_s+" != "+self.prediction_feature+"), model-metadata is "+model.metadata.inspect if self.prediction_feature!=dependentVariables
+ else
+ raise OpenTox::NotFoundError.new "model has no dependentVariables specified, please give prediction feature for model validation" unless dependentVariables
+ #self.attributes = { :prediction_feature => model.dependentVariables }
+ #self.save!
+ #self.update :prediction_feature => model.dependentVariables
+ self.prediction_feature = model.metadata[OT.dependentVariables]
+ end
end
prediction_dataset_uri = ""
benchmark = Benchmark.measure do
- prediction_dataset_uri = model.predict_dataset(self.test_dataset_uri)
+ #prediction_dataset_uri = model.predict_dataset(self.test_dataset_uri, OpenTox::SubTask.create(task, 0, 50))
+ prediction_dataset_uri = model.run(
+ {:dataset_uri => self.test_dataset_uri, :subjectid => self.subjectid},
+ "text/uri-list",
+ OpenTox::SubTask.create(task, 0, 50))
end
- self.attributes = { :prediction_dataset_uri => prediction_dataset_uri,
- :real_runtime => benchmark.real }
- self.save!
-
- compute_validation_stats_with_model( model )
+# self.attributes = { :prediction_dataset_uri => prediction_dataset_uri,
+# :real_runtime => benchmark.real }
+# self.save!
+# self.update :prediction_dataset_uri => prediction_dataset_uri,
+# :real_runtime => benchmark.real
+ self.prediction_dataset_uri = prediction_dataset_uri
+ self.real_runtime = benchmark.real
+
+ compute_validation_stats_with_model( model, false, OpenTox::SubTask.create(task, 50, 100) )
end
- def compute_validation_stats_with_model( model=nil )
+ # Reads dependentVariables, algorithm and predictedVariables from the model metadata
+ # and delegates to compute_validation_stats.
+ #
+ # model - optional model object; if nil, the model is looked up via model_uri
+ # dry_run, task - passed through unchanged to compute_validation_stats
+ # prediction_feature / algorithm_uri are only passed on when they are not yet set on this
+ # validation (nil otherwise).
+ # Raises OpenTox::NotFoundError if no model is available.
+ # Returns the result of compute_validation_stats.
+ def compute_validation_stats_with_model( model=nil, dry_run=false, task=nil )
- model = OpenTox::Model::PredictionModel.find(self.model_uri) if model==nil and self.model_uri
- $sinatra.halt 400, "model not found: "+self.model_uri.to_s unless model
- prediction_feature = self.prediction_feature ? nil : model.dependentVariables
- algorithm_uri = self.algorithm_uri ? nil : model.algorithm
- compute_validation_stats( model.classification?, model.predictedVariables, prediction_feature, algorithm_uri )
+ #model = OpenTox::Model::PredictionModel.find(self.model_uri) if model==nil and self.model_uri
+ #raise OpenTox::NotFoundError.new "model not found: "+self.model_uri.to_s unless model
+ model = OpenTox::Model::Generic.find(self.model_uri, self.subjectid) if model==nil and self.model_uri
+ raise OpenTox::NotFoundError.new "model not found: "+self.model_uri.to_s unless model
+
+ dependentVariables = model.metadata[OT.dependentVariables]
+ prediction_feature = self.prediction_feature ? nil : dependentVariables
+ algorithm_uri = self.algorithm_uri ? nil : model.metadata[OT.algorithm]
+ predictedVariables = model.metadata[OT.predictedVariables]
+ compute_validation_stats( model.feature_type(self.subjectid), predictedVariables,
+ prediction_feature, algorithm_uri, dry_run, task )
end
- def compute_validation_stats( classification, predicted_feature, prediction_feature=nil, algorithm_uri=nil)
+ # Creates a Lib::OTPredictions object from the test, test-target and prediction datasets and,
+ # unless dry_run is set, stores its statistics in this validation and saves it.
+ #
+ # feature_type - "classification" or "regression" selects which statistics attribute is filled;
+ # any other value stores no statistics
+ # predicted_feature - passed to Lib::OTPredictions together with the dataset uris
+ # prediction_feature, algorithm_uri - only applied if the corresponding attribute is still nil
+ # dry_run - when true, nothing is stored or saved (the predictions object is still built)
+ # task - optional task; building the predictions object is reported as 0-80%, the end as 100%
+ # Raises a RuntimeError if saving fails with DataMapper::SaveFailureError.
+ # Returns the Lib::OTPredictions object.
+ def compute_validation_stats( feature_type, predicted_feature, prediction_feature=nil,
+ algorithm_uri=nil, dry_run=false, task=nil )
- self.attributes = { :prediction_feature => prediction_feature } if self.prediction_feature==nil && prediction_feature
- self.attributes = { :algorithm_uri => algorithm_uri } if self.algorithm_uri==nil && algorithm_uri
- self.save!
+# self.attributes = { :prediction_feature => prediction_feature } if self.prediction_feature==nil && prediction_feature
+# self.attributes = { :algorithm_uri => algorithm_uri } if self.algorithm_uri==nil && algorithm_uri
+# self.save!
+# self.update :prediction_feature => prediction_feature if self.prediction_feature==nil && prediction_feature
+# self.update :algorithm_uri => algorithm_uri if self.algorithm_uri==nil && algorithm_uri
+ self.prediction_feature = prediction_feature if self.prediction_feature==nil && prediction_feature
+ self.algorithm_uri = algorithm_uri if self.algorithm_uri==nil && algorithm_uri
LOGGER.debug "computing prediction stats"
- prediction = Lib::OTPredictions.new( classification,
+ prediction = Lib::OTPredictions.new( feature_type,
self.test_dataset_uri, self.test_target_dataset_uri, self.prediction_feature,
- self.prediction_dataset_uri, predicted_feature )
- if prediction.classification?
- self.attributes = { :classification_statistics => prediction.compute_stats }
- else
- self.attributes = { :regression_statistics => prediction.compute_stats }
+ self.prediction_dataset_uri, predicted_feature, self.subjectid, OpenTox::SubTask.create(task, 0, 80) )
+ #reading datasets and computing the main stats is 80% the work
+
+ unless dry_run
+ case feature_type
+ when "classification"
+ #self.attributes = { :classification_statistics => prediction.compute_stats }
+ #self.update :classification_statistics => prediction.compute_stats
+ self.classification_statistics = prediction.compute_stats
+ when "regression"
+ #self.attributes = { :regression_statistics => prediction.compute_stats }
+ self.regression_statistics = prediction.compute_stats
+ end
+# self.attributes = { :num_instances => prediction.num_instances,
+# :num_without_class => prediction.num_without_class,
+# :percent_without_class => prediction.percent_without_class,
+# :num_unpredicted => prediction.num_unpredicted,
+# :percent_unpredicted => prediction.percent_unpredicted,
+# :finished => true}
+# self.save!
+ # the validation is marked as finished together with the instance counts
+ self.attributes= {:num_instances => prediction.num_instances,
+ :num_without_class => prediction.num_without_class,
+ :percent_without_class => prediction.percent_without_class,
+ :num_unpredicted => prediction.num_unpredicted,
+ :percent_unpredicted => prediction.percent_unpredicted,
+ :finished => true}
+ begin
+ self.save
+ rescue DataMapper::SaveFailureError => e
+ raise "could not save validation: "+e.resource.errors.inspect
+ end
end
- self.attributes = { :num_instances => prediction.num_instances,
- :num_without_class => prediction.num_without_class,
- :percent_without_class => prediction.percent_without_class,
- :num_unpredicted => prediction.num_unpredicted,
- :percent_unpredicted => prediction.percent_unpredicted }
- self.save!
+ task.progress(100) if task
+ prediction
end
end
class Crossvalidation < Lib::Crossvalidation
# constructs a crossvalidation, id and uri are set
- def initialize( params={} )
-
- $sinatra.halt 500,"do not set id manually" if params[:id]
- $sinatra.halt 500,"do not set uri manually" if params[:crossvalidation_uri]
-
- params[:num_folds] = 10 if params[:num_folds]==nil
- params[:random_seed] = 1 if params[:random_seed]==nil
- params[:stratified] = false if params[:stratified]==nil
- super params
- self.save!
- raise "internal error, crossvalidation-id not set" if self.id==nil
- self.attributes = { :crossvalidation_uri => $sinatra.url_for("/crossvalidation/"+self.id.to_s, :full) }
- self.save!
+ #def initialize( params={} )
+ #
+ # raise "do not set id manually" if params[:id]
+ # params[:num_folds] = 10 if params[:num_folds]==nil
+ # params[:random_seed] = 1 if params[:random_seed]==nil
+ # params[:stratified] = false if params[:stratified]==nil
+ # params[:finished] = false
+ # super params
+ # self.save!
+ # raise "internal error, crossvalidation-id not set" if self.id==nil
+ #end
+
+ # Runs the complete crossvalidation: first the fold datasets are provided
+ # (create_cv_datasets, reported as 0-33% of task), then one validation per fold
+ # is executed (perform_cv_validations, reported as 33-100% of task).
+ #
+ # prediction_feature - passed to create_cv_datasets
+ # algorithm_params - passed to perform_cv_validations
+ # task - optional task used for progress reporting
+ def perform_cv ( prediction_feature, algorithm_params=nil, task=nil )
+
+ create_cv_datasets( prediction_feature, OpenTox::SubTask.create(task, 0, 33) )
+ perform_cv_validations( algorithm_params, OpenTox::SubTask.create(task, 33, 100) )
end
# deletes a crossvalidation, all validations are deleted as well
def delete
+ # Every validation with this crossvalidation_id is given the subjectid of the
+ # crossvalidation and deleted, then the crossvalidation itself is destroyed.
+ # If subjectid is set, the policies of crossvalidation_uri are deleted as well
+ # (an error there is only logged). Returns a confirmation message.
- Validation.all(:crossvalidation_id => self.id).each{ |v| v.delete }
- destroy
+ Validation.all(:crossvalidation_id => self.id).each do |v|
+ v.subjectid = self.subjectid
+ v.delete
+ end
+ self.destroy
+ if (subjectid)
+ begin
+ res = OpenTox::Authorization.delete_policies_from_uri(crossvalidation_uri, subjectid)
+ LOGGER.debug "Deleted crossvalidation policy: #{res}"
+ rescue
+ LOGGER.warn "Policy delete error for crossvalidation: #{crossvalidation_uri}"
+ end
+ end
"Successfully deleted crossvalidation "+self.id.to_s+"."
end
# creates the cv folds
- # PENDING copying datasets of an equal (same dataset, same params) crossvalidation is disabled for now
- def create_cv_datasets( prediction_feature )
-
- create_new_cv_datasets( prediction_feature ) #unless copy_cv_datasets( prediction_feature )
+ # Tries to reuse the fold datasets of an earlier crossvalidation (copy_cv_datasets);
+ # only if that returns false are new folds created with create_new_cv_datasets.
+ #
+ # prediction_feature - passed to both helpers
+ # task - optional task; set to 100% directly when folds could be copied,
+ # otherwise handed to create_new_cv_datasets
+ def create_cv_datasets( prediction_feature, task=nil )
+ if copy_cv_datasets( prediction_feature )
+ # dataset folds of a previous crossvalidation could be used
+ task.progress(100) if task
+ else
+ create_new_cv_datasets( prediction_feature, task )
+ end
end
# executes the cross-validation (build models and validates them)
- def perform_cv ( algorithm_params=nil )
+ # Creates one Validation for each entry of @tmp_validations and runs validate_algorithm on it;
+ # afterwards the crossvalidation is marked as finished and saved.
+ #
+ # algorithm_params - passed to Validation#validate_algorithm
+ # task - optional task; each fold is reported as an equal share (100/num_folds) of the progress
+ # Raises a RuntimeError as soon as a validation is not finished after validate_algorithm.
+ def perform_cv_validations( algorithm_params, task=nil )
- LOGGER.debug "perform cv validations"
+ LOGGER.debug "perform cv validations "+algorithm_params.inspect
+ i = 0
+ task_step = 100 / self.num_folds.to_f;
@tmp_validations.each do | val |
validation = Validation.new val
- validation.validate_algorithm( algorithm_params )
- #break
+ validation.subjectid = self.subjectid
+ validation.validate_algorithm( algorithm_params,
+ OpenTox::SubTask.create(task, i * task_step, ( i + 1 ) * task_step) )
+ # to_s: a validation without uri must still yield this error, not a TypeError
+ raise "validation '"+validation.validation_uri.to_s+"' for crossvaldation could not be finished" unless
+ validation.finished
+ i += 1
end
+
+# self.attributes = { :finished => true }
+# self.save!
+ #self.save if self.new?
+ self.finished = true
+ self.save
end
private
@@ -203,39 +317,48 @@ module Validation
# returns true if successful, false otherwise
def copy_cv_datasets( prediction_feature )
- equal_cvs = Crossvalidation.all( { :dataset_uri => self.dataset_uri, :num_folds => self.num_folds,
- :stratified => self.stratified, :random_seed => self.random_seed } ).reject{ |cv| cv.id == self.id }
- return false if equal_cvs.size == 0
- cv = equal_cvs[0]
- Validation.all( :crossvalidation_id => cv.id ).each do |v|
-
- if self.stratified and v.prediction_feature != prediction_feature
- return false;
+ # Searches finished crossvalidations (other than this one) with the same dataset_uri,
+ # num_folds, stratified and random_seed settings. For the first candidate whose validations
+ # all use prediction_feature and whose training/test datasets still exist, the fold dataset
+ # uris are collected into @tmp_validations (one parameter hash per fold) and true is returned.
+ # Candidates the subject is not authorized to GET are skipped when AA_SERVER is set.
+ cvs = Crossvalidation.all( {
+ :dataset_uri => self.dataset_uri,
+ :num_folds => self.num_folds,
+ :stratified => self.stratified,
+ :random_seed => self.random_seed,
+ :finished => true} ).reject{ |cv| cv.id == self.id }
+ cvs.each do |cv|
+ next if AA_SERVER and !OpenTox::Authorization.authorized?(cv.crossvalidation_uri,"GET",self.subjectid)
+ tmp_val = []
+ Validation.all( :crossvalidation_id => cv.id ).each do |v|
+ # stop collecting at the first fold that cannot be reused
+ break unless
+ v.prediction_feature == prediction_feature and
+ OpenTox::Dataset.exist?(v.training_dataset_uri,self.subjectid) and
+ OpenTox::Dataset.exist?(v.test_dataset_uri,self.subjectid)
+ #make sure self.id is set
+ self.save if self.new?
+ tmp_val << { :validation_type => "crossvalidation",
+ :training_dataset_uri => v.training_dataset_uri,
+ :test_dataset_uri => v.test_dataset_uri,
+ :test_target_dataset_uri => self.dataset_uri,
+ :crossvalidation_id => self.id,
+ :crossvalidation_fold => v.crossvalidation_fold,
+ :prediction_feature => prediction_feature,
+ :algorithm_uri => self.algorithm_uri }
end
- unless (OpenTox::Dataset.find(v.training_dataset_uri) and
- OpenTox::Dataset.find(v.test_dataset_uri))
- LOGGER.debug "dataset uris obsolete, aborting copy of datasets"
- Validation.all( :crossvalidation_id => self.id ).each{ |v| v.delete }
- return false
+ # only a complete set of folds is accepted
+ if tmp_val.size == self.num_folds
+ @tmp_validations = tmp_val
+ LOGGER.debug "copied dataset uris from cv "+cv.crossvalidation_uri.to_s #+":\n"+tmp_val.inspect
+ return true
end
- validation = Validation.new :crossvalidation_id => self.id,
- :crossvalidation_fold => v.crossvalidation_fold,
- :training_dataset_uri => v.training_dataset_uri,
- :test_dataset_uri => v.test_dataset_uri,
- :algorithm_uri => self.algorithm_uri
end
- LOGGER.debug "copied dataset uris from cv "+cv.crossvalidation_uri.to_s
- return true
+ false
end
# creates cv folds (training and testdatasets)
# stores uris in validation objects
- def create_new_cv_datasets( prediction_feature )
+ def create_new_cv_datasets( prediction_feature, task = nil )
- $sinatra.halt(500,"random seed not set") unless self.random_seed
+ raise "random seed not set "+self.inspect unless self.random_seed
LOGGER.debug "creating datasets for crossvalidation"
- orig_dataset = OpenTox::Dataset.find(self.dataset_uri)
- $sinatra.halt 400, "Dataset not found: "+self.dataset_uri.to_s unless orig_dataset
+ orig_dataset = OpenTox::Dataset.find(self.dataset_uri,self.subjectid)
+ raise OpenTox::NotFoundError.new "Dataset not found: "+self.dataset_uri.to_s unless orig_dataset
shuffled_compounds = orig_dataset.compounds.shuffle( self.random_seed )
@@ -279,7 +402,7 @@ module Validation
end
LOGGER.debug "cv: num instances for each fold: "+split_compounds.collect{|c| c.size}.join(", ")
- test_features = orig_dataset.features.dclone - [prediction_feature]
+ test_features = orig_dataset.features.keys.dclone - [prediction_feature]
@tmp_validations = []
@@ -290,7 +413,7 @@ module Validation
'_f'+n.to_s+'of'+self.num_folds.to_s+
'_r'+self.random_seed.to_s+
'_s'+self.stratified.to_s
- source = $sinatra.url_for('/crossvalidation',:full)
+ source = $url_provider.url_for('/crossvalidation',:full)
test_compounds = []
train_compounds = []
@@ -305,22 +428,31 @@ module Validation
end
end
- $sinatra.halt 500,"internal error, num test compounds not correct" unless (shuffled_compounds.size/self.num_folds - test_compounds.size).abs <= 1
- $sinatra.halt 500,"internal error, num train compounds not correct" unless shuffled_compounds.size - test_compounds.size == train_compounds.size
+ raise "internal error, num test compounds not correct" unless (shuffled_compounds.size/self.num_folds - test_compounds.size).abs <= 1
+ raise "internal error, num train compounds not correct" unless shuffled_compounds.size - test_compounds.size == train_compounds.size
LOGGER.debug "training set: "+datasetname+"_train, compounds: "+train_compounds.size.to_s
- train_dataset_uri = orig_dataset.create_new_dataset( train_compounds, orig_dataset.features, datasetname + '_train', source )
+ #train_dataset_uri = orig_dataset.create_new_dataset( train_compounds, orig_dataset.features, datasetname + '_train', source )
+ train_dataset_uri = orig_dataset.split( train_compounds, orig_dataset.features.keys,
+ { DC.title => datasetname + '_train', DC.creator => source }, self.subjectid ).uri
LOGGER.debug "test set: "+datasetname+"_test, compounds: "+test_compounds.size.to_s
- test_dataset_uri = orig_dataset.create_new_dataset( test_compounds, test_features, datasetname + '_test', source )
-
- tmp_validation = { :training_dataset_uri => train_dataset_uri,
+ #test_dataset_uri = orig_dataset.create_new_dataset( test_compounds, test_features, datasetname + '_test', source )
+ test_dataset_uri = orig_dataset.split( test_compounds, test_features,
+ { DC.title => datasetname + '_test', DC.creator => source }, self.subjectid ).uri
+
+ #make sure self.id is set
+ self.save if self.new?
+ tmp_validation = { :validation_type => "crossvalidation",
+ :training_dataset_uri => train_dataset_uri,
:test_dataset_uri => test_dataset_uri,
:test_target_dataset_uri => self.dataset_uri,
:crossvalidation_id => self.id, :crossvalidation_fold => n,
:prediction_feature => prediction_feature,
:algorithm_uri => self.algorithm_uri }
@tmp_validations << tmp_validation
+
+ task.progress( n / self.num_folds.to_f * 100 ) if task
end
end
end
@@ -328,27 +460,116 @@ module Validation
module Util
+ # splits a dataset into test and training dataset via bootstrapping
+ # (training dataset-size is n, sampling from orig dataset with replacement)
+ # returns map with training_dataset_uri and test_dataset_uri
+ # orig_dataset_uri - uri of the dataset to sample from
+ # prediction_feature - feature that is removed from the test dataset; must be part of the dataset if given
+ # subjectid - passed to all dataset operations
+ # random_seed - seed for the sampling, defaults to 1
+ # task - optional task, progress is set to 33/66/100
+ # Raises OpenTox::NotFoundError if the dataset or the prediction feature cannot be found,
+ # if the dataset has less than 2 compounds, or if a compound has more than one value
+ # for the prediction feature.
+ def self.bootstrapping( orig_dataset_uri, prediction_feature, subjectid, random_seed=nil, task=nil )
+
+ random_seed=1 unless random_seed
+
+ orig_dataset = OpenTox::Dataset.find orig_dataset_uri,subjectid
+ # check for a missing dataset before using it (calling load_all on nil would raise NoMethodError)
+ raise OpenTox::NotFoundError.new "Dataset not found: "+orig_dataset_uri.to_s unless orig_dataset
+ orig_dataset.load_all subjectid
+ if prediction_feature
+ raise OpenTox::NotFoundError.new "Prediction feature '"+prediction_feature.to_s+
+ "' not found in dataset, features are: \n"+
+ orig_dataset.features.inspect unless orig_dataset.features.include?(prediction_feature)
+ else
+ LOGGER.warn "no prediciton feature given, all features included in test dataset"
+ end
+
+ compounds = orig_dataset.compounds
+ raise OpenTox::NotFoundError.new "Cannot split datset, num compounds in dataset < 2 ("+compounds.size.to_s+")" if compounds.size<2
+
+ # NOTE(review): this lookup presumably fails when prediction_feature is nil or a compound
+ # has no value for it - confirm whether that case can occur
+ compounds.each do |c|
+ raise OpenTox::NotFoundError.new "Bootstrapping not yet implemented for duplicate compounds" if
+ orig_dataset.data_entries[c][prediction_feature].size > 1
+ end
+
+ srand random_seed.to_i
+ # draw training compounds with replacement; compounds never drawn form the test set,
+ # repeat with a new seed until the test set is not empty
+ while true
+ training_compounds = []
+ compounds.size.times do
+ training_compounds << compounds[rand(compounds.size)]
+ end
+ test_compounds = []
+ compounds.each do |c|
+ test_compounds << c unless training_compounds.include?(c)
+ end
+ if test_compounds.size > 0
+ break
+ else
+ srand rand(10000)
+ end
+ end
+
+ LOGGER.debug "bootstrapping on dataset "+orig_dataset_uri+
+ " into training ("+training_compounds.size.to_s+") and test ("+test_compounds.size.to_s+")"+
+ ", duplicates in training dataset: "+test_compounds.size.to_s
+ task.progress(33) if task
+
+ result = {}
+# result[:training_dataset_uri] = orig_dataset.create_new_dataset( training_compounds,
+# orig_dataset.features,
+# "Bootstrapping training dataset of "+orig_dataset.title.to_s,
+# $sinatra.url_for('/bootstrapping',:full) )
+ result[:training_dataset_uri] = orig_dataset.split( training_compounds,
+ orig_dataset.features.keys,
+ { DC.title => "Bootstrapping training dataset of "+orig_dataset.title.to_s,
+ DC.creator => $url_provider.url_for('/bootstrapping',:full) },
+ subjectid ).uri
+ task.progress(66) if task
+
+# result[:test_dataset_uri] = orig_dataset.create_new_dataset( test_compounds,
+# orig_dataset.features.dclone - [prediction_feature],
+# "Bootstrapping test dataset of "+orig_dataset.title.to_s,
+# $sinatra.url_for('/bootstrapping',:full) )
+ result[:test_dataset_uri] = orig_dataset.split( test_compounds,
+ orig_dataset.features.keys.dclone - [prediction_feature],
+ { DC.title => "Bootstrapping test dataset of "+orig_dataset.title.to_s,
+ DC.creator => $url_provider.url_for('/bootstrapping',:full)} ,
+ subjectid ).uri
+ task.progress(100) if task
+
+ # additional sanity checks, only performed in test/debug environments
+ if ENV['RACK_ENV'] =~ /test|debug/
+ training_dataset = OpenTox::Dataset.find result[:training_dataset_uri],subjectid
+ raise OpenTox::NotFoundError.new "Training dataset not found: '"+result[:training_dataset_uri].to_s+"'" unless training_dataset
+ training_dataset.load_all
+ value_count = 0
+ training_dataset.compounds.each do |c|
+ value_count += training_dataset.data_entries[c][prediction_feature].size
+ end
+ raise "training compounds error" unless value_count==training_compounds.size
+ raise OpenTox::NotFoundError.new "Test dataset not found: '"+result[:test_dataset_uri].to_s+"'" unless
+ OpenTox::Dataset.find result[:test_dataset_uri], subjectid
+ end
+ LOGGER.debug "bootstrapping done, training dataset: '"+result[:training_dataset_uri].to_s+"', test dataset: '"+result[:test_dataset_uri].to_s+"'"
+
+ return result
+ end
+
# splits a dataset into test and training dataset
# returns map with training_dataset_uri and test_dataset_uri
- def self.train_test_dataset_split( orig_dataset_uri, prediction_feature, split_ratio=nil, random_seed=nil )
+ # orig_dataset_uri - uri of the dataset to split
+ # prediction_feature - feature that is removed from the test dataset; must be part of the dataset if given
+ # subjectid - passed to all dataset operations
+ # split_ratio - share of compounds used for training, must be >0 and <1, defaults to 0.67
+ # random_seed - seed for shuffling the compounds, defaults to 1
+ # task - optional task, progress is set to 33/66/100
+ # Raises OpenTox::NotFoundError if the dataset or the prediction feature cannot be found or the
+ # split ratio is invalid, OpenTox::BadRequestError if the dataset has less than 2 compounds.
+ def self.train_test_dataset_split( orig_dataset_uri, prediction_feature, subjectid, split_ratio=nil, random_seed=nil, task=nil )
split_ratio=0.67 unless split_ratio
random_seed=1 unless random_seed
- orig_dataset = OpenTox::Dataset.find orig_dataset_uri
- $sinatra.halt 400, "Dataset not found: "+orig_dataset_uri.to_s unless orig_dataset
- $sinatra.halt 400, "Split ratio invalid: "+split_ratio.to_s unless split_ratio and split_ratio=split_ratio.to_f
- $sinatra.halt 400, "Split ratio not >0 and <1 :"+split_ratio.to_s unless split_ratio>0 && split_ratio<1
+ orig_dataset = OpenTox::Dataset.find orig_dataset_uri, subjectid
+ # check for a missing dataset before using it (calling load_all on nil would raise NoMethodError)
+ raise OpenTox::NotFoundError.new "Dataset not found: "+orig_dataset_uri.to_s unless orig_dataset
+ orig_dataset.load_all subjectid
+ raise OpenTox::NotFoundError.new "Split ratio invalid: "+split_ratio.to_s unless split_ratio and split_ratio=split_ratio.to_f
+ raise OpenTox::NotFoundError.new "Split ratio not >0 and <1 :"+split_ratio.to_s unless split_ratio>0 && split_ratio<1
if prediction_feature
- $sinatra.halt 400, "Prediction feature '"+prediction_feature.to_s+
+ raise OpenTox::NotFoundError.new "Prediction feature '"+prediction_feature.to_s+
"' not found in dataset, features are: \n"+
- orig_dataset.features.inspect unless orig_dataset.features.include?(prediction_feature)
+ orig_dataset.features.keys.inspect unless orig_dataset.features.include?(prediction_feature)
else
LOGGER.warn "no prediciton feature given, all features included in test dataset"
end
compounds = orig_dataset.compounds
- $sinatra.halt 400, "Cannot split datset, num compounds in dataset < 2 ("+compounds.size.to_s+")" if compounds.size<2
+ raise OpenTox::BadRequestError.new "Cannot split datset, num compounds in dataset < 2 ("+compounds.size.to_s+")" if compounds.size<2
split = (compounds.size*split_ratio).to_i
split = [split,1].max
split = [split,compounds.size-2].min
@@ -356,24 +577,56 @@ module Validation
LOGGER.debug "splitting dataset "+orig_dataset_uri+
" into train:0-"+split.to_s+" and test:"+(split+1).to_s+"-"+(compounds.size-1).to_s+
" (shuffled with seed "+random_seed.to_s+")"
-
compounds.shuffle!( random_seed )
+ task.progress(33) if task
result = {}
- result[:training_dataset_uri] = orig_dataset.create_new_dataset( compounds[0..split],
- orig_dataset.features,
- "Training dataset split of "+orig_dataset.title.to_s,
- $sinatra.url_for('/training_test_split',:full) )
- result[:test_dataset_uri] = orig_dataset.create_new_dataset( compounds[(split+1)..-1],
- orig_dataset.features.dclone - [prediction_feature],
- "Test dataset split of "+orig_dataset.title.to_s,
- $sinatra.url_for('/training_test_split',:full) )
-
- $sinatra.halt 400, "Training dataset not found: '"+result[:training_dataset_uri].to_s+"'" unless OpenTox::Dataset.find result[:training_dataset_uri]
- $sinatra.halt 400, "Test dataset not found: '"+result[:test_dataset_uri].to_s+"'" unless OpenTox::Dataset.find result[:test_dataset_uri]
+# result[:training_dataset_uri] = orig_dataset.create_new_dataset( compounds[0..split],
+# orig_dataset.features,
+# "Training dataset split of "+orig_dataset.title.to_s,
+# $sinatra.url_for('/training_test_split',:full) )
+
+# orig_dataset.data_entries.each do |k,v|
+# puts k.inspect+" =>"+v.inspect
+# puts v.values[0].to_s+" "+v.values[0].class.to_s
+# end
+
+ result[:training_dataset_uri] = orig_dataset.split( compounds[0..split],
+ orig_dataset.features.keys,
+ { DC.title => "Training dataset split of "+orig_dataset.title.to_s,
+ DC.creator => $url_provider.url_for('/training_test_split',:full) },
+ subjectid ).uri
+ task.progress(66) if task
+
+# d = OpenTox::Dataset.find(result[:training_dataset_uri])
+# d.data_entries.values.each do |v|
+# puts v.inspect
+# puts v.values[0].to_s+" "+v.values[0].class.to_s
+# end
+# raise "stop here"
+
+# result[:test_dataset_uri] = orig_dataset.create_new_dataset( compounds[(split+1)..-1],
+# orig_dataset.features.dclone - [prediction_feature],
+# "Test dataset split of "+orig_dataset.title.to_s,
+# $sinatra.url_for('/training_test_split',:full) )
+ result[:test_dataset_uri] = orig_dataset.split( compounds[(split+1)..-1],
+ orig_dataset.features.keys.dclone - [prediction_feature],
+ { DC.title => "Test dataset split of "+orig_dataset.title.to_s,
+ DC.creator => $url_provider.url_for('/training_test_split',:full) },
+ subjectid ).uri
+ task.progress(100) if task
+
+ # additional sanity checks, only performed in test/debug environments
+ if ENV['RACK_ENV'] =~ /test|debug/
+ raise OpenTox::NotFoundError.new "Training dataset not found: '"+result[:training_dataset_uri].to_s+"'" unless
+ OpenTox::Dataset.find(result[:training_dataset_uri],subjectid)
+ test_data = OpenTox::Dataset.find result[:test_dataset_uri],subjectid
+ raise OpenTox::NotFoundError.new "Test dataset not found: '"+result[:test_dataset_uri].to_s+"'" unless test_data
+ test_data.load_compounds subjectid
+ raise "Test dataset num coumpounds != "+(compounds.size-split-1).to_s+", instead: "+
+ test_data.compounds.size.to_s+"\n"+test_data.to_yaml unless test_data.compounds.size==(compounds.size-1-split)
+ end
LOGGER.debug "split done, training dataset: '"+result[:training_dataset_uri].to_s+"', test dataset: '"+result[:test_dataset_uri].to_s+"'"
-
return result
end