summaryrefslogtreecommitdiff
path: root/validation/validation_service.rb
diff options
context:
space:
mode:
Diffstat (limited to 'validation/validation_service.rb')
-rwxr-xr-xvalidation/validation_service.rb277
1 files changed, 146 insertions, 131 deletions
diff --git a/validation/validation_service.rb b/validation/validation_service.rb
index 8dc90e2..614363d 100755
--- a/validation/validation_service.rb
+++ b/validation/validation_service.rb
@@ -38,32 +38,13 @@ module Validation
crossvalidation = Crossvalidation.get(cv_id)
raise OpenTox::NotFoundError.new "Crossvalidation '#{cv_id}' not found." unless crossvalidation
raise OpenTox::BadRequestError.new "Crossvalidation '"+cv_id.to_s+"' not finished" unless crossvalidation.finished
-
vals = Validation.find( :crossvalidation_id => cv_id, :validation_type => "crossvalidation" ).collect{|x| x}
- models = vals.collect{|v| OpenTox::Model::Generic.find(v.model_uri, subjectid)}
- feature_type = models.first.feature_type(subjectid)
- test_dataset_uris = vals.collect{|v| v.test_dataset_uri}
- test_target_dataset_uris = vals.collect{|v| v.test_target_dataset_uri}
- prediction_feature = vals.first.prediction_feature
- prediction_dataset_uris = vals.collect{|v| v.prediction_dataset_uri}
- predicted_variables = models.collect{|m| m.predicted_variable(subjectid)}
- predicted_confidences = models.collect{|m| m.predicted_confidence(subjectid)}
- prediction = Lib::OTPredictions.new( feature_type, test_dataset_uris, test_target_dataset_uris, prediction_feature,
- prediction_dataset_uris, predicted_variables, predicted_confidences, subjectid, OpenTox::SubTask.create(waiting_task, 0, 90) )
-
+
v = Validation.new
- case feature_type
- when "classification"
- v.classification_statistics = prediction.compute_stats
- when "regression"
- v.regression_statistics = prediction.compute_stats
- end
- v.update :num_instances => prediction.num_instances,
- :num_without_class => prediction.num_without_class,
- :percent_without_class => prediction.percent_without_class,
- :num_unpredicted => prediction.num_unpredicted,
- :percent_unpredicted => prediction.percent_unpredicted,
- :finished => true
+ v.subjectid = subjectid
+ v.compute_prediction_data_with_cv(vals, waiting_task)
+ v.compute_validation_stats()
+
(VAL_PROPS_GENERAL-[:validation_uri]).each do |p|
v.send("#{p.to_s}=".to_sym, vals.collect{ |vv| vv.send(p) }.uniq.join(";"))
end
@@ -74,6 +55,7 @@ module Validation
v.real_runtime = vals.collect{ |vv| vv.real_runtime }.uniq.join(";")
v.save
end
+ v.subjectid = subjectid
waiting_task.progress(100) if waiting_task
v
end
@@ -199,13 +181,26 @@ module Validation
self.prediction_dataset_uri = prediction_dataset_uri
self.real_runtime = benchmark.real
- compute_validation_stats_with_model( model, false, OpenTox::SubTask.create(task, 50, 100) )
+ compute_prediction_data_with_model( model, OpenTox::SubTask.create(task, 50, 100) )
+ compute_validation_stats()
end
-
- def compute_validation_stats_with_model( model=nil, dry_run=false, task=nil )
-
- #model = OpenTox::Model::PredictionModel.find(self.model_uri) if model==nil and self.model_uri
- #raise OpenTox::NotFoundError.new "model not found: "+self.model_uri.to_s unless model
+
+ def compute_prediction_data_with_cv(cv_vals, waiting_task=nil)
+ models = cv_vals.collect{|v| OpenTox::Model::Generic.find(v.model_uri, subjectid)}
+ feature_type = models.first.feature_type(subjectid)
+ test_dataset_uris = cv_vals.collect{|v| v.test_dataset_uri}
+ test_target_dataset_uris = cv_vals.collect{|v| v.test_target_dataset_uri}
+ prediction_feature = cv_vals.first.prediction_feature
+ prediction_dataset_uris = cv_vals.collect{|v| v.prediction_dataset_uri}
+ predicted_variables = models.collect{|m| m.predicted_variable(subjectid)}
+ predicted_confidences = models.collect{|m| m.predicted_confidence(subjectid)}
+ p_data = Lib::PredictionData.create( feature_type, test_dataset_uris, test_target_dataset_uris, prediction_feature,
+ prediction_dataset_uris, predicted_variables, predicted_confidences, subjectid, waiting_task )
+ self.prediction_data = p_data.data
+ p_data.data
+ end
+
+ def compute_prediction_data_with_model(model=nil, task=nil)
model = OpenTox::Model::Generic.find(self.model_uri, self.subjectid) if model==nil and self.model_uri
raise OpenTox::NotFoundError.new "model not found: "+self.model_uri.to_s unless model
@@ -218,55 +213,88 @@ module Validation
raise "cannot determine whether model '"+model.uri.to_s+"' performs classification or regression, "+
"please set rdf-type of predictedVariables feature '"+predicted_variable.to_s+
"' to NominalFeature or NumericFeature" if (feature_type.to_s!="classification" and feature_type.to_s!="regression")
- compute_validation_stats( feature_type, predicted_variable, predicted_confidence,
- prediction_feature, algorithm_uri, dry_run, task )
+ compute_prediction_data( feature_type, predicted_variable, predicted_confidence,
+ prediction_feature, algorithm_uri, task )
end
-
- def compute_validation_stats( feature_type, predicted_variable, predicted_confidence, prediction_feature,
- algorithm_uri, dry_run, task )
-
-# self.attributes = { :prediction_feature => prediction_feature } if self.prediction_feature==nil && prediction_feature
-# self.attributes = { :algorithm_uri => algorithm_uri } if self.algorithm_uri==nil && algorithm_uri
-# self.save!
-# self.update :prediction_feature => prediction_feature if self.prediction_feature==nil && prediction_feature
-# self.update :algorithm_uri => algorithm_uri if self.algorithm_uri==nil && algorithm_uri
+
+ def compute_prediction_data( feature_type, predicted_variable, predicted_confidence, prediction_feature,
+ algorithm_uri, task )
self.prediction_feature = prediction_feature if self.prediction_feature==nil && prediction_feature
self.algorithm_uri = algorithm_uri if self.algorithm_uri==nil && algorithm_uri
-
+
LOGGER.debug "computing prediction stats"
- prediction = Lib::OTPredictions.new( feature_type,
+ p_data = Lib::PredictionData.create( feature_type,
self.test_dataset_uri, self.test_target_dataset_uri, self.prediction_feature,
- self.prediction_dataset_uri, predicted_variable, predicted_confidence, self.subjectid, OpenTox::SubTask.create(task, 0, 80) )
- #reading datasets and computing the main stats is 80% the work
-
- unless dry_run
- case feature_type
- when "classification"
- #self.attributes = { :classification_statistics => prediction.compute_stats }
- #self.update :classification_statistics => prediction.compute_stats
- self.classification_statistics = prediction.compute_stats
- when "regression"
- #self.attributes = { :regression_statistics => prediction.compute_stats }
- self.regression_statistics = prediction.compute_stats
- end
-# self.attributes = { :num_instances => prediction.num_instances,
-# :num_without_class => prediction.num_without_class,
-# :percent_without_class => prediction.percent_without_class,
-# :num_unpredicted => prediction.num_unpredicted,
-# :percent_unpredicted => prediction.percent_unpredicted,
-# :finished => true}
-# self.save!
- self.update :num_instances => prediction.num_instances,
- :num_without_class => prediction.num_without_class,
- :percent_without_class => prediction.percent_without_class,
- :num_unpredicted => prediction.num_unpredicted,
- :percent_unpredicted => prediction.percent_unpredicted,
- :finished => true
+ self.prediction_dataset_uri, predicted_variable, predicted_confidence, self.subjectid,
+ OpenTox::SubTask.create(task, 0, 80) )
+ self.prediction_data = p_data.data
+ task.progress(100) if task
+ p_data.data
+ end
+
+ def compute_validation_stats( save_stats=true )
+ p_data = self.prediction_data
+ raise "compute prediction data before" if p_data==nil
+ predictions = Lib::OTPredictions.new(p_data)
+ case p_data[:feature_type]
+ when "classification"
+ self.classification_statistics = predictions.compute_stats()
+ when "regression"
+ self.regression_statistics = predictions.compute_stats()
+ end
+ self.num_instances = predictions.num_instances
+ self.num_without_class = predictions.num_without_class
+ self.percent_without_class = predictions.percent_without_class
+ self.num_unpredicted = predictions.num_unpredicted
+ self.percent_unpredicted = predictions.percent_unpredicted
+ if (save_stats)
+ self.finished = true
+ self.save
raise unless self.valid?
end
+ end
+
+ def filter_predictions( min_confidence, min_num_predictions, max_num_predictions, prediction=nil )
+ self.prediction_data = nil
+ self.save
- task.progress(100) if task
- prediction
+ raise OpenTox::BadRequestError.new "only supported for classification" if prediction!=nil and classification_statistics==nil
+ raise OpenTox::BadRequestError.new "illegal confidence value #{min_confidence}" unless
+ min_confidence==nil or (min_confidence.is_a?(Numeric) and min_confidence>=0 and min_confidence<=1)
+ p_data = self.prediction_data
+ if p_data==nil
+ # this is to ensure backwards compatibility
+ # may cause a timeout on the first run, as this is not meant to run in a task
+ if validation_type=="crossvalidation_statistics"
+ vals = Validation.find( :crossvalidation_id => self.crossvalidation_id, :validation_type => "crossvalidation" ).collect{|x| x}
+ compute_prediction_data_with_cv(vals)
+ else
+ compute_prediction_data_with_model
+ end
+ self.save
+ p_data = self.prediction_data
+ end
+ raise OpenTox::BadRequestError.new("illegal prediction value: '"+prediction+"', available: "+
+ p_data[:accept_values].inspect) if prediction!=nil and p_data[:accept_values].index(prediction)==nil
+ p = Lib::PredictionData.filter_data(p_data, nil, min_confidence, min_num_predictions, max_num_predictions,
+ prediction==nil ? nil : p_data[:accept_values].index(prediction))
+ self.prediction_data = p.data
+ compute_validation_stats(false)
+ end
+
+ def probabilities( confidence, prediction )
+ filter_predictions( confidence, 12, nil, prediction )
+ p_data = self.prediction_data
+ p = Lib::Predictions.new(p_data)
+ prediction_counts = p.confusion_matrix_row( p_data[:accept_values].index(prediction) )
+ sum = 0
+ prediction_counts.each{|v| sum+=v}
+ probs = {}
+ p_data[:accept_values].size.times do |i|
+ probs[p_data[:accept_values][i]] = prediction_counts[i]/sum.to_f
+ end
+ probs
+ {:probs => probs, :num_predictions => sum, :min_confidence => p.min_confidence}
end
end
@@ -590,17 +618,17 @@ module Validation
# splits a dataset into test and training dataset
# returns map with training_dataset_uri and test_dataset_uri
- def self.train_test_dataset_split( orig_dataset_uri, prediction_feature, subjectid, split_ratio=nil, random_seed=nil, task=nil )
+ def self.train_test_dataset_split( orig_dataset_uri, prediction_feature, subjectid, stratified=false, split_ratio=nil, random_seed=nil, task=nil )
split_ratio=0.67 unless split_ratio
split_ratio = split_ratio.to_f
random_seed=1 unless random_seed
random_seed = random_seed.to_i
+ raise OpenTox::NotFoundError.new "Split ratio invalid: "+split_ratio.to_s unless split_ratio and split_ratio=split_ratio.to_f
+ raise OpenTox::NotFoundError.new "Split ratio not >0 and <1 :"+split_ratio.to_s unless split_ratio>0 && split_ratio<1
orig_dataset = Lib::DatasetCache.find orig_dataset_uri, subjectid
orig_dataset.load_all subjectid
raise OpenTox::NotFoundError.new "Dataset not found: "+orig_dataset_uri.to_s unless orig_dataset
- raise OpenTox::NotFoundError.new "Split ratio invalid: "+split_ratio.to_s unless split_ratio and split_ratio=split_ratio.to_f
- raise OpenTox::NotFoundError.new "Split ratio not >0 and <1 :"+split_ratio.to_s unless split_ratio>0 && split_ratio<1
if prediction_feature
raise OpenTox::NotFoundError.new "Prediction feature '"+prediction_feature.to_s+
"' not found in dataset, features are: \n"+
@@ -609,66 +637,53 @@ module Validation
LOGGER.warn "no prediciton feature given, all features included in test dataset"
end
- compounds = orig_dataset.compounds
- raise OpenTox::BadRequestError.new "Cannot split datset, num compounds in dataset < 2 ("+compounds.size.to_s+")" if compounds.size<2
- split = (compounds.size*split_ratio).to_i
- split = [split,1].max
- split = [split,compounds.size-2].min
-
- LOGGER.debug "splitting dataset "+orig_dataset_uri+
+ if stratified
+ r_util = OpenTox::RUtil.new
+ split_sets = r_util.stratified_split( orig_dataset, "NA", df, split_ratio, random_seed )
+ r_util.quit_r
+ result = {:training_dataset_uri => split_sets[0], :test_dataset_uri => split_sets[1]}
+ else
+ compounds = orig_dataset.compounds
+ raise OpenTox::BadRequestError.new "Cannot split datset, num compounds in dataset < 2 ("+compounds.size.to_s+")" if compounds.size<2
+ split = (compounds.size*split_ratio).to_i
+ split = [split,1].max
+ split = [split,compounds.size-2].min
+ LOGGER.debug "splitting dataset "+orig_dataset_uri+
" into train:0-"+split.to_s+" and test:"+(split+1).to_s+"-"+(compounds.size-1).to_s+
" (shuffled with seed "+random_seed.to_s+")"
- compounds.shuffle!( random_seed )
- task.progress(33) if task
-
- result = {}
-# result[:training_dataset_uri] = orig_dataset.create_new_dataset( compounds[0..split],
-# orig_dataset.features,
-# "Training dataset split of "+orig_dataset.title.to_s,
-# $sinatra.url_for('/training_test_split',:full) )
-
-# orig_dataset.data_entries.each do |k,v|
-# puts k.inspect+" =>"+v.inspect
-# puts v.values[0].to_s+" "+v.values[0].class.to_s
-# end
-
- result[:training_dataset_uri] = orig_dataset.split( compounds[0..split],
- orig_dataset.features.keys,
- { DC.title => "Training dataset split of "+orig_dataset.title.to_s,
- DC.creator => $url_provider.url_for('/training_test_split',:full) },
- subjectid ).uri
- task.progress(66) if task
-
-# d = Lib::DatasetCache.find(result[:training_dataset_uri])
-# d.data_entries.values.each do |v|
-# puts v.inspect
-# puts v.values[0].to_s+" "+v.values[0].class.to_s
-# end
-# raise "stop here"
-
-# result[:test_dataset_uri] = orig_dataset.create_new_dataset( compounds[(split+1)..-1],
-# orig_dataset.features.dclone - [prediction_feature],
-# "Test dataset split of "+orig_dataset.title.to_s,
-# $sinatra.url_for('/training_test_split',:full) )
- result[:test_dataset_uri] = orig_dataset.split( compounds[(split+1)..-1],
- orig_dataset.features.keys.dclone - [prediction_feature],
- { DC.title => "Test dataset split of "+orig_dataset.title.to_s,
- DC.creator => $url_provider.url_for('/training_test_split',:full) },
- subjectid ).uri
- task.progress(100) if task
-
- if ENV['RACK_ENV'] =~ /test|debug/
- raise OpenTox::NotFoundError.new "Training dataset not found: '"+result[:training_dataset_uri].to_s+"'" unless
- Lib::DatasetCache.find(result[:training_dataset_uri],subjectid)
- test_data = Lib::DatasetCache.find result[:test_dataset_uri],subjectid
- raise OpenTox::NotFoundError.new "Test dataset not found: '"+result[:test_dataset_uri].to_s+"'" unless test_data
- test_data.load_compounds subjectid
- raise "Test dataset num coumpounds != "+(compounds.size-split-1).to_s+", instead: "+
- test_data.compounds.size.to_s+"\n"+test_data.to_yaml unless test_data.compounds.size==(compounds.size-1-split)
+ compounds.shuffle!( random_seed )
+ training_compounds = compounds[0..split]
+ test_compounds = compounds[(split+1)..-1]
+ task.progress(33) if task
+
+ result = {}
+ result[:training_dataset_uri] = orig_dataset.split( training_compounds,
+ orig_dataset.features.keys,
+ { DC.title => "Training dataset split of "+orig_dataset.title.to_s,
+ DC.creator => $url_provider.url_for('/training_test_split',:full) },
+ subjectid ).uri
+ task.progress(66) if task
+
+ result[:test_dataset_uri] = orig_dataset.split( test_compounds,
+ orig_dataset.features.keys.dclone - [prediction_feature],
+ { DC.title => "Test dataset split of "+orig_dataset.title.to_s,
+ DC.creator => $url_provider.url_for('/training_test_split',:full) },
+ subjectid ).uri
+ task.progress(100) if task
+
+ if !stratified and ENV['RACK_ENV'] =~ /test|debug/
+ raise OpenTox::NotFoundError.new "Training dataset not found: '"+result[:training_dataset_uri].to_s+"'" unless
+ Lib::DatasetCache.find(result[:training_dataset_uri],subjectid)
+ test_data = Lib::DatasetCache.find result[:test_dataset_uri],subjectid
+ raise OpenTox::NotFoundError.new "Test dataset not found: '"+result[:test_dataset_uri].to_s+"'" unless test_data
+ test_data.load_compounds subjectid
+ raise "Test dataset num coumpounds != "+(compounds.size-split-1).to_s+", instead: "+
+ test_data.compounds.size.to_s+"\n"+test_data.to_yaml unless test_data.compounds.size==(compounds.size-1-split)
+ end
+
+ LOGGER.debug "split done, training dataset: '"+result[:training_dataset_uri].to_s+"', test dataset: '"+result[:test_dataset_uri].to_s+"'"
end
-
- LOGGER.debug "split done, training dataset: '"+result[:training_dataset_uri].to_s+"', test dataset: '"+result[:test_dataset_uri].to_s+"'"
- return result
+ result
end
end