From 78201e10a1f4695c65f431d07512188363f897fb Mon Sep 17 00:00:00 2001 From: mguetlein Date: Tue, 20 Nov 2012 12:14:13 +0100 Subject: adjust validation to dataset changes, remove test_target_dataset concept (never worked for multiple compound occurrences with different values), fix splitting (multiple compound occurrences are now split into different sets as well) --- example.rb | 1 - lib/prediction_data.rb | 138 ++++++++--------------------- lib/validation_db.rb | 4 +- report/plot_factory.rb | 9 +- report/validation_access.rb | 15 ++-- test/test_examples.rb | 4 - test/test_examples_util.rb | 7 +- validation/validation_application.rb | 12 +-- validation/validation_service.rb | 164 ++++++++++++++--------------------- validation/validation_test.rb | 25 ++++-- 10 files changed, 138 insertions(+), 241 deletions(-) diff --git a/example.rb b/example.rb index 636579e..11af160 100755 --- a/example.rb +++ b/example.rb @@ -96,7 +96,6 @@ class Example v = Validation::Validation.new :training_dataset_uri => split_params[:training_dataset_uri], :validation_type => "test_set_validation", :test_dataset_uri => split_params[:test_dataset_uri], - :test_target_dataset_uri => data_uri, :prediction_feature => URI.decode(@@feature), :algorithm_uri => @@alg v.validate_algorithm( @@alg_params, OpenTox::SubTask.new(task, 20, 40) ) diff --git a/lib/prediction_data.rb b/lib/prediction_data.rb index d387d24..f42cd9f 100644 --- a/lib/prediction_data.rb +++ b/lib/prediction_data.rb @@ -1,6 +1,7 @@ module Lib + class PredictionData CHECK_VALUES = ENV['RACK_ENV'] =~ /debug|test/ @@ -53,17 +54,14 @@ module Lib @compounds end - def self.create( feature_type, test_dataset_uris, test_target_dataset_uris, - prediction_feature, prediction_dataset_uris, predicted_variables, predicted_confidences, - subjectid=nil, task=nil ) + def self.create( feature_type, test_dataset_uris, prediction_feature, prediction_dataset_uris, + predicted_variables, predicted_confidences, subjectid=nil, task=nil ) test_dataset_uris = [test_dataset_uris] unless test_dataset_uris.is_a?(Array) - test_target_dataset_uris = [test_target_dataset_uris] unless test_target_dataset_uris.is_a?(Array) prediction_dataset_uris = [prediction_dataset_uris] unless prediction_dataset_uris.is_a?(Array) predicted_variables = [predicted_variables] unless predicted_variables.is_a?(Array) predicted_confidences = [predicted_confidences] unless predicted_confidences.is_a?(Array) LOGGER.debug "loading prediction -- test-dataset: "+test_dataset_uris.inspect - LOGGER.debug "loading prediction -- test-target-datset: "+test_target_dataset_uris.inspect LOGGER.debug "loading prediction -- prediction-dataset: "+prediction_dataset_uris.inspect LOGGER.debug "loading prediction -- predicted_variable: "+predicted_variables.inspect LOGGER.debug "loading prediction -- predicted_confidence: "+predicted_confidences.inspect @@ -84,7 +82,6 @@ module Lib test_dataset_uris.size.times do |i| test_dataset_uri = test_dataset_uris[i] - test_target_dataset_uri = test_target_dataset_uris[i] prediction_dataset_uri = prediction_dataset_uris[i] predicted_variable = predicted_variables[i] predicted_confidence = predicted_confidences[i] @@ -94,35 +91,18 @@ module Lib test_dataset = Lib::DatasetCache.find test_dataset_uri,subjectid raise "test dataset not found: '"+test_dataset_uri.to_s+"'" unless test_dataset - if test_target_dataset_uri == nil || test_target_dataset_uri.strip.size==0 || test_target_dataset_uri==test_dataset_uri - test_target_dataset_uri = test_dataset_uri - test_target_dataset = test_dataset - 
raise "prediction_feature not found in test_dataset, specify a test_target_dataset\n"+ - "prediction_feature: '"+prediction_feature.to_s+"'\n"+ - "test_dataset: '"+test_target_dataset_uri.to_s+"'\n"+ - "available features are: "+test_target_dataset.features.inspect if test_target_dataset.features.keys.index(prediction_feature)==nil - else - test_target_dataset = Lib::DatasetCache.find test_target_dataset_uri,subjectid - raise "test target datset not found: '"+test_target_dataset_uri.to_s+"'" unless test_target_dataset - if CHECK_VALUES - test_dataset.compounds.each do |c| - raise "test compound not found on test class dataset "+c.to_s unless test_target_dataset.compounds.include?(c) - end - end - raise "prediction_feature not found in test_target_dataset\n"+ - "prediction_feature: '"+prediction_feature.to_s+"'\n"+ - "test_target_dataset: '"+test_target_dataset_uri.to_s+"'\n"+ - "available features are: "+test_target_dataset.features.inspect if test_target_dataset.features.keys.index(prediction_feature)==nil - end + raise "prediction_feature not found in test_dataset\n"+ + "prediction_feature: '"+prediction_feature.to_s+"'\n"+ + "test_dataset: '"+test_dataset_uri.to_s+"'\n"+ + "available features are: "+test_dataset.features.inspect if test_dataset.features.keys.index(prediction_feature)==nil - compounds = test_dataset.compounds - LOGGER.debug "test dataset size: "+compounds.size.to_s - raise "test dataset is empty "+test_dataset_uri.to_s unless compounds.size>0 + LOGGER.debug "test dataset size: "+test_dataset.compounds.size.to_s + raise "test dataset is empty "+test_dataset_uri.to_s unless test_dataset.compounds.size>0 if feature_type=="classification" - av = test_target_dataset.accept_values(prediction_feature) + av = test_dataset.accept_values(prediction_feature) raise "'"+OT.acceptValue.to_s+"' missing/invalid for feature '"+prediction_feature.to_s+"' in dataset '"+ - test_target_dataset_uri.to_s+"', acceptValues are: '"+av.inspect+"'" if av==nil or av.length<2 + test_dataset_uri.to_s+"', acceptValues are: '"+av.inspect+"'" if av==nil or av.length<2 if accept_values==nil accept_values=av else @@ -131,20 +111,15 @@ module Lib end actual_values = [] - tmp_compounds = [] - compounds.each do |c| + test_dataset.compounds.size.times do |c_idx| case feature_type when "classification" - vals = classification_vals(test_target_dataset, c, prediction_feature, accept_values) + actual_values << classification_val(test_dataset, c_idx, prediction_feature, accept_values) when "regression" - vals = regression_vals(test_target_dataset, c, prediction_feature) - end - vals.each do |v| - actual_values << v - tmp_compounds << c + actual_values << numeric_val(test_dataset, c_idx, prediction_feature) end + #raise "WTF #{c_idx} #{test_dataset.compounds[c_idx]} #{actual_values[-1]} #{actual_values[-2]}" if c_idx>0 and test_dataset.compounds[c_idx]==test_dataset.compounds[c_idx-1] and actual_values[-1]!=actual_values[-2] end - compounds = tmp_compounds task.progress( task_status += task_step ) if task # loaded actual values prediction_dataset = Lib::DatasetCache.find prediction_dataset_uri,subjectid @@ -160,41 +135,42 @@ module Lib "prediction_dataset: '"+prediction_dataset_uri.to_s+"'\n"+ "available features are: "+prediction_dataset.features.inspect if predicted_confidence and prediction_dataset.features.keys.index(predicted_confidence)==nil and prediction_dataset.compounds.size>0 - raise "more predicted than test compounds, #test: "+compounds.size.to_s+" < #prediction: "+ + raise "more predicted than test 
compounds, #test: "+test_dataset.compounds.size.to_s+" < #prediction: "+ prediction_dataset.compounds.size.to_s+", test-dataset: "+test_dataset_uri.to_s+", prediction-dataset: "+ - prediction_dataset_uri if compounds.size < prediction_dataset.compounds.size + prediction_dataset_uri if test_dataset.compounds.size < prediction_dataset.compounds.size if CHECK_VALUES prediction_dataset.compounds.each do |c| raise "predicted compound not found in test dataset:\n"+c+"\ntest-compounds:\n"+ - compounds.collect{|c| c.to_s}.join("\n") if compounds.index(c)==nil + test_dataset.compounds.collect{|c| c.to_s}.join("\n") unless test_dataset.compounds.include?(c) end end predicted_values = [] confidence_values = [] - count = 0 - compounds.each do |c| - if prediction_dataset.compounds.index(c)==nil + + test_dataset.compounds.size.times do |test_c_idx| + c = test_dataset.compounds[test_c_idx] + pred_c_idx = prediction_dataset.compound_index(test_dataset,test_c_idx) + if pred_c_idx==nil + raise "internal error: mapping failed" if prediction_dataset.compounds.include?(c) predicted_values << nil confidence_values << nil else + raise "internal error: mapping failed" unless c==prediction_dataset.compounds[pred_c_idx] case feature_type when "classification" - vals = classification_vals(prediction_dataset, c, predicted_variable, accept_values) + predicted_values << classification_val(prediction_dataset, pred_c_idx, predicted_variable, accept_values) when "regression" - vals = regression_vals(prediction_dataset, c, predicted_variable) + predicted_values << numeric_val(prediction_dataset, pred_c_idx, predicted_variable) end - raise "not yet implemented: more than one prediction for one compound" if vals.size>1 - predicted_values << vals[0] if predicted_confidence - confidence_values << confidence_val(prediction_dataset, c, predicted_confidence) + confidence_values << numeric_val(prediction_dataset, pred_c_idx, predicted_confidence) else confidence_values << nil end end - count += 1 end - all_compounds += compounds + all_compounds += test_dataset.compounds all_predicted_values += predicted_values all_actual_values += actual_values all_confidence_values += confidence_values @@ -237,61 +213,23 @@ module Lib end private - def self.regression_vals(dataset, compound, feature) - v_num = [] - values(dataset, compound, feature).each do |v| - if v==nil or v.is_a?(Numeric) - v_num << v - else - begin - v_num << v.to_f - rescue - LOGGER.warn "no numeric value for regression: '"+v.to_s+"'" - v_num << nil - end - end - end - v_num - end - - def self.confidence_val(dataset, compound, confidence) - v = values(dataset, compound, confidence) - raise "not yet implemented: duplicate conf value" if v.size>1 + def self.numeric_val(dataset, compound_index, feature) + v = dataset.data_entry_value(compound_index, feature) begin - v = v[0] v = v.to_f unless v==nil or v.is_a?(Numeric) v rescue - LOGGER.warn "no numeric value for confidence '"+v.to_s+"'" + LOGGER.warn "no numeric value for feature '#{feature}' : '#{v}'" nil end end - def self.classification_vals(dataset, compound, feature, accept_values) - v_indices = [] - values(dataset, compound, feature).each do |v| - i = accept_values.index(v) - raise "illegal class_value of prediction (value is '"+v.to_s+"'), accept values are "+ - accept_values.inspect unless v==nil or i!=nil - v_indices << i - end - v_indices + def self.classification_val(dataset, compound_index, feature, accept_values) + v = dataset.data_entry_value(compound_index, feature) + i = accept_values.index(v) + raise "illegal 
class_value of prediction (value is '"+v.to_s+"'), accept values are "+ + accept_values.inspect unless v==nil or i!=nil + i end - - def self.values(dataset, compound, feature) - return [nil] if dataset.data_entries[compound]==nil - if feature==nil - v = dataset.data_entries[compound].values[0] - else - v = dataset.data_entries[compound][feature] - end - return [nil] if v==nil - # sanitiy checks - raise "no array "+v.class.to_s+" : '"+v.to_s+"'" unless v.is_a?(Array) - v.each{|vv| raise "array-elem is array" if vv.is_a?(Array)} - # replace empty strings with nil - v_mod = v.collect{|vv| (vv.to_s().size==0 ? nil : vv)} - v_mod - end end end diff --git a/lib/validation_db.rb b/lib/validation_db.rb index 086853e..9d67cf0 100755 --- a/lib/validation_db.rb +++ b/lib/validation_db.rb @@ -7,8 +7,7 @@ require "lib/merge.rb" module Validation VAL_PROPS_GENERAL = [ :validation_uri, :validation_type, :model_uri, :algorithm_uri, :algorithm_params, - :training_dataset_uri, :prediction_feature, :test_dataset_uri, :test_target_dataset_uri, - :prediction_dataset_uri, :date ] + :training_dataset_uri, :prediction_feature, :test_dataset_uri, :prediction_dataset_uri, :date ] VAL_PROPS_SUM = [ :num_instances, :num_without_class, :num_unpredicted ] VAL_PROPS_AVG = [:real_runtime, :percent_without_class, :percent_unpredicted ] VAL_PROPS = VAL_PROPS_GENERAL + VAL_PROPS_SUM + VAL_PROPS_AVG @@ -59,7 +58,6 @@ module Validation attribute :algorithm_uri attribute :algorithm_params attribute :training_dataset_uri - attribute :test_target_dataset_uri attribute :test_dataset_uri attribute :prediction_dataset_uri attribute :prediction_feature diff --git a/report/plot_factory.rb b/report/plot_factory.rb index 6e90dbc..ad73170 100644 --- a/report/plot_factory.rb +++ b/report/plot_factory.rb @@ -106,14 +106,13 @@ module Reports train = [] test = [] validation_set.validations.each do |v| - [[v.test_dataset_uri, test, v.test_target_dataset_uri], - [v.training_dataset_uri, train, v.training_dataset_uri]].each do |uri,array,uri2| + [[v.test_dataset_uri, test], + [v.training_dataset_uri, train]].each do |uri,array| d = Lib::DatasetCache.find(uri, validation_set.validations[0].subjectid) - d2 = Lib::DatasetCache.find((uri2 ? 
uri2 : uri), validation_set.validations[0].subjectid d.compounds.each do |c| - d2.data_entries[c][v.prediction_feature].each do |val| + d.data_entries[c][v.prediction_feature].each do |val| array << val - end if d2.data_entries[c] and d2.data_entries[c][v.prediction_feature] + end if d.data_entries[c] and d.data_entries[c][v.prediction_feature] end end end diff --git a/report/validation_access.rb b/report/validation_access.rb index e2a3978..784f928 100755 --- a/report/validation_access.rb +++ b/report/validation_access.rb @@ -187,8 +187,8 @@ class Reports::ValidationDB def get_predictions(validation, filter_params, subjectid, task) # we need compound info, cannot reuse stored prediction data data = Lib::PredictionData.create( validation.feature_type, validation.test_dataset_uri, - validation.test_target_dataset_uri, validation.prediction_feature, validation.prediction_dataset_uri, - validation.predicted_variable, validation.predicted_confidence, subjectid, OpenTox::SubTask.create(task, 0, 80 ) ) + validation.prediction_feature, validation.prediction_dataset_uri, validation.predicted_variable, + validation.predicted_confidence, subjectid, OpenTox::SubTask.create(task, 0, 80 ) ) data = Lib::PredictionData.filter_data( data.data, data.compounds, filter_params[:min_confidence], filter_params[:min_num_predictions], filter_params[:max_num_predictions] ) if filter_params!=nil task.progress(100) if task @@ -197,14 +197,13 @@ class Reports::ValidationDB def get_accept_values( validation, subjectid=nil ) # PENDING So far, one has to load the whole dataset to get the accept_value from ambit - test_target_datasets = validation.test_target_dataset_uri - test_target_datasets = validation.test_dataset_uri unless test_target_datasets + test_datasets = validation.test_dataset_uri res = nil - test_target_datasets.split(";").each do |test_target_dataset| - d = Lib::DatasetCache.find( test_target_dataset, subjectid ) - raise "cannot get test target dataset for accept values, dataset: "+test_target_dataset.to_s unless d + test_datasets.split(";").each do |test_dataset| + d = Lib::DatasetCache.find( test_dataset, subjectid ) + raise "cannot get test dataset for accept values, dataset: "+test_dataset.to_s unless d accept_values = d.accept_values(validation.prediction_feature) - raise "cannot get accept values from dataset "+test_target_dataset.to_s+" for feature "+ + raise "cannot get accept values from dataset "+test_dataset.to_s+" for feature "+ validation.prediction_feature+":\n"+d.features[validation.prediction_feature].to_yaml unless accept_values!=nil raise "different accept values" if res && res!=accept_values res = accept_values diff --git a/test/test_examples.rb b/test/test_examples.rb index 2b95cf2..a07456b 100755 --- a/test/test_examples.rb +++ b/test/test_examples.rb @@ -203,11 +203,9 @@ module ValidationExamples class HamsterTrainingTest < TrainingTestValidation def initialize -# @test_target_dataset_file = File.new("data/hamster_carcinogenicity.yaml","r") # @training_dataset_file = File.new("data/hamster_carcinogenicity.train.yaml","r") # @test_dataset_file = File.new("data/hamster_carcinogenicity.test.yaml","r") - @test_target_dataset_file = File.new("data/hamster_carcinogenicity.csv","r") @training_dataset_file = File.new("data/hamster_carcinogenicity.train.csv","r") @test_dataset_file = File.new("data/hamster_carcinogenicity.test.csv","r") @@ -667,11 +665,9 @@ module ValidationExamples class HamsterTrainingTest < TrainingTestValidation def initialize -# @test_target_dataset_file = 
File.new("data/hamster_carcinogenicity.yaml","r") # @training_dataset_file = File.new("data/hamster_carcinogenicity.train.yaml","r") # @test_dataset_file = File.new("data/hamster_carcinogenicity.test.yaml","r") - @test_target_dataset_file = File.new("data/hamster_carcinogenicity.csv","r") @training_dataset_file = File.new("data/hamster_carcinogenicity.train.csv","r") @test_dataset_file = File.new("data/hamster_carcinogenicity.test.csv","r") diff --git a/test/test_examples_util.rb b/test/test_examples_util.rb index 82c4c48..b7f170a 100755 --- a/test/test_examples_util.rb +++ b/test/test_examples_util.rb @@ -238,8 +238,6 @@ module ValidationExamples :model_uri, :test_dataset_uri, :test_dataset_file, - :test_target_dataset_uri, - :test_target_dataset_file, :training_dataset_uri, :training_dataset_file, :dataset_uri, @@ -258,7 +256,6 @@ module ValidationExamples def upload_files [[:test_dataset_uri, :test_dataset_file], - [:test_target_dataset_uri, :test_target_dataset_file], [:training_dataset_uri, :training_dataset_file], [:dataset_uri, :dataset_file]].each do |a| uri = a[0] @@ -438,7 +435,7 @@ module ValidationExamples end def opt_params - [ :prediction_feature, :test_target_dataset_uri ] + [ :prediction_feature ] end def validation_type @@ -452,7 +449,7 @@ module ValidationExamples end def opt_params - [ :algorithm_params, :test_target_dataset_uri ] + [ :algorithm_params ] end def validation_type diff --git a/validation/validation_application.rb b/validation/validation_application.rb index 1bc55f6..f146b59 100755 --- a/validation/validation_application.rb +++ b/validation/validation_application.rb @@ -308,7 +308,6 @@ post '/test_set_validation' do v = Validation::Validation.create :validation_type => "test_set_validation", :model_uri => params[:model_uri], :test_dataset_uri => params[:test_dataset_uri], - :test_target_dataset_uri => params[:test_target_dataset_uri], :prediction_feature => params[:prediction_feature] v.subjectid = @subjectid v.validate_model( task ) @@ -340,7 +339,6 @@ get '/test_set_validation' do post_command = OpenTox::PostCommand.new request.url,"Perform test-set-validation" post_command.attributes << OpenTox::PostAttribute.new("model_uri") post_command.attributes << OpenTox::PostAttribute.new("test_dataset_uri") - post_command.attributes << OpenTox::PostAttribute.new("test_target_dataset_uri",false,nil,"Specify if target endpoint values are not available in test dataset.") post_command.attributes << OpenTox::PostAttribute.new("prediction_feature",false,nil,"Default is 'dependentVariables' of the model.") content_type "text/html" OpenTox.text_to_html uri_list,@subjectid,related_links,description,post_command @@ -360,7 +358,6 @@ post '/training_test_validation/?' 
do :algorithm_params => params[:algorithm_params], :training_dataset_uri => params[:training_dataset_uri], :test_dataset_uri => params[:test_dataset_uri], - :test_target_dataset_uri => params[:test_target_dataset_uri], :prediction_feature => params[:prediction_feature] v.subjectid = @subjectid v.validate_algorithm( task ) @@ -392,7 +389,6 @@ get '/training_test_validation' do post_command.attributes << OpenTox::PostAttribute.new("algorithm_uri") post_command.attributes << OpenTox::PostAttribute.new("training_dataset_uri") post_command.attributes << OpenTox::PostAttribute.new("test_dataset_uri") - post_command.attributes << OpenTox::PostAttribute.new("test_target_dataset_uri",false,nil,"Specify if target endpoint values are not available in test dataset.") post_command.attributes << OpenTox::PostAttribute.new("prediction_feature") post_command.attributes << OpenTox::PostAttribute.new("algorithm_params",false,nil,"Params used for model building, separate with ';', example: param1=v1;param2=v2") content_type "text/html" @@ -414,7 +410,6 @@ post '/bootstrapping' do params[:random_seed], OpenTox::SubTask.create(task,0,33)) ) LOGGER.info "params after bootstrapping: "+params.inspect v = Validation::Validation.create :validation_type => "bootstrapping", - :test_target_dataset_uri => params[:dataset_uri], :prediction_feature => params[:prediction_feature], :algorithm_uri => params[:algorithm_uri], :algorithm_params => params[:algorithm_params], @@ -470,12 +465,11 @@ post '/training_test_split' do raise OpenTox::BadRequestError.new "prediction_feature missing" unless params[:prediction_feature].to_s.size>0 check_stratified(params) task = OpenTox::Task.create( "Perform training test split validation", url_for("/training_test_split", :full) ) do |task| #, params - params.merge!( Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:prediction_feature], + params.merge!( Validation::Util.train_test_dataset_split(params[:dataset_uri], (params[:stratified].to_s=~/true/ ? params[:prediction_feature] : nil), @subjectid, params[:stratified], params[:split_ratio], params[:random_seed], OpenTox::SubTask.create(task,0,33))) v = Validation::Validation.create :validation_type => "training_test_split", :training_dataset_uri => params[:training_dataset_uri], :test_dataset_uri => params[:test_dataset_uri], - :test_target_dataset_uri => params[:dataset_uri], :prediction_feature => params[:prediction_feature], :algorithm_uri => params[:algorithm_uri], :algorithm_params => params[:algorithm_params] @@ -543,7 +537,6 @@ post '/cleanup_datasets/?' 
do end Validation::Validation.all.each do |val| used_datasets << val.training_dataset_uri - used_datasets << val.test_target_dataset_uri used_datasets << val.test_dataset_uri used_datasets << val.prediction_dataset_uri end @@ -595,7 +588,8 @@ post '/validate_datasets' do feature_type = "regression" if params.delete("regression")!=nil v = Validation::Validation.create params v.subjectid = @subjectid - v.compute_validation_stats(feature_type,predicted_variable,predicted_confidence,nil,nil,false,task) + v.compute_prediction_data(feature_type,predicted_variable,predicted_confidence,v.prediction_feature,nil,task) + v.compute_validation_stats()#feature_type,predicted_variable,predicted_confidence,nil,nil,false,task) end v.validation_uri end diff --git a/validation/validation_service.rb b/validation/validation_service.rb index 8c8b11f..2967bd0 100755 --- a/validation/validation_service.rb +++ b/validation/validation_service.rb @@ -64,20 +64,20 @@ module Validation # PENDING: model and referenced datasets are deleted as well, keep it that way? def delete_validation( delete_all=true ) if (delete_all) - to_delete = [:model_uri, :training_dataset_uri, :test_dataset_uri, :test_target_dataset_uri, :prediction_dataset_uri ] + to_delete = [:model_uri, :training_dataset_uri, :test_dataset_uri, :prediction_dataset_uri ] case self.validation_type when "test_set_validation" - to_delete -= [ :model_uri, :training_dataset_uri, :test_dataset_uri, :test_target_dataset_uri ] + to_delete -= [ :model_uri, :training_dataset_uri, :test_dataset_uri ] when "bootstrapping" - to_delete -= [ :test_target_dataset_uri ] + to_delete -= [] when "training_test_validation" - to_delete -= [ :training_dataset_uri, :test_dataset_uri, :test_target_dataset_uri ] + to_delete -= [ :training_dataset_uri, :test_dataset_uri ] when "training_test_split" - to_delete -= [ :test_target_dataset_uri ] + to_delete -= [] when "validate_datasets" to_delete = [] when "crossvalidation" - to_delete -= [ :test_target_dataset_uri ] + to_delete -= [] when "crossvalidation_statistics" to_delete = [] else @@ -189,12 +189,11 @@ module Validation models = cv_vals.collect{|v| OpenTox::Model::Generic.find(v.model_uri, subjectid)} feature_type = models.first.feature_type(subjectid) test_dataset_uris = cv_vals.collect{|v| v.test_dataset_uri} - test_target_dataset_uris = cv_vals.collect{|v| v.test_target_dataset_uri} prediction_feature = cv_vals.first.prediction_feature prediction_dataset_uris = cv_vals.collect{|v| v.prediction_dataset_uri} predicted_variables = models.collect{|m| m.predicted_variable(subjectid)} predicted_confidences = models.collect{|m| m.predicted_confidence(subjectid)} - p_data = Lib::PredictionData.create( feature_type, test_dataset_uris, test_target_dataset_uris, prediction_feature, + p_data = Lib::PredictionData.create( feature_type, test_dataset_uris, prediction_feature, prediction_dataset_uris, predicted_variables, predicted_confidences, subjectid, waiting_task ) self.prediction_data = p_data.data p_data.data @@ -225,7 +224,7 @@ module Validation LOGGER.debug "computing prediction stats" p_data = Lib::PredictionData.create( feature_type, - self.test_dataset_uri, self.test_target_dataset_uri, self.prediction_feature, + self.test_dataset_uri, self.prediction_feature, self.prediction_dataset_uri, predicted_variable, predicted_confidence, self.subjectid, OpenTox::SubTask.create(task, 0, 80) ) self.prediction_data = p_data.data @@ -418,7 +417,6 @@ module Validation tmp_val << { :validation_type => "crossvalidation", :training_dataset_uri => 
v.training_dataset_uri, :test_dataset_uri => v.test_dataset_uri, - :test_target_dataset_uri => self.dataset_uri, :crossvalidation_id => self.id, :crossvalidation_fold => v.crossvalidation_fold, :prediction_feature => prediction_feature, @@ -448,39 +446,38 @@ module Validation case stratified when "false" if self.loo=="true" - shuffled_compounds = orig_dataset.compounds + shuffled_compound_indices = (0..(orig_dataset.compounds.size-1)).to_a else - shuffled_compounds = orig_dataset.compounds.shuffle( self.random_seed ) + shuffled_compound_indices = (0..(orig_dataset.compounds.size-1)).to_a.shuffle( self.random_seed ) end - split_compounds = shuffled_compounds.chunk( self.num_folds.to_i ) - LOGGER.debug "cv: num instances for each fold: "+split_compounds.collect{|c| c.size}.join(", ") + split_compound_indices = shuffled_compound_indices.chunk( self.num_folds.to_i ) + LOGGER.debug "cv: num instances for each fold: "+split_compound_indices.collect{|c| c.size}.join(", ") self.num_folds.to_i.times do |n| - test_compounds = [] - train_compounds = [] + test_compound_indices = [] + train_compound_indices = [] self.num_folds.to_i.times do |nn| - compounds = split_compounds[nn] + compound_indices = split_compound_indices[nn] if n == nn - compounds.each{ |compound| test_compounds << compound} + compound_indices.each{ |idx| test_compound_indices << idx} else - compounds.each{ |compound| train_compounds << compound} + compound_indices.each{ |idx| train_compound_indices << idx} end end raise "internal error, num test compounds not correct,"+ - " is '#{test_compounds.size}', should be '#{(shuffled_compounds.size/self.num_folds.to_i)}'" unless - (shuffled_compounds.size/self.num_folds.to_i - test_compounds.size).abs <= 1 - raise "internal error, num train compounds not correct, should be '"+(shuffled_compounds.size-test_compounds.size).to_s+ - "', is '"+train_compounds.size.to_s+"'" unless shuffled_compounds.size - test_compounds.size == train_compounds.size + " is '#{test_compound_indices.size}', should be '#{(shuffled_compound_indices.size/self.num_folds.to_i)}'" unless + (shuffled_compound_indices.size/self.num_folds.to_i - test_compound_indices.size).abs <= 1 + raise "internal error, num train compounds not correct, should be '"+(shuffled_compound_indices.size-test_compound_indices.size).to_s+ + "', is '"+train_compound_indices.size.to_s+"'" unless shuffled_compound_indices.size - test_compound_indices.size == train_compound_indices.size datasetname = 'dataset fold '+(n+1).to_s+' of '+self.num_folds.to_s meta[DC.title] = "training "+datasetname - LOGGER.debug "training set: "+datasetname+"_train, compounds: "+train_compounds.size.to_s - train_dataset_uri = orig_dataset.split( train_compounds, orig_dataset.features.keys, + LOGGER.debug "training set: "+datasetname+"_train, compounds: "+train_compound_indices.size.to_s + train_dataset_uri = orig_dataset.split( train_compound_indices, orig_dataset.features.keys, meta, self.subjectid ).uri train_dataset_uris << train_dataset_uri meta[DC.title] = "test "+datasetname - LOGGER.debug "test set: "+datasetname+"_test, compounds: "+test_compounds.size.to_s - test_features = orig_dataset.features.keys.dclone - [self.prediction_feature] - test_dataset_uri = orig_dataset.split( test_compounds, test_features, + LOGGER.debug "test set: "+datasetname+"_test, compounds: "+test_compound_indices.size.to_s + test_dataset_uri = orig_dataset.split( test_compound_indices, orig_dataset.features.keys, meta, self.subjectid ).uri test_dataset_uris << 
test_dataset_uri end @@ -505,7 +502,6 @@ module Validation tmp_validation = { :validation_type => "crossvalidation", :training_dataset_uri => train_dataset_uris[n], :test_dataset_uri => test_dataset_uris[n], - :test_target_dataset_uri => self.dataset_uri, :crossvalidation_id => self.id, :crossvalidation_fold => (n+1), :prediction_feature => self.prediction_feature, :algorithm_uri => self.algorithm_uri, @@ -537,25 +533,20 @@ module Validation LOGGER.warn "no prediciton feature given, all features included in test dataset" end - compounds = orig_dataset.compounds - raise OpenTox::NotFoundError.new "Cannot split datset, num compounds in dataset < 2 ("+compounds.size.to_s+")" if compounds.size<2 - - compounds.each do |c| - raise OpenTox::NotFoundError.new "Bootstrapping not yet implemented for duplicate compounds" if - orig_dataset.data_entries[c][prediction_feature].size > 1 - end + compound_indices = (0..(orig_dataset.compounds.size-1)).to_a + raise OpenTox::NotFoundError.new "Cannot split dataset, num compounds in dataset < 2 ("+compound_indices.size.to_s+")" if compound_indices.size<2 srand random_seed.to_i while true - training_compounds = [] - compounds.size.times do - training_compounds << compounds[rand(compounds.size)] + training_compound_indices = [] + compound_indices.size.times do + training_compound_indices << compound_indices[rand(compound_indices.size)] end - test_compounds = [] - compounds.each do |c| - test_compounds << c unless training_compounds.include?(c) + test_compound_indices = [] + compound_indices.each do |idx| + test_compound_indices << idx unless training_compound_indices.include?(idx) end - if test_compounds.size > 0 + if test_compound_indices.size > 0 break else srand rand(10000) @@ -563,47 +554,26 @@ module Validation end LOGGER.debug "bootstrapping on dataset "+orig_dataset_uri+ - " into training ("+training_compounds.size.to_s+") and test ("+test_compounds.size.to_s+")"+ - ", duplicates in training dataset: "+test_compounds.size.to_s + " into training ("+training_compound_indices.size.to_s+") and test ("+test_compound_indices.size.to_s+")"+ + ", duplicates in training dataset: "+test_compound_indices.size.to_s task.progress(33) if task result = {} -# result[:training_dataset_uri] = orig_dataset.create_new_dataset( training_compounds, -# orig_dataset.features, -# "Bootstrapping training dataset of "+orig_dataset.title.to_s, -# $sinatra.url_for('/bootstrapping',:full) ) - result[:training_dataset_uri] = orig_dataset.split( training_compounds, + result[:training_dataset_uri] = orig_dataset.split( training_compound_indices, orig_dataset.features.keys, { DC.title => "Bootstrapping training dataset of "+orig_dataset.title.to_s, DC.creator => $url_provider.url_for('/bootstrapping',:full) }, subjectid ).uri task.progress(66) if task -# result[:test_dataset_uri] = orig_dataset.create_new_dataset( test_compounds, -# orig_dataset.features.dclone - [prediction_feature], -# "Bootstrapping test dataset of "+orig_dataset.title.to_s, -# $sinatra.url_for('/bootstrapping',:full) ) - result[:test_dataset_uri] = orig_dataset.split( test_compounds, - orig_dataset.features.keys.dclone - [prediction_feature], + result[:test_dataset_uri] = orig_dataset.split( test_compound_indices, + orig_dataset.features.keys, { DC.title => "Bootstrapping test dataset of "+orig_dataset.title.to_s, DC.creator => $url_provider.url_for('/bootstrapping',:full)} , subjectid ).uri task.progress(100) if task - if ENV['RACK_ENV'] =~ /test|debug/ - training_dataset = Lib::DatasetCache.find 
result[:training_dataset_uri],subjectid - raise OpenTox::NotFoundError.new "Training dataset not found: '"+result[:training_dataset_uri].to_s+"'" unless training_dataset - training_dataset.load_all - value_count = 0 - training_dataset.compounds.each do |c| - value_count += training_dataset.data_entries[c][prediction_feature].size - end - raise "training compounds error" unless value_count==training_compounds.size - raise OpenTox::NotFoundError.new "Test dataset not found: '"+result[:test_dataset_uri].to_s+"'" unless - Lib::DatasetCache.find result[:test_dataset_uri], subjectid - end LOGGER.debug "bootstrapping done, training dataset: '"+result[:training_dataset_uri].to_s+"', test dataset: '"+result[:test_dataset_uri].to_s+"'" - return result end @@ -620,12 +590,17 @@ module Validation orig_dataset = Lib::DatasetCache.find orig_dataset_uri, subjectid orig_dataset.load_all subjectid raise OpenTox::NotFoundError.new "Dataset not found: "+orig_dataset_uri.to_s unless orig_dataset + if prediction_feature - raise OpenTox::NotFoundError.new "Prediction feature '"+prediction_feature.to_s+ - "' not found in dataset, features are: \n"+ - orig_dataset.features.keys.inspect unless orig_dataset.features.include?(prediction_feature) - else - LOGGER.warn "no prediciton feature given, all features will be included in test dataset" + if stratified=~/true/ + raise OpenTox::NotFoundError.new "Prediction feature '"+prediction_feature.to_s+ + "' not found in dataset, features are: \n"+orig_dataset.features.keys.inspect unless orig_dataset.features.include?(prediction_feature) + else + LOGGER.warn "prediction_feature argument is ignored for non-stratified splits" if prediction_feature + prediction_feature=nil + end + elsif stratified=~/true/ + raise OpenTox::BadRequestError.new "prediction feature required for stratified splits" unless prediction_feature end meta = { DC.creator => $url_provider.url_for('/training_test_split',:full) } @@ -633,10 +608,8 @@ module Validation case stratified when /true|super/ if stratified=="true" - raise OpenTox::BadRequestError.new "prediction feature required for stratified splits" unless prediction_feature features = [prediction_feature] else - LOGGER.warn "prediction feature is ignored for super-stratified splits" if prediction_feature features = nil end r_util = OpenTox::RUtil.new @@ -644,39 +617,36 @@ module Validation r_util.quit_r result = {:training_dataset_uri => train.uri, :test_dataset_uri => test.uri} when "false" - compounds = orig_dataset.compounds - raise OpenTox::BadRequestError.new "Cannot split datset, num compounds in dataset < 2 ("+compounds.size.to_s+")" if compounds.size<2 - split = (compounds.size*split_ratio).to_i + compound_indices = (0..(orig_dataset.compounds.size-1)).to_a + raise OpenTox::BadRequestError.new "Cannot split dataset, num compounds in dataset < 2 ("+compound_indices.size.to_s+")" if compound_indices.size<2 + split = (compound_indices.size*split_ratio).round split = [split,1].max - split = [split,compounds.size-2].min + split = [split,compound_indices.size-2].min LOGGER.debug "splitting dataset "+orig_dataset_uri+ - " into train:0-"+split.to_s+" and test:"+(split+1).to_s+"-"+(compounds.size-1).to_s+ + " into train:0-"+(split-1).to_s+" and test:"+split.to_s+"-"+(compound_indices.size-1).to_s+ " (shuffled with seed "+random_seed.to_s+")" - compounds.shuffle!( random_seed ) - training_compounds = compounds[0..split] - test_compounds = compounds[(split+1)..-1] + compound_indices.shuffle!( random_seed ) + training_compound_indices = 
compound_indices[0..(split-1)] + test_compound_indices = compound_indices[split..-1] task.progress(33) if task meta[DC.title] = "Training dataset split of "+orig_dataset.uri result = {} - result[:training_dataset_uri] = orig_dataset.split( training_compounds, - orig_dataset.features.keys, meta, subjectid ).uri + train_data = orig_dataset.split( training_compound_indices, + orig_dataset.features.keys, meta, subjectid ) + raise "Train dataset num compounds != "+(orig_dataset.compounds.size*split_ratio).round.to_s+", instead: "+train_data.compounds.size.to_s unless + train_data.compounds.size==(orig_dataset.compounds.size*split_ratio).round + result[:training_dataset_uri] = train_data.uri task.progress(66) if task meta[DC.title] = "Test dataset split of "+orig_dataset.uri - result[:test_dataset_uri] = orig_dataset.split( test_compounds, - orig_dataset.features.keys.dclone - [prediction_feature], meta, subjectid ).uri + test_data = orig_dataset.split( test_compound_indices, + orig_dataset.features.keys, meta, subjectid ) + raise "Test dataset num compounds != "+(orig_dataset.compounds.size*(1-split_ratio)).round.to_s+", instead: "+test_data.compounds.size.to_s unless + test_data.compounds.size==(orig_dataset.compounds.size*(1-split_ratio)).round + result[:test_dataset_uri] = test_data.uri task.progress(100) if task - if ENV['RACK_ENV'] =~ /test|debug/ - raise OpenTox::NotFoundError.new "Training dataset not found: '"+result[:training_dataset_uri].to_s+"'" unless - Lib::DatasetCache.find(result[:training_dataset_uri],subjectid) - test_data = Lib::DatasetCache.find result[:test_dataset_uri],subjectid - raise OpenTox::NotFoundError.new "Test dataset not found: '"+result[:test_dataset_uri].to_s+"'" unless test_data - test_data.load_compounds subjectid - raise "Test dataset num coumpounds != "+(compounds.size-split-1).to_s+", instead: "+ - test_data.compounds.size.to_s+"\n"+test_data.to_yaml unless test_data.compounds.size==(compounds.size-1-split) - end LOGGER.debug "split done, training dataset: '"+result[:training_dataset_uri].to_s+"', test dataset: '"+result[:test_dataset_uri].to_s+"'" else raise OpenTox::BadRequestError.new "stratified != false|true|super, is #{stratified}" diff --git a/validation/validation_test.rb b/validation/validation_test.rb index 70f3ca4..a6dd2a7 100755 --- a/validation/validation_test.rb +++ b/validation/validation_test.rb @@ -59,6 +59,22 @@ class ValidationTest < Test::Unit::TestCase def test_it begin $test_case = self + + post '/validate_datasets',{:test_dataset_uri=>"http://local-ot/dataset/14111", + :prediction_dataset_uri=>"http://local-ot/dataset/14113", + :prediction_feature=>"http://local-ot/dataset/14109/feature/Hamster%20Carcinogenicity", + :predicted_variable=>"http://local-ot/model/21/predicted/value", + :predicted_confidence=>"http://local-ot/model/21/predicted/confidence", + :classification=>"true"} + +#D, [2012-11-07T12:38:11.291069 #31035] DEBUG -- : validation :: loading prediction -- test-dataset: ["http://local-ot/dataset/14099"] :: /validation_service.rb:227:in `compute_prediction_data' # D, [2012-11-07T12:38:11.291174 #31035] DEBUG -- : validation :: loading prediction -- test-target-datset: ["http://local-ot/dataset/14097"] :: /validation_service.rb:227:in `compute_prediction_data' # D, [2012-11-07T12:38:11.291281 #31035] DEBUG -- : validation :: loading prediction -- prediction-dataset: ["http://local-ot/dataset/14101"] :: /validation_service.rb:227:in `compute_prediction_data' # D, [2012-11-07T12:38:11.291398 #31035] DEBUG -- : validation 
:: loading prediction -- predicted_variable: ["http://local-ot/model/19/predicted/value"] :: /validation_service.rb:227:in `compute_prediction_data' +# D, [2012-11-07T12:38:11.291506 #31035] DEBUG -- : validation :: loading prediction -- predicted_confidence: ["http://local-ot/model/19/predicted/confidence"] :: /validation_service.rb:227:in `compute_prediction_data' +# D, [2012-11-07T12:38:11.291611 #31035] DEBUG -- : validation :: loading prediction -- prediction_feature: http://local-ot/dataset/14097/feature/Hamster%20Carcinogenicity :: /validation_service.rb:227:in `compute_prediction_data' + + exit # dataset_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/603206?pagesize=250&page=0" # test_dataset_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/603206?pagesize=250&page=1" @@ -117,7 +133,6 @@ class ValidationTest < Test::Unit::TestCase # post "/validate_datasets",{ # :test_dataset_uri=>"http://local-ot/dataset/6907", # :prediction_dataset_uri=>"http://local-ot/dataset/6909", -# :test_target_dataset_uri=>"http://local-ot/dataset/6905", # :prediction_feature=>"http://local-ot/dataset/6905/feature/Hamster%20Carcinogenicity", # #:model_uri=>"http://local-ot/model/1078", # :predicted_variable=>"http://local-ot/dataset/6909/feature/prediction/Hamster%20Carcinogenicity/value", @@ -351,7 +366,6 @@ end # post "/validate_datasets",{ # :test_dataset_uri=>"http://apps.deaconsult.net:8080/ambit2/dataset/R3924", # :prediction_dataset_uri=>"http://apps.ideaconsult.net:8080/ambit2/dataset/R3924?feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Fmodel%2F52%2Fpredicted", - # #:test_target_dataset_uri=>"http://local-ot/dataset/202", # :prediction_feature=>"http://apps.ideaconsult.net:8080/ambit2/feature/21715", # :predicted_feature=>"http://apps.ideaconsult.net:8080/ambit2/feature/28944", # :regression=>"true"} @@ -363,7 +377,6 @@ end #get "/crossvalidation/19/predictions",nil,'HTTP_ACCEPT' => "application/x-yaml" #/statistics" # post "",:model_uri=>"http://local-ot/model/1",:test_dataset_uri=>"http://local-ot/dataset/3", - # :test_target_dataset_uri=>"http://local-ot/dataset/1" # get "/crossvalidation/2",nil,'HTTP_ACCEPT' => "application/rdf+xml" #puts last_response.body @@ -384,7 +397,6 @@ end # post "/validate_datasets",{ # :test_dataset_uri=>"http://local-ot/dataset/204", # :prediction_dataset_uri=>"http://local-ot/dataset/206", - # :test_target_dataset_uri=>"http://local-ot/dataset/202", # :prediction_feature=>"http://ot-dev.in-silico.ch/toxcreate/feature#IRIS%20unit%20risk", # :predicted_feature=>"http://ot-dev.in-silico.ch/toxcreate/feature#IRIS%20unit%20risk_lazar_regression", # :regression=>"true"} @@ -394,7 +406,6 @@ end # post "/validate_datasets",{ # :test_dataset_uri=>"http://apps.ideaconsult.net:8080/ambit2/dataset/9?max=10", # :prediction_dataset_uri=>"http://apps.ideaconsult.net:8080/ambit2/dataset/9?max=10", -# #:test_target_dataset_uri=>"http://local-ot/dataset/202", # :prediction_feature=>"http://apps.ideaconsult.net:8080/ambit2/feature/21573", # :predicted_feature=>"http://apps.ideaconsult.net:8080/ambit2/feature/21573", # #:regression=>"true"} @@ -406,7 +417,6 @@ end # post "/validate_datasets",{ # :test_dataset_uri=>"http://local-ot/dataset/89", # :prediction_dataset_uri=>"http://local-ot/dataset/91", - # :test_target_dataset_uri=>"http://local-ot/dataset/87", # :prediction_feature=>"http://local-ot/dataset/1/feature/hamster_carcinogenicity", # :predicted_feature=>"", ## :regression=>"true"} @@ -419,7 +429,6 @@ end # post "/validate_datasets",{ # 
:test_dataset_uri=>"http://local-ot/dataset/390", # :prediction_dataset_uri=>"http://local-ot/dataset/392", -# :test_target_dataset_uri=>"http://local-ot/dataset/388", # :prediction_feature=>"http://local-ot/dataset/388/feature/repdose_classification", # :model_uri=>"http://local-ot/model/31"} # #:regression=>"true"} @@ -432,7 +441,6 @@ end # post "/validate_datasets",{ # :test_dataset_uri=>"http://opentox.informatik.uni-freiburg.de/dataset/409", # :prediction_dataset_uri=>"http://opentox.informatik.uni-freiburg.de/dataset/410", -# :test_target_dataset_uri=>"https://ambit.uni-plovdiv.bg:8443/ambit2/dataset/R401560", # :prediction_feature=>"https://ambit.uni-plovdiv.bg:8443/ambit2/feature/22190", # :predicted_feature=>"https://ambit.uni-plovdiv.bg:8443/ambit2/feature/218304", # :regression=>"true", @@ -453,7 +461,6 @@ end # post "/validate_datasets",{ # :test_dataset_uri=>"http://local-ot/dataset/94", # :prediction_dataset_uri=>'http://local-ot/dataset/96', -# :test_target_dataset_uri=>'http://local-ot/dataset/92', # :prediction_feature=>'http://local-ot/dataset/92/feature/Hamster%20Carcinogenicity', # :predicted_feature=>"", # :classification=>"true", -- cgit v1.2.3
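Note on the splitting change: train_test_dataset_split and the cross-validation folds above now operate on data-entry indices instead of compound URIs, so two occurrences of the same compound (possibly carrying different values) can be assigned to different sets. A minimal standalone sketch of the idea in Ruby -- illustrative only; the names are invented, and stock Ruby's shuffle(random: ...) stands in for the seeded Array#shuffle helper the patch calls:

    # toy dataset: compounds[i] pairs with values[i]; "c2" occurs twice
    compounds = ["c1", "c2", "c2", "c3"]
    values    = [0, 1, 0, 1]

    # shuffle entry positions, not compound URIs, so each occurrence
    # is assigned to a set on its own
    indices = (0...compounds.size).to_a.shuffle(random: Random.new(42))
    split   = [[(indices.size * 0.75).round, 1].max, indices.size - 1].min

    train = indices[0...split].map { |i| [compounds[i], values[i]] }
    test  = indices[split..-1].map { |i| [compounds[i], values[i]] }
    puts "train: #{train.inspect}"
    puts "test:  #{test.inspect}"

With a URI-based split both "c2" rows always traveled together (and bootstrapping refused duplicates outright); index-based assignment is what lets the patch drop the old "Bootstrapping not yet implemented for duplicate compounds" guard.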
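Note on the prediction mapping: PredictionData now walks the test dataset by entry index and locates the matching entry in the prediction dataset via compound_index, rather than looking compounds up by URI, which keeps duplicate occurrences unambiguous. A sketch of what such an occurrence-aware lookup has to provide, using plain arrays and an invented helper name (not the opentox-ruby API):

    # map the n-th occurrence of a compound in the test list to the
    # n-th occurrence of the same compound in the prediction list
    def matching_index(test_compounds, prediction_compounds, test_idx)
      c = test_compounds[test_idx]
      occurrence = test_compounds[0..test_idx].count(c)  # 1-based occurrence number
      seen = 0
      prediction_compounds.each_with_index do |pc, i|
        seen += 1 if pc == c
        return i if seen == occurrence
      end
      nil  # this occurrence was not predicted
    end

    test = ["c1", "c2", "c2"]
    pred = ["c2", "c1", "c2"]
    p matching_index(test, pred, 1)  # => 0 (first "c2")
    p matching_index(test, pred, 2)  # => 2 (second "c2")

This positional contract is what the "internal error: mapping failed" sanity checks in the patch defend: once duplicates are allowed, URI equality alone cannot identify a row.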