From c019ecedcb54e0ccbfebcf6901b3007d1e24ba1d Mon Sep 17 00:00:00 2001 From: mguetlein Date: Tue, 28 Feb 2012 10:36:17 +0100 Subject: remove unneeded dataset files when performing loo-cv --- test/test_examples.rb | 18 ++++++ test/test_examples_util.rb | 80 +++++++++++++++++++++++++++ validation/validation_application.rb | 1 + validation/validation_service.rb | 15 +++++ validation/validation_test.rb | 104 +++++++++++++++++++++++++++++------ 5 files changed, 201 insertions(+), 17 deletions(-) diff --git a/test/test_examples.rb b/test/test_examples.rb index f3c0b7e..2b95cf2 100755 --- a/test/test_examples.rb +++ b/test/test_examples.rb @@ -274,6 +274,22 @@ module ValidationExamples end end + ######################################################################################################## + + class HamsterLooCrossvalidation < LooCrossValidation + def initialize + @dataset_file = File.new("data/hamster_carcinogenicity.csv","r") + end + end + + class LazarHamsterLooCrossvalidation < HamsterLooCrossvalidation + def initialize + @algorithm_uri = File.join(CONFIG[:services]["opentox-algorithm"],"lazar") + @algorithm_params = "feature_generation_uri="+File.join(CONFIG[:services]["opentox-algorithm"],"fminer/bbrc") + super + end + end + ######################################################################################################## class LazarHamsterMiniCrossvalidation < CrossValidation @@ -828,6 +844,8 @@ module ValidationExamples "22e" => [ AmbitVsNtuaTrainingTest ], "22f" => [ AnotherAmbitJ48TrainingTest ], "22g" => [ TumTrainingTest ], + + "23a" => [ LazarHamsterLooCrossvalidation ], } diff --git a/test/test_examples_util.rb b/test/test_examples_util.rb index b48096d..82c4c48 100755 --- a/test/test_examples_util.rb +++ b/test/test_examples_util.rb @@ -1,4 +1,15 @@ +class Numeric + def to_human + return "0" if self==0 + units = %w{B KB MB GB TB} + e = (Math.log(self)/Math.log(1024)).floor + s = "%.1f" % (to_f / 1024**e) + s.sub(/\.?0*$/, units[e]) + end +end + + module ValidationExamples class Util @@ -335,6 +346,57 @@ module ValidationExamples end end + def compute_dataset_size + if @validation_uri =~ /crossvalidation/ + cv = OpenTox::Crossvalidation.find(@validation_uri,@subjectid) + count = 0 + size = 0 + target = nil + + cv.metadata[OT.validation].each do |v| + val = OpenTox::Validation.find(v) + dataset = {} + dataset[:test] = val.metadata[OT.testDataset] + dataset[:training] = val.metadata[OT.trainingDataset] + #dataset[:target] = val.metadata[OT.testTargetDataset] + raise if (target!=nil and target!=val.metadata[OT.testTargetDataset]) + target = val.metadata[OT.testTargetDataset] + + dataset[:prediction] = val.metadata[OT.predictionDataset] + m = val.metadata[OT.model] + model = OpenTox::Model::Generic.find(m) + dataset[:feature] = model.metadata[OT.featureDataset] + + puts v + val_size = 0 + dataset.each do |k,v| + s = size(v) + val_size += s + puts k.to_s+" "+v+" "+s.to_human + end + puts val_size.to_human + puts "" + size += val_size + + count += 1 + #break if (count>2) + end + + puts "total "+size.to_human+" (count: "+count.to_s+")" + puts "avg "+(size/count.to_f).to_human + + puts "" + puts "orig file: "+target+" "+size(target).to_human + end + end + + private + def size(dataset) + f = "/home/martin/opentox-ruby/www/opentox/dataset/data/#{dataset.split("/")[-1]}.json" + File.exist?(f) ? 
File.new(f).size : 0 + end + + public def verify_yaml raise "cannot very validation, validation_uri is null" unless @validation_uri @@ -443,4 +505,22 @@ module ValidationExamples "crossvalidation" end end + + class LooCrossValidation < ValidationExample + def params + [:algorithm_uri, :dataset_uri, :prediction_feature] + end + + def opt_params + [ :algorithm_params ] + end + + def report_type + "crossvalidation" + end + + def validation_type + "crossvalidation/loo" + end + end end diff --git a/validation/validation_application.rb b/validation/validation_application.rb index 279cd14..b07e814 100755 --- a/validation/validation_application.rb +++ b/validation/validation_application.rb @@ -97,6 +97,7 @@ post '/crossvalidation/loo/?' do cv.perform_cv( params[:prediction_feature], params[:algorithm_params], OpenTox::SubTask.create(task,0,95)) # computation of stats is cheap as dataset are already loaded into the memory Validation::Validation.from_cv_statistics( cv.id, @subjectid, OpenTox::SubTask.create(task,95,100) ) + cv.clean_loo_files cv.crossvalidation_uri end return_task(task) diff --git a/validation/validation_service.rb b/validation/validation_service.rb index 614363d..527e5ca 100755 --- a/validation/validation_service.rb +++ b/validation/validation_service.rb @@ -305,6 +305,21 @@ module Validation perform_cv_validations( algorithm_params, OpenTox::SubTask.create(task, 33, 100) ) end + def clean_loo_files + Validation.find( :crossvalidation_id => self.id, :validation_type => "crossvalidation" ).each do |v| + LOGGER.debug "loo-cleanup> delete training dataset "+v.training_dataset_uri + OpenTox::RestClientWrapper.delete v.training_dataset_uri,subjectid + begin + model = OpenTox::Model::Generic.find(v.model_uri) + if model.metadata[OT.featureDataset] + LOGGER.debug "loo-cleanup> delete feature dataset "+model.metadata[OT.featureDataset] + OpenTox::RestClientWrapper.delete model.metadata[OT.featureDataset],subjectid + end + rescue + end + end + end + # deletes a crossvalidation, all validations are deleted as well def delete_crossvalidation validations = Validation.find(:crossvalidation_id => self.id) diff --git a/validation/validation_test.rb b/validation/validation_test.rb index ae71749..70f3ca4 100755 --- a/validation/validation_test.rb +++ b/validation/validation_test.rb @@ -60,6 +60,60 @@ class ValidationTest < Test::Unit::TestCase begin $test_case = self +# dataset_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/603206?pagesize=250&page=0" +# test_dataset_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/603206?pagesize=250&page=1" +# #prediction_feature = "http://apps.ideaconsult.net:8080/ambit2/feature/528321" +# prediction_feature = "http://apps.ideaconsult.net:8080/ambit2/feature/528402" +# prediction_algorithm = "http://apps.ideaconsult.net:8080/ambit2/algorithm/RandomForest" +# #ad_algorithm = "http://apps.ideaconsult.net:8080/ambit2/algorithm/leverage" +# #ad_algorithm = "http://apps.ideaconsult.net:8080/ambit2/algorithm/distanceMahalanobis" +# #ad_algorithm = "http://apps.ideaconsult.net:8080/ambit2/algorithm/pcaRanges" +# ad_algorithm = "http://apps.ideaconsult.net:8080/ambit2/algorithm/RandomForest" +# post "/training_test_validation",{:training_dataset_uri=>dataset_uri, :test_dataset_uri=>test_dataset_uri, +# :prediction_feature => prediction_feature, :algorithm_uri=>"http://local-ot/adwrap", +# :algorithm_params=>"prediction_algorithm=#{prediction_algorithm};ad_algorithm=#{ad_algorithm}"} +# puts last_response.body +# uri = last_response.body +# rep = 
wait_for_task(uri) +# puts rep +# +# post "/report/method_comparison", +# {:validation_uris=>"http://local-ot/validation/433,http://local-ot/validation/434,http://local-ot/validation/435,http://local-ot/validation/436,http://local-ot/validation/437,http://local-ot/validation/438,http://local-ot/validation/439,http://local-ot/validation/440,http://local-ot/validation/441,http://local-ot/validation/442,http://local-ot/validation/crossvalidation/30,", +# :identifier=>"random,random,random,random,random,random,random,random,random,random,crossvalidated,"} + +# post "/report/method_comparison", +# {:validation_uris=>"http://local-ot/validation/389,http://local-ot/validation/390,http://local-ot/validation/391,http://local-ot/validation/392", +# :identifier=>"split1,split1,split2,split2"} + + + #post "/report/validation",{:validation_uris=>"http://local-ot/validation/171"} + #post "/report/validation",{:validation_uris=>"http://local-ot/validation/389"} + + #dataset_uri = OpenTox::Dataset.create_from_csv_file(File.new("data/EPAFHM.csv").path, nil).uri + #puts dataset_uri + +# #dataset_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/603306?feature_uris[]=http://apps.ideaconsult.net:8080/ambit2/feature/764036" +# #dataset_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/603204" +# post "/plain_training_test_split",{:dataset_uri=>dataset_uri, :stratified=>"true", :split_ratio=>0.3} +# puts last_response.body +# uri = last_response.body +# rep = wait_for_task(uri) +# puts rep + #OpenTox::RestClientWrapper.post("http://opentox.informatik.uni-freiburg.de/validation/plain_training_test_split", + # {:dataset_uri=>dataset_uri, :stratified=>"true", :split_ratio=>0.7407407407}) + + #puts OpenTox::Dataset.create_from_csv_file(File.new("data/hamster_carcinogenicity.csv").path, nil).uri + #puts OpenTox::Dataset.create_from_csv_file(File.new("data/multi_cell_call.csv").path, nil).uri + + #puts OpenTox::Dataset.find("http://opentox.informatik.uni-freiburg.de/dataset/98").compounds.size + +# +# #post "/plain_training_test_split",{:dataset_uri=>"http://apps.ideaconsult.net:8080/ambit2/dataset/603204", :stratified=>"true"} +# +# +# + + # post "/validate_datasets",{ # :test_dataset_uri=>"http://local-ot/dataset/6907", # :prediction_dataset_uri=>"http://local-ot/dataset/6909", @@ -71,20 +125,19 @@ class ValidationTest < Test::Unit::TestCase # #:regression=>"true"} # :classification=>"true"} # -# puts last_response.body -# uri = last_response.body -# rep = wait_for_task(uri) -# puts rep + #get 'crossvalidation/19/statistics' #get 'crossvalidation/189/statistics' #puts last_response.body -# run_test("1b") + + #run_test("13a") + # run_test("1a",:validation_uri=>"http://local-ot/validation/513") #get '/crossvalidation/79/predictions',nil,'HTTP_ACCEPT' => "application/x-yaml" #puts last_response.body - run_test("22f") #,:validation_uri=>"http://local-ot/validation/84" ) + # run_test("22f") #,:validation_uri=>"http://local-ot/validation/84" ) #run_test("21b") @@ -109,12 +162,6 @@ class ValidationTest < Test::Unit::TestCase # puts rep # 205 206 207 -# post '/report/algorithm_comparison',{:validation_uris=>"http://local-ot/validation/crossvalidation/149,http://local-ot/validation/crossvalidation/210", -# :identifier=>"bbrc,last"} -# uri = last_response.body -# rep = wait_for_task(uri) -# puts rep - #run_test("1a", {:validation_uri=>"http://local-ot/validation/305"}) # puts "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" 
#run_test("3a",{:validation_uri=>"http://local-ot/validation/crossvalidation/6"}) @@ -123,14 +170,33 @@ class ValidationTest < Test::Unit::TestCase # puts "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" #run_test("14a") #,{:validation_uri=>"http://local-ot/validation/crossvalidation/148"}) # puts "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" - - #run_test("1a") -# run_test("3d",{ -# :dataset_uri => "http://local-ot/dataset/2897", -# :prediction_feature => "http://local-ot/dataset/2897/feature/Hamster%20Carcinogenicity", + #run_test("3a") + #run_test("3d",{ + # :dataset_uri => "http://local-ot/dataset/447", + # :prediction_feature => "http://local-ot/dataset/447/feature/Hamster%20Carcinogenicity", + # :random_seed => 1 + # }) + + #run_test("23a") + run_test("23a",{:validation_uri=>"http://local-ot/validation/crossvalidation/53"}) + #run_test("23a",{:validation_uri=>"http://local-ot/validation/crossvalidation/47"}) + #23a loo {:validation_uri=>"http://local-ot/validation/crossvalidation/47"}) + #loo mit datasets auf ortona {:validation_uri=>"http://local-ot/validation/crossvalidation/46"} + +# run_test("14d",{ +# :dataset_uri => "http://local-ot/dataset/508", +# :prediction_feature => "http://local-ot/dataset/508/feature/LC50_mmol", # :random_seed => 1 # }) + + #post '/report/algorithm_comparison',{ + # :validation_uris=>"http://local-ot/validation/crossvalidation/9,http://local-ot/validation/crossvalidation/10", + # :identifier=>"bbrc,last", + # :ttest_attributes=>"num_instances,num_without_class,num_unpredicted,real_runtime,percent_without_class,percent_unpredicted"} + #uri = last_response.body + #rep = wait_for_task(uri) + #puts rep #run_test("14",{ # :dataset_uri => "http://local-ot/dataset/3877", @@ -189,6 +255,10 @@ class ValidationTest < Test::Unit::TestCase LOGGER.debug "validation done '"+ex.validation_uri.to_s+"'" end + + #ex.compute_dataset_size + #break + if !delete and ex.validation_uri if SUBJECTID puts ex.validation_uri+"?subjectid="+CGI.escape(SUBJECTID) -- cgit v1.2.3 From e3da22ccc8aa7c7f808b5ebf3d3a1539d59eade0 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Fri, 2 Mar 2012 17:09:09 +0100 Subject: fix: do not always delete feature datasets in loo-cv --- validation/validation_application.rb | 2 +- validation/validation_service.rb | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/validation/validation_application.rb b/validation/validation_application.rb index b07e814..b29b7ff 100755 --- a/validation/validation_application.rb +++ b/validation/validation_application.rb @@ -97,7 +97,7 @@ post '/crossvalidation/loo/?' 
do cv.perform_cv( params[:prediction_feature], params[:algorithm_params], OpenTox::SubTask.create(task,0,95)) # computation of stats is cheap as dataset are already loaded into the memory Validation::Validation.from_cv_statistics( cv.id, @subjectid, OpenTox::SubTask.create(task,95,100) ) - cv.clean_loo_files + cv.clean_loo_files( !(params[:algorithm_params] && params[:algorithm_params] =~ /feature_dataset_uri/) ) cv.crossvalidation_uri end return_task(task) diff --git a/validation/validation_service.rb b/validation/validation_service.rb index 527e5ca..c6a0299 100755 --- a/validation/validation_service.rb +++ b/validation/validation_service.rb @@ -305,17 +305,19 @@ module Validation perform_cv_validations( algorithm_params, OpenTox::SubTask.create(task, 33, 100) ) end - def clean_loo_files + def clean_loo_files( delete_feature_datasets ) Validation.find( :crossvalidation_id => self.id, :validation_type => "crossvalidation" ).each do |v| LOGGER.debug "loo-cleanup> delete training dataset "+v.training_dataset_uri OpenTox::RestClientWrapper.delete v.training_dataset_uri,subjectid - begin - model = OpenTox::Model::Generic.find(v.model_uri) - if model.metadata[OT.featureDataset] - LOGGER.debug "loo-cleanup> delete feature dataset "+model.metadata[OT.featureDataset] - OpenTox::RestClientWrapper.delete model.metadata[OT.featureDataset],subjectid + if (delete_feature_datasets) + begin + model = OpenTox::Model::Generic.find(v.model_uri) + if model.metadata[OT.featureDataset] + LOGGER.debug "loo-cleanup> delete feature dataset "+model.metadata[OT.featureDataset] + OpenTox::RestClientWrapper.delete model.metadata[OT.featureDataset],subjectid + end + rescue end - rescue end end end -- cgit v1.2.3 From 628aa17f792d501330c7b79a021ac8621bc3c401 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Tue, 13 Mar 2012 10:16:55 +0100 Subject: set correct creator URI in cv dataset folds --- validation/validation_service.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validation/validation_service.rb b/validation/validation_service.rb index c6a0299..25081f4 100755 --- a/validation/validation_service.rb +++ b/validation/validation_service.rb @@ -497,7 +497,7 @@ module Validation '_f'+n.to_s+'of'+self.num_folds.to_s+ '_r'+self.random_seed.to_s+ '_s'+self.stratified.to_s - source = $url_provider.url_for('/crossvalidation',:full) + source = self.crossvalidation_uri test_compounds = [] train_compounds = [] -- cgit v1.2.3 From df14373220686355df347d05412a5c6c3c1fe034 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Tue, 13 Mar 2012 10:24:53 +0100 Subject: skip results in loocv-reports --- report/report_factory.rb | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/report/report_factory.rb b/report/report_factory.rb index 2bb74ee..4472e6c 100755 --- a/report/report_factory.rb +++ b/report/report_factory.rb @@ -175,7 +175,8 @@ module Reports::ReportFactory report.end_section report.add_result(validation_set, [:validation_uri, :validation_report_uri]+VAL_ATTR_CV+VAL_ATTR_CLASS-[:num_folds, :dataset_uri, :algorithm_uri], - "Results","Results") + "Results","Results") if + (validation_set.unique_value(:num_folds) < validation_set.unique_value(:num_instances)) when "regression" report.add_result(cv_set, [:crossvalidation_uri]+VAL_ATTR_CV+VAL_ATTR_REGR-[:crossvalidation_fold],res_titel, res_titel, res_text) report.add_section("Plots") @@ -189,11 +190,13 @@ module Reports::ReportFactory report.end_section report.add_result(validation_set, [:validation_uri, 
:validation_report_uri]+VAL_ATTR_CV+VAL_ATTR_REGR-[:num_folds, :dataset_uri, :algorithm_uri], - "Results","Results") + "Results","Results") if + (validation_set.unique_value(:num_folds) < validation_set.unique_value(:num_instances)) end task.progress(90) if task - report.add_result(validation_set, Validation::ALL_PROPS, "All Results", "All Results") + report.add_result(validation_set, Validation::ALL_PROPS, "All Results", "All Results") if + (validation_set.unique_value(:num_folds) < validation_set.unique_value(:num_instances)) report.add_predictions( validation_set ) #, [:crossvalidation_fold] ) task.progress(100) if task report -- cgit v1.2.3 From ad597754c030535c7e40016a396342d854dfd569 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Wed, 14 Mar 2012 09:43:14 +0100 Subject: fix 'skip results in loocv-reports' --- report/report_factory.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/report/report_factory.rb b/report/report_factory.rb index 4472e6c..716210c 100755 --- a/report/report_factory.rb +++ b/report/report_factory.rb @@ -176,7 +176,7 @@ module Reports::ReportFactory report.add_result(validation_set, [:validation_uri, :validation_report_uri]+VAL_ATTR_CV+VAL_ATTR_CLASS-[:num_folds, :dataset_uri, :algorithm_uri], "Results","Results") if - (validation_set.unique_value(:num_folds) < validation_set.unique_value(:num_instances)) + (cv_set.unique_value(:num_folds) < cv_set.unique_value(:num_instances)) when "regression" report.add_result(cv_set, [:crossvalidation_uri]+VAL_ATTR_CV+VAL_ATTR_REGR-[:crossvalidation_fold],res_titel, res_titel, res_text) report.add_section("Plots") @@ -191,12 +191,12 @@ module Reports::ReportFactory report.add_result(validation_set, [:validation_uri, :validation_report_uri]+VAL_ATTR_CV+VAL_ATTR_REGR-[:num_folds, :dataset_uri, :algorithm_uri], "Results","Results") if - (validation_set.unique_value(:num_folds) < validation_set.unique_value(:num_instances)) + (cv_set.unique_value(:num_folds) < cv_set.unique_value(:num_instances)) end task.progress(90) if task report.add_result(validation_set, Validation::ALL_PROPS, "All Results", "All Results") if - (validation_set.unique_value(:num_folds) < validation_set.unique_value(:num_instances)) + (cv_set.unique_value(:num_folds) < cv_set.unique_value(:num_instances)) report.add_predictions( validation_set ) #, [:crossvalidation_fold] ) task.progress(100) if task report -- cgit v1.2.3 From ac9f3ee04f997fa14a88dd7b16a5a6d9ccb8b30e Mon Sep 17 00:00:00 2001 From: mguetlein Date: Thu, 22 Mar 2012 15:35:19 +0100 Subject: fix loo-report-build: make sure values are int --- report/report_factory.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/report/report_factory.rb b/report/report_factory.rb index 716210c..07a5ac5 100755 --- a/report/report_factory.rb +++ b/report/report_factory.rb @@ -176,7 +176,7 @@ module Reports::ReportFactory report.add_result(validation_set, [:validation_uri, :validation_report_uri]+VAL_ATTR_CV+VAL_ATTR_CLASS-[:num_folds, :dataset_uri, :algorithm_uri], "Results","Results") if - (cv_set.unique_value(:num_folds) < cv_set.unique_value(:num_instances)) + (cv_set.unique_value(:num_folds).to_i < cv_set.unique_value(:num_instances).to_i) when "regression" report.add_result(cv_set, [:crossvalidation_uri]+VAL_ATTR_CV+VAL_ATTR_REGR-[:crossvalidation_fold],res_titel, res_titel, res_text) report.add_section("Plots") @@ -191,12 +191,12 @@ module Reports::ReportFactory report.add_result(validation_set, [:validation_uri, 
:validation_report_uri]+VAL_ATTR_CV+VAL_ATTR_REGR-[:num_folds, :dataset_uri, :algorithm_uri], "Results","Results") if - (cv_set.unique_value(:num_folds) < cv_set.unique_value(:num_instances)) + (cv_set.unique_value(:num_folds).to_i < cv_set.unique_value(:num_instances).to_i) end task.progress(90) if task report.add_result(validation_set, Validation::ALL_PROPS, "All Results", "All Results") if - (cv_set.unique_value(:num_folds) < cv_set.unique_value(:num_instances)) + (cv_set.unique_value(:num_folds).to_i < cv_set.unique_value(:num_instances).to_i) report.add_predictions( validation_set ) #, [:crossvalidation_fold] ) task.progress(100) if task report -- cgit v1.2.3 From 8a199a09a6d9ac8b0349af0d7c5b5320bdcec9b5 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Mon, 26 Mar 2012 11:29:14 +0200 Subject: add concordance correlation coefficient, adjust feature value plotting --- lib/predictions.rb | 68 ++++++++++++++++++++++++++++++++++------ lib/validation_db.rb | 2 +- report/plot_factory.rb | 5 +-- report/report_factory.rb | 5 ++- report/validation_access.rb | 39 ++++++++++++++++------- validation/validation_service.rb | 5 +-- 6 files changed, 96 insertions(+), 28 deletions(-) diff --git a/lib/predictions.rb b/lib/predictions.rb index 233267d..348ac44 100755 --- a/lib/predictions.rb +++ b/lib/predictions.rb @@ -577,6 +577,31 @@ module Lib # return weighted_sample_correlation_coefficient ** 2 #end + def concordance_correlation_coefficient + begin + numerator = 0 + @predicted_values.size.times do |i| + numerator += (@actual_values[i]-@actual_mean) * (@predicted_values[i]-@prediction_mean) if + @actual_values[i]!=nil and @predicted_values[i]!=nil + end + numerator *= 2 + denominator = total_sum_of_squares + denominator += prediction_total_sum_of_squares + denominator += @num_predicted * (@actual_mean - @prediction_mean)**2 + ccc = numerator / denominator + ( ccc.infinite? || ccc.nan? ) ? 
0 : ccc + rescue; 0; end + end + + def prediction_total_sum_of_squares + #return @variance_actual * ( @num_predicted - 1 ) + sum = 0 + @predicted_values.size.times do |i| + sum += (@predicted_values[i]-@prediction_mean)**2 if @actual_values[i]!=nil and @predicted_values[i]!=nil + end + sum + end + def sample_correlation_coefficient begin # formula see http://en.wikipedia.org/wiki/Correlation_and_dependence#Pearson.27s_product-moment_coefficient @@ -804,22 +829,45 @@ module Lib end puts "num values "+p.size.to_s - pred = Predictions.new(p,a,c,"regression") + #a = [1.0,2.0, 3.0,4.0, 5.0] + #p = [1.5,2.25,3.0,3.75,4.5] + + #a = [1.0,2.0,3.0,4.0,5.0] + #p = [1.5,2.5,3.5,4.5,5.5] + + #p = a.collect{|v| v-0.5} + #p = a.collect{|v| v+0.5} + + #p = [2.0,2.5,3.0,3.5,4.0] + + c = Array.new(p.size,nil) + + data = { :predicted_values => p, :actual_values => a, :confidence_values => c, + :feature_type => "regression", :accept_values => nil } + + pred = Predictions.new(data) puts "internal" #puts "r-square old "+pred.r_square_old.to_s puts "cor "+pred.sample_correlation_coefficient.to_s - puts "weighted cor "+pred.weighted_sample_correlation_coefficient.to_s + #puts "weighted cor "+pred.weighted_sample_correlation_coefficient.to_s puts "r-square "+pred.r_square.to_s + puts "ccc "+pred.concordance_correlation_coefficient.to_s puts "R" - @@r = RinRuby.new(true,false) unless defined?(@@r) and @@r - @@r.assign "v1",a - @@r.assign "v2",p - puts "r cor "+@@r.pull("cor(v1,v2)").to_s - @@r.eval "fit <- lm(v1 ~ v2)" - @@r.eval "sum <- summary(fit)" - puts "r r-square "+@@r.pull("sum$r.squared").to_s - puts "r adjusted-r-square "+@@r.pull("sum$adj.r.squared").to_s + rutil = OpenTox::RUtil.new + + rutil.r.assign "v1",a + rutil.r.assign "v2",p + puts "r cor "+rutil.r.pull("cor(v1,v2)").to_s + rutil.r.eval "fit <- lm(v1 ~ v2)" + rutil.r.eval "sum <- summary(fit)" + puts "r r-square "+rutil.r.pull("sum$r.squared").to_s + puts "r adjusted-r-square "+rutil.r.pull("sum$adj.r.squared").to_s + rutil.r.eval "save.image(\"/tmp/image.R\")" + #rutil.r.eval "require(epiR)" + #rutil.r.eval "tmp.ccc <- epi.ccc(v1,v2)" + #puts "r ccc "+rutil.r.pull("tmp.ccc$rho.c$est").to_s + rutil.quit_r end def prediction_feature_value_map(proc) diff --git a/lib/validation_db.rb b/lib/validation_db.rb index c3a3f71..7d83966 100755 --- a/lib/validation_db.rb +++ b/lib/validation_db.rb @@ -38,7 +38,7 @@ module Validation # :regression_statistics VAL_REGR_PROPS = [ :root_mean_squared_error, :mean_absolute_error, :r_square, :weighted_r_square, :target_variance_actual, :target_variance_predicted, :sum_squared_error, :sample_correlation_coefficient, - :weighted_mean_absolute_error, :weighted_root_mean_squared_error ] + :weighted_mean_absolute_error, :weighted_root_mean_squared_error, :concordance_correlation_coefficient ] CROSS_VAL_PROPS = [:dataset_uri, :num_folds, :stratified, :random_seed] CROSS_VAL_PROPS_REDUNDANT = [:crossvalidation_uri, :algorithm_uri, :date] + CROSS_VAL_PROPS diff --git a/report/plot_factory.rb b/report/plot_factory.rb index f114dd3..6e90dbc 100644 --- a/report/plot_factory.rb +++ b/report/plot_factory.rb @@ -4,7 +4,8 @@ ENV['RANK_PLOTTER_JAR'] = "RankPlotter/RankPlotter.jar" unless ENV['RANK_PLOTTER CONF_PLOT_RANGE = { :accuracy => [0.45,1.05], :true_positive_rate => [0.45,1.05],:true_negative_rate => [0.45,1.05], :false_positive_rate => [0.45,1.05], :false_negative_rate => [0.45,1.05], :positive_predictive_value => [0.45,1.05], - :negative_predictive_value => [0.45,1.05], :r_square => [0, 1.05], 
:sample_correlation_coefficient => [0, 1.05] } + :negative_predictive_value => [0.45,1.05], :r_square => [0, 1.05], :sample_correlation_coefficient => [0, 1.05], + :concordance_correlation_coefficient => [0, 1.05] } class Array def swap!(i,j) @@ -124,7 +125,7 @@ module Reports else Reports::r_util.feature_value_plot(out_files, validation_set.validations[0].training_feature_dataset_uri, validation_set.validations[0].test_feature_dataset_uri, "Training Data", "Test Data", - nil, true, validation_set.validations[0].subjectid, waiting_task ) + nil, validation_set.validations[0].subjectid, waiting_task ) end end diff --git a/report/report_factory.rb b/report/report_factory.rb index 07a5ac5..f51b999 100755 --- a/report/report_factory.rb +++ b/report/report_factory.rb @@ -9,7 +9,7 @@ VAL_ATTR_CLASS = [ :num_instances, :num_unpredicted, :accuracy, :weighted_accura :area_under_roc, :f_measure, :true_positive_rate, :true_negative_rate, :positive_predictive_value, :negative_predictive_value ] VAL_ATTR_REGR = [ :num_instances, :num_unpredicted, :root_mean_squared_error, :weighted_root_mean_squared_error, :mean_absolute_error, :weighted_mean_absolute_error, :r_square, :weighted_r_square, - :sample_correlation_coefficient ] + :sample_correlation_coefficient, :concordance_correlation_coefficient ] #VAL_ATTR_BOX_PLOT_CLASS = [ :accuracy, :average_area_under_roc, # :area_under_roc, :f_measure, :true_positive_rate, :true_negative_rate ] @@ -113,6 +113,9 @@ module Reports::ReportFactory report.add_confidence_plot(validation_set, :root_mean_squared_error, nil) report.add_confidence_plot(validation_set, :r_square, nil) report.align_last_two_images "Confidence Plots" + report.add_confidence_plot(validation_set, :sample_correlation_coefficient, nil) + report.add_confidence_plot(validation_set, :concordance_correlation_coefficient, nil) + report.align_last_two_images "More Confidence Plots" end task.progress(70) if task report.add_train_test_plot( validation_set, false, OpenTox::SubTask.create(task,70,80) ) diff --git a/report/validation_access.rb b/report/validation_access.rb index aaa7bdc..e2a3978 100755 --- a/report/validation_access.rb +++ b/report/validation_access.rb @@ -145,20 +145,35 @@ class Reports::ValidationDB end def test_feature_dataset_uri(validation, subjectid) - m = OpenTox::Model::Generic.find(validation.model_uri, subjectid) - feat_gen = nil - m.metadata[OT.parameters].each do |h| - if h[DC.title] and h[DC.title]=~/feature_generation/ and h[OT.paramValue] - feat_gen = h[OT.paramValue] + training_features = Lib::DatasetCache.find( training_feature_dataset_uri(validation,subjectid), subjectid ) + test_dataset = Lib::DatasetCache.find( validation.test_dataset_uri, subjectid ) + features_found = true + training_features.features.keys.each do |f| + unless test_dataset.features.keys.include?(f) + features_found = false + LOGGER.debug "training-feature are not in test-datset #{f}" break end - end if m and m.metadata[OT.parameters] - raise "no feature creation alg found" unless feat_gen - feat_gen = File.join(feat_gen,"match") if feat_gen=~/fminer/ - uri = OpenTox::RestClientWrapper.post(feat_gen,{:subjectid => subjectid, - :feature_dataset_uri=>training_feature_dataset_uri(validation,subjectid), - :dataset_uri=>validation.test_dataset_uri}) - @@tmp_resources << uri + end + if features_found + LOGGER.debug "all training-features found in test-datset" + uri = test_dataset.uri + else + m = OpenTox::Model::Generic.find(validation.model_uri, subjectid) + feat_gen = nil + m.metadata[OT.parameters].each do 
|h| + if h[DC.title] and h[DC.title]=~/feature_generation/ and h[OT.paramValue] + feat_gen = h[OT.paramValue] + break + end + end if m and m.metadata[OT.parameters] + raise "no feature creation alg found" unless feat_gen + feat_gen = File.join(feat_gen,"match") if feat_gen=~/fminer/ + uri = OpenTox::RestClientWrapper.post(feat_gen,{:subjectid => subjectid, + :feature_dataset_uri=>training_feature_dataset_uri(validation,subjectid), + :dataset_uri=>validation.test_dataset_uri}) + @@tmp_resources << uri + end uri end diff --git a/validation/validation_service.rb b/validation/validation_service.rb index 25081f4..686a287 100755 --- a/validation/validation_service.rb +++ b/validation/validation_service.rb @@ -210,9 +210,10 @@ module Validation algorithm_uri = self.algorithm_uri ? nil : model.metadata[OT.algorithm] predicted_variable = model.predicted_variable(self.subjectid) predicted_confidence = model.predicted_confidence(self.subjectid) - raise "cannot determine whether model '"+model.uri.to_s+"' performs classification or regression, "+ + raise "cannot determine whether model '"+model.uri.to_s+"' performs classification or regression: '#{feature_type}', "+ "please set rdf-type of predictedVariables feature '"+predicted_variable.to_s+ - "' to NominalFeature or NumericFeature" if (feature_type.to_s!="classification" and feature_type.to_s!="regression") + "' to NominalFeature or NumericFeature" if + (feature_type.to_s!="classification" and feature_type.to_s!="regression") compute_prediction_data( feature_type, predicted_variable, predicted_confidence, prediction_feature, algorithm_uri, task ) end -- cgit v1.2.3 From 324e7f2dc7c8417fd5af0a084f06ecc92de41d48 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Mon, 2 Apr 2012 16:11:28 +0200 Subject: new stratified type super added to crossvalidation and traning-test-split validation, add some more metadata to crossvaldiation, add validation_uri to predictions in crossvaldiation-report --- lib/ot_predictions.rb | 21 ++-- lib/validation_db.rb | 18 ++- report/report_content.rb | 11 +- report/report_factory.rb | 4 +- validation/validation_application.rb | 46 +++++--- validation/validation_service.rb | 217 ++++++++++++++++------------------- 6 files changed, 163 insertions(+), 154 deletions(-) diff --git a/lib/ot_predictions.rb b/lib/ot_predictions.rb index 3be845b..2752fcc 100755 --- a/lib/ot_predictions.rb +++ b/lib/ot_predictions.rb @@ -35,7 +35,7 @@ module Lib OTPredictions.to_array( [self] ) end - def self.to_array( predictions, add_pic=false, format=false ) + def self.to_array( predictions, add_pic=false, format=false, validation_uris=nil ) confidence_available = false predictions.each do |p| @@ -43,7 +43,10 @@ module Lib end res = [] conf_column = nil + count = 0 predictions.each do |p| + v_uris = validation_uris[count] if validation_uris + count += 1 (0..p.num_instances-1).each do |i| a = [] @@ -75,6 +78,9 @@ module Lib conf_column = a.size if conf_column==nil a << p.confidence_value(i) end + if validation_uris + a << v_uris[i] + end a << p.identifier(i) res << a end @@ -90,12 +96,13 @@ module Lib end end header = [] - header << "compound" if add_pic - header << "actual value" - header << "predicted value" - header << "classification" if predictions[0].feature_type=="classification" - header << "confidence value" if predictions[0].confidence_values_available? 
- header << "compound-uri" + header << "Compound" if add_pic + header << "Actual value" + header << "Predicted value" + header << "Classification" if predictions[0].feature_type=="classification" + header << "Confidence value" if predictions[0].confidence_values_available? + header << "Validation URI" if validation_uris + header << "Compound URI" res.insert(0, header) return res diff --git a/lib/validation_db.rb b/lib/validation_db.rb index 7d83966..086853e 100755 --- a/lib/validation_db.rb +++ b/lib/validation_db.rb @@ -6,8 +6,9 @@ require "lib/merge.rb" module Validation - VAL_PROPS_GENERAL = [ :validation_uri, :validation_type, :model_uri, :algorithm_uri, :training_dataset_uri, :prediction_feature, - :test_dataset_uri, :test_target_dataset_uri, :prediction_dataset_uri, :date ] + VAL_PROPS_GENERAL = [ :validation_uri, :validation_type, :model_uri, :algorithm_uri, :algorithm_params, + :training_dataset_uri, :prediction_feature, :test_dataset_uri, :test_target_dataset_uri, + :prediction_dataset_uri, :date ] VAL_PROPS_SUM = [ :num_instances, :num_without_class, :num_unpredicted ] VAL_PROPS_AVG = [:real_runtime, :percent_without_class, :percent_unpredicted ] VAL_PROPS = VAL_PROPS_GENERAL + VAL_PROPS_SUM + VAL_PROPS_AVG @@ -41,7 +42,8 @@ module Validation :weighted_mean_absolute_error, :weighted_root_mean_squared_error, :concordance_correlation_coefficient ] CROSS_VAL_PROPS = [:dataset_uri, :num_folds, :stratified, :random_seed] - CROSS_VAL_PROPS_REDUNDANT = [:crossvalidation_uri, :algorithm_uri, :date] + CROSS_VAL_PROPS + CROSS_VAL_PROPS_REDUNDANT = [:crossvalidation_uri, :algorithm_uri, :algorithm_params, + :prediction_feature, :date] + CROSS_VAL_PROPS ALL_PROPS = VAL_PROPS + VAL_CV_PROPS + VAL_CLASS_PROPS + VAL_REGR_PROPS + CROSS_VAL_PROPS @@ -55,6 +57,7 @@ module Validation attribute :validation_type attribute :model_uri attribute :algorithm_uri + attribute :algorithm_params attribute :training_dataset_uri attribute :test_target_dataset_uri attribute :test_dataset_uri @@ -77,6 +80,11 @@ module Validation index :model_uri index :validation_type index :crossvalidation_id + index :algorithm_uri + index :algorithm_params + index :prediction_feature + index :training_dataset_uri + index :test_dataset_uri attr_accessor :subjectid @@ -141,6 +149,8 @@ module Validation class Crossvalidation < Ohm::Model attribute :algorithm_uri + attribute :algorithm_params + attribute :prediction_feature attribute :dataset_uri attribute :date attribute :num_folds @@ -152,6 +162,8 @@ module Validation attr_accessor :subjectid index :algorithm_uri + index :algorithm_params + index :prediction_feature index :dataset_uri index :num_folds index :random_seed diff --git a/report/report_content.rb b/report/report_content.rb index 80473c5..033b367 100755 --- a/report/report_content.rb +++ b/report/report_content.rb @@ -63,20 +63,17 @@ class Reports::ReportContent end end - def add_predictions( validation_set, - validation_attributes=[], + def add_predictions( validation_set, + add_validation_uris, section_title="Predictions", section_text=nil, table_title="Predictions") - - #PENING - raise "validation attributes not implemented in get prediction array" if validation_attributes.size>0 - section_table = @xml_report.add_section(@current_section, section_title) if validation_set.validations[0].get_predictions @xml_report.add_paragraph(section_table, section_text) if section_text + v_uris = validation_set.validations.collect{|v| Array.new(v.num_instances.to_i,v.validation_uri)} if add_validation_uris 
@xml_report.add_table(section_table, table_title, Lib::OTPredictions.to_array(validation_set.validations.collect{|v| v.get_predictions}, - true, true)) + true, true, v_uris)) else @xml_report.add_paragraph(section_table, "No prediction info available.") end diff --git a/report/report_factory.rb b/report/report_factory.rb index f51b999..f73ffd9 100755 --- a/report/report_factory.rb +++ b/report/report_factory.rb @@ -124,7 +124,7 @@ module Reports::ReportFactory report.end_section report.add_result(validation_set, Validation::ALL_PROPS, "All Results", "All Results") - report.add_predictions( validation_set ) + report.add_predictions( validation_set, false ) task.progress(100) if task report end @@ -200,7 +200,7 @@ module Reports::ReportFactory report.add_result(validation_set, Validation::ALL_PROPS, "All Results", "All Results") if (cv_set.unique_value(:num_folds).to_i < cv_set.unique_value(:num_instances).to_i) - report.add_predictions( validation_set ) #, [:crossvalidation_fold] ) + report.add_predictions( validation_set, true ) task.progress(100) if task report end diff --git a/validation/validation_application.rb b/validation/validation_application.rb index 60fc7df..1bc55f6 100755 --- a/validation/validation_application.rb +++ b/validation/validation_application.rb @@ -6,8 +6,16 @@ end require 'lib/dataset_cache.rb' require 'validation/validation_service.rb' +helpers do + def check_stratified(params) + params[:stratified] = "false" unless params[:stratified] + raise OpenTox::BadRequestError.new "stratified != true|false|super, is #{params[:stratified]}" unless + params[:stratified]=~/true|false|super/ + end +end + get '/crossvalidation/?' do - LOGGER.info "list all crossvalidations" + LOGGER.info "list all crossvalidations "+params.inspect model_uri = params.delete("model") || params.delete("model_uri") if model_uri model = OpenTox::Model::Generic.find(model_uri, @subjectid) @@ -46,17 +54,20 @@ post '/crossvalidation/?' do raise OpenTox::BadRequestError.new "prediction_feature missing" unless params[:prediction_feature].to_s.size>0 raise OpenTox::BadRequestError.new "illegal param-value num_folds: '"+params[:num_folds].to_s+"', must be integer >1" unless params[:num_folds]==nil or params[:num_folds].to_i>1 - + check_stratified(params) + task = OpenTox::Task.create( "Perform crossvalidation", url_for("/crossvalidation", :full) ) do |task| #, params cv_params = { :dataset_uri => params[:dataset_uri], :algorithm_uri => params[:algorithm_uri], + :algorithm_params => params[:algorithm_params], + :prediction_feature => params[:prediction_feature], + :stratified => params[:stratified], :loo => "false", :subjectid => @subjectid } [ :num_folds, :random_seed ].each{ |sym| cv_params[sym] = params[sym] if params[sym] } - cv_params[:stratified] = (params[:stratified].size>0 && params[:stratified]!="false" && params[:stratified]!="0") if params[:stratified] cv = Validation::Crossvalidation.create cv_params cv.subjectid = @subjectid - cv.perform_cv( params[:prediction_feature], params[:algorithm_params], OpenTox::SubTask.create(task,0,95)) + cv.perform_cv( OpenTox::SubTask.create(task,0,95) ) # computation of stats is cheap as dataset are already loaded into the memory Validation::Validation.from_cv_statistics( cv.id, @subjectid, OpenTox::SubTask.create(task,95,100) ) cv.crossvalidation_uri @@ -87,14 +98,16 @@ post '/crossvalidation/loo/?' 
do raise OpenTox::BadRequestError.new "algorithm_uri missing" unless params[:algorithm_uri].to_s.size>0 raise OpenTox::BadRequestError.new "prediction_feature missing" unless params[:prediction_feature].to_s.size>0 raise OpenTox::BadRequestError.new "illegal param: num_folds, stratified, random_seed not allowed for loo-crossvalidation" if params[:num_folds] or - params[:stratifed] or params[:random_seed] + params[:stratified] or params[:random_seed] task = OpenTox::Task.create( "Perform loo-crossvalidation", url_for("/crossvalidation/loo", :full) ) do |task| #, params - cv_params = { :dataset_uri => params[:dataset_uri], + cv_params = { :dataset_uri => params[:dataset_uri], + :algorithm_params => params[:algorithm_params], + :prediction_feature => params[:prediction_feature], :algorithm_uri => params[:algorithm_uri], :loo => "true" } cv = Validation::Crossvalidation.create cv_params cv.subjectid = @subjectid - cv.perform_cv( params[:prediction_feature], params[:algorithm_params], OpenTox::SubTask.create(task,0,95)) + cv.perform_cv( OpenTox::SubTask.create(task,0,95)) # computation of stats is cheap as dataset are already loaded into the memory Validation::Validation.from_cv_statistics( cv.id, @subjectid, OpenTox::SubTask.create(task,95,100) ) cv.clean_loo_files( !(params[:algorithm_params] && params[:algorithm_params] =~ /feature_dataset_uri/) ) @@ -344,12 +357,13 @@ post '/training_test_validation/?' do task = OpenTox::Task.create( "Perform training-test-validation", url_for("/", :full) ) do |task| #, params v = Validation::Validation.create :validation_type => "training_test_validation", :algorithm_uri => params[:algorithm_uri], + :algorithm_params => params[:algorithm_params], :training_dataset_uri => params[:training_dataset_uri], :test_dataset_uri => params[:test_dataset_uri], :test_target_dataset_uri => params[:test_target_dataset_uri], :prediction_feature => params[:prediction_feature] v.subjectid = @subjectid - v.validate_algorithm( params[:algorithm_params], task ) + v.validate_algorithm( task ) v.validation_uri end return_task(task) @@ -403,10 +417,11 @@ post '/bootstrapping' do :test_target_dataset_uri => params[:dataset_uri], :prediction_feature => params[:prediction_feature], :algorithm_uri => params[:algorithm_uri], + :algorithm_params => params[:algorithm_params], :training_dataset_uri => params[:training_dataset_uri], :test_dataset_uri => params[:test_dataset_uri] v.subjectid = @subjectid - v.validate_algorithm( params[:algorithm_params], OpenTox::SubTask.create(task,33,100)) + v.validate_algorithm( OpenTox::SubTask.create(task,33,100)) v.validation_uri end return_task(task) @@ -453,18 +468,19 @@ post '/training_test_split' do raise OpenTox::BadRequestError.new "dataset_uri missing" unless params[:dataset_uri].to_s.size>0 raise OpenTox::BadRequestError.new "algorithm_uri missing" unless params[:algorithm_uri].to_s.size>0 raise OpenTox::BadRequestError.new "prediction_feature missing" unless params[:prediction_feature].to_s.size>0 + check_stratified(params) task = OpenTox::Task.create( "Perform training test split validation", url_for("/training_test_split", :full) ) do |task| #, params - strat = (params[:stratified].size>0 && params[:stratified]!="false" && params[:stratified]!="0") if params[:stratified] params.merge!( Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:prediction_feature], - @subjectid, strat, params[:split_ratio], params[:random_seed], OpenTox::SubTask.create(task,0,33))) + @subjectid, params[:stratified], params[:split_ratio], 
params[:random_seed], OpenTox::SubTask.create(task,0,33))) v = Validation::Validation.create :validation_type => "training_test_split", :training_dataset_uri => params[:training_dataset_uri], :test_dataset_uri => params[:test_dataset_uri], :test_target_dataset_uri => params[:dataset_uri], :prediction_feature => params[:prediction_feature], - :algorithm_uri => params[:algorithm_uri] + :algorithm_uri => params[:algorithm_uri], + :algorithm_params => params[:algorithm_params] v.subjectid = @subjectid - v.validate_algorithm( params[:algorithm_params], OpenTox::SubTask.create(task,33,100)) + v.validate_algorithm( OpenTox::SubTask.create(task,33,100)) v.validation_uri end return_task(task) @@ -546,10 +562,10 @@ end post '/plain_training_test_split' do LOGGER.info "creating pure training test split "+params.inspect raise OpenTox::BadRequestError.new "dataset_uri missing" unless params[:dataset_uri] + check_stratified(params) task = OpenTox::Task.create( "Create data-split", url_for("/plain_training_test_split", :full) ) do |task| - strat = (params[:stratified].size>0 && params[:stratified]!="false" && params[:stratified]!="0") if params[:stratified] result = Validation::Util.train_test_dataset_split(params[:dataset_uri], params[:prediction_feature], @subjectid, - strat, params[:split_ratio], params[:random_seed]) + params[:stratified], params[:split_ratio], params[:random_seed], task) content_type "text/uri-list" result[:training_dataset_uri]+"\n"+result[:test_dataset_uri]+"\n" end diff --git a/validation/validation_service.rb b/validation/validation_service.rb index 686a287..c433161 100755 --- a/validation/validation_service.rb +++ b/validation/validation_service.rb @@ -111,7 +111,7 @@ module Validation end # validates an algorithm by building a model and validating this model - def validate_algorithm( algorithm_params=nil, task=nil ) + def validate_algorithm( task=nil ) raise "validation_type missing" unless self.validation_type raise OpenTox::BadRequestError.new "no algorithm uri: '"+self.algorithm_uri.to_s+"'" if self.algorithm_uri==nil or self.algorithm_uri.to_s.size<1 @@ -301,9 +301,9 @@ module Validation class Crossvalidation - def perform_cv ( prediction_feature, algorithm_params=nil, task=nil ) - create_cv_datasets( prediction_feature, OpenTox::SubTask.create(task, 0, 33) ) - perform_cv_validations( algorithm_params, OpenTox::SubTask.create(task, 33, 100) ) + def perform_cv ( task=nil ) + create_cv_datasets( OpenTox::SubTask.create(task, 0, 33) ) + perform_cv_validations( OpenTox::SubTask.create(task, 33, 100) ) end def clean_loo_files( delete_feature_datasets ) @@ -349,27 +349,27 @@ module Validation end # creates the cv folds - def create_cv_datasets( prediction_feature, task=nil ) + def create_cv_datasets( task=nil ) if self.loo=="true" orig_dataset = Lib::DatasetCache.find(self.dataset_uri,self.subjectid) self.num_folds = orig_dataset.compounds.size self.random_seed = 0 - self.stratified = false + self.stratified = "false" else self.random_seed = 1 unless self.random_seed self.num_folds = 10 unless self.num_folds - self.stratified = false unless self.stratified + self.stratified = "false" unless self.stratified end - if copy_cv_datasets( prediction_feature ) + if copy_cv_datasets() # dataset folds of a previous crossvalidaiton could be used task.progress(100) if task else - create_new_cv_datasets( prediction_feature, task ) + create_new_cv_datasets( task ) end end # executes the cross-validation (build models and validates them) - def perform_cv_validations( algorithm_params, 
task=nil ) + def perform_cv_validations( task=nil ) LOGGER.debug "perform cv validations "+algorithm_params.inspect i = 0 @@ -377,8 +377,7 @@ module Validation @tmp_validations.each do | val | validation = Validation.create val validation.subjectid = self.subjectid - validation.validate_algorithm( algorithm_params, - OpenTox::SubTask.create(task, i * task_step, ( i + 1 ) * task_step) ) + validation.validate_algorithm( OpenTox::SubTask.create(task, i * task_step, ( i + 1 ) * task_step) ) raise "validation '"+validation.validation_uri+"' for crossvaldation could not be finished" unless validation.finished i += 1 @@ -395,14 +394,17 @@ module Validation private # copies datasets from an older crossvalidation on the same dataset and the same folds # returns true if successfull, false otherwise - def copy_cv_datasets( prediction_feature ) + def copy_cv_datasets( ) + # for downwards compatibilty: search prediction_feature=nil is ok cvs = Crossvalidation.find( { :dataset_uri => self.dataset_uri, :num_folds => self.num_folds, :stratified => self.stratified, :random_seed => self.random_seed, :loo => self.loo, - :finished => true} ).reject{ |cv| cv.id == self.id } + :finished => true} ).reject{ |cv| (cv.id == self.id || + (cv.prediction_feature && + cv.prediction_feature != self.prediction_feature)) } cvs.each do |cv| next if AA_SERVER and !OpenTox::Authorization.authorized?(cv.crossvalidation_uri,"GET",self.subjectid) tmp_val = [] @@ -420,7 +422,8 @@ module Validation :crossvalidation_id => self.id, :crossvalidation_fold => v.crossvalidation_fold, :prediction_feature => prediction_feature, - :algorithm_uri => self.algorithm_uri } + :algorithm_uri => self.algorithm_uri, + :algorithm_params => self.algorithm_params } end if tmp_val.size == self.num_folds.to_i @tmp_validations = tmp_val @@ -433,111 +436,78 @@ module Validation # creates cv folds (training and testdatasets) # stores uris in validation objects - def create_new_cv_datasets( prediction_feature, task = nil ) + def create_new_cv_datasets( task = nil ) LOGGER.debug "creating datasets for crossvalidation" orig_dataset = Lib::DatasetCache.find(self.dataset_uri,self.subjectid) raise OpenTox::NotFoundError.new "Dataset not found: "+self.dataset_uri.to_s unless orig_dataset - if self.loo=="true" - shuffled_compounds = orig_dataset.compounds - else - shuffled_compounds = orig_dataset.compounds.shuffle( self.random_seed ) - end + train_dataset_uris = [] + test_dataset_uris = [] - unless self.stratified + meta = { DC.creator => self.crossvalidation_uri } + case stratified + when "false" + if self.loo=="true" + shuffled_compounds = orig_dataset.compounds + else + shuffled_compounds = orig_dataset.compounds.shuffle( self.random_seed ) + end split_compounds = shuffled_compounds.chunk( self.num_folds.to_i ) - else - class_compounds = {} # "inactive" => compounds[], "active" => compounds[] .. 
- accept_values = orig_dataset.accept_values(prediction_feature) - raise OpenTox::BadRequestError.new("cannot apply stratification (not implemented for regression), acceptValue missing for prediction-feature '"+ - prediction_feature.to_s+"' in dataset '"+dataset_uri.to_s+"'") unless accept_values and accept_values.size>0 - accept_values.each do |value| - class_compounds[value] = [] - shuffled_compounds.each do |c| - #PENDING accept values are type string, data_entries may be boolean - class_compounds[value] << c if orig_dataset.data_entries[c][prediction_feature].collect{|v| v.to_s}.include?(value) - end - end - LOGGER.debug "stratified cv: different class values: "+class_compounds.keys.join(", ") - LOGGER.debug "stratified cv: num instances for each class value: "+class_compounds.values.collect{|c| c.size}.join(", ") - - split_class_compounds = [] # inactive_compounds[fold_i][], active_compounds[fold_i][], .. - class_compounds.values.each do |compounds| - split_class_compounds << compounds.chunk( self.num_folds.to_i ) - end - LOGGER.debug "stratified cv: splits for class values: "+split_class_compounds.collect{ |c| c.collect{ |cc| cc.size }.join("/") }.join(", ") - - # we cannot just merge the splits of the different class_values of each fold - # this could lead to folds, which sizes differ for more than 1 compound - split_compounds = [] - split_class_compounds.each do |split_comp| - # step 1: sort current split in ascending order - split_comp.sort!{|x,y| x.size <=> y.size } - # step 2: add splits - (0..self.num_folds.to_i-1).each do |i| - unless split_compounds[i] - split_compounds[i] = split_comp[i] + LOGGER.debug "cv: num instances for each fold: "+split_compounds.collect{|c| c.size}.join(", ") + + self.num_folds.to_i.times do |n| + test_compounds = [] + train_compounds = [] + self.num_folds.to_i.times do |nn| + compounds = split_compounds[nn] + if n == nn + compounds.each{ |compound| test_compounds << compound} else - split_compounds[i] += split_comp[i] - end + compounds.each{ |compound| train_compounds << compound} + end end - # step 3: sort (total) split in descending order - split_compounds.sort!{|x,y| y.size <=> x.size } + raise "internal error, num test compounds not correct,"+ + " is '#{test_compounds.size}', should be '#{(shuffled_compounds.size/self.num_folds.to_i)}'" unless + (shuffled_compounds.size/self.num_folds.to_i - test_compounds.size).abs <= 1 + raise "internal error, num train compounds not correct, should be '"+(shuffled_compounds.size-test_compounds.size).to_s+ + "', is '"+train_compounds.size.to_s+"'" unless shuffled_compounds.size - test_compounds.size == train_compounds.size + datasetname = 'dataset fold '+(n+1).to_s+' of '+self.num_folds.to_s + meta[DC.title] = "training "+datasetname + LOGGER.debug "training set: "+datasetname+"_train, compounds: "+train_compounds.size.to_s + train_dataset_uri = orig_dataset.split( train_compounds, orig_dataset.features.keys, + meta, self.subjectid ).uri + train_dataset_uris << train_dataset_uri + meta[DC.title] = "test "+datasetname + LOGGER.debug "test set: "+datasetname+"_test, compounds: "+test_compounds.size.to_s + test_features = orig_dataset.features.keys.dclone - [self.prediction_feature] + test_dataset_uri = orig_dataset.split( test_compounds, test_features, + meta, self.subjectid ).uri + test_dataset_uris << test_dataset_uri + end + when /true|super/ + if stratified=="true" + features = [ self.prediction_feature ] + else + features = nil end + train_datasets, test_datasets = stratified_k_fold_split(orig_dataset,meta, 
+ "NA",self.num_folds.to_i,@subjectid,self.random_seed, features) + train_dataset_uris = test_datasets.collect{|d| d.uri} + test_dataset_uris = test_datasets.collect{|d| d.uri} + else + raise OpenTox::BadRequestError.new end - LOGGER.debug "cv: num instances for each fold: "+split_compounds.collect{|c| c.size}.join(", ") - - test_features = orig_dataset.features.keys.dclone - [prediction_feature] @tmp_validations = [] - - (1..self.num_folds.to_i).each do |n| - - datasetname = 'cv'+self.id.to_s + - #'_d'+orig_dataset.name.to_s + - '_f'+n.to_s+'of'+self.num_folds.to_s+ - '_r'+self.random_seed.to_s+ - '_s'+self.stratified.to_s - source = self.crossvalidation_uri - - test_compounds = [] - train_compounds = [] - - (1..self.num_folds.to_i).each do |nn| - compounds = split_compounds.at(nn-1) - - if n == nn - compounds.each{ |compound| test_compounds.push(compound)} - else - compounds.each{ |compound| train_compounds.push(compound)} - end - end - - raise "internal error, num test compounds not correct" unless (shuffled_compounds.size/self.num_folds.to_i - test_compounds.size).abs <= 1 - raise "internal error, num train compounds not correct, should be '"+(shuffled_compounds.size-test_compounds.size).to_s+ - "', is '"+train_compounds.size.to_s+"'" unless shuffled_compounds.size - test_compounds.size == train_compounds.size - - LOGGER.debug "training set: "+datasetname+"_train, compounds: "+train_compounds.size.to_s - #train_dataset_uri = orig_dataset.create_new_dataset( train_compounds, orig_dataset.features, datasetname + '_train', source ) - train_dataset_uri = orig_dataset.split( train_compounds, orig_dataset.features.keys, - { DC.title => datasetname + '_train', DC.creator => source }, self.subjectid ).uri - - LOGGER.debug "test set: "+datasetname+"_test, compounds: "+test_compounds.size.to_s - #test_dataset_uri = orig_dataset.create_new_dataset( test_compounds, test_features, datasetname + '_test', source ) - test_dataset_uri = orig_dataset.split( test_compounds, test_features, - { DC.title => datasetname + '_test', DC.creator => source }, self.subjectid ).uri - - #make sure self.id is set - #self.save if self.new? 
+      self.num_folds.to_i.times do |n|
         tmp_validation = { :validation_type => "crossvalidation",
-          :training_dataset_uri => train_dataset_uri,
-          :test_dataset_uri => test_dataset_uri,
+          :training_dataset_uri => train_dataset_uris[n],
+          :test_dataset_uri => test_dataset_uris[n],
           :test_target_dataset_uri => self.dataset_uri,
-          :crossvalidation_id => self.id, :crossvalidation_fold => n,
-          :prediction_feature => prediction_feature,
+          :crossvalidation_id => self.id, :crossvalidation_fold => (n+1),
+          :prediction_feature => self.prediction_feature,
           :algorithm_uri => self.algorithm_uri }
         @tmp_validations << tmp_validation
-        task.progress( n / self.num_folds.to_f * 100 ) if task
       end
     end
@@ -636,7 +606,7 @@ module Validation
     # splits a dataset into test and training dataset
     # returns map with training_dataset_uri and test_dataset_uri
-    def self.train_test_dataset_split( orig_dataset_uri, prediction_feature, subjectid, stratified=false, split_ratio=nil, random_seed=nil, task=nil )
+    def self.train_test_dataset_split( orig_dataset_uri, prediction_feature, subjectid, stratified="false", split_ratio=nil, random_seed=nil, task=nil )
       split_ratio=0.67 unless split_ratio
       split_ratio = split_ratio.to_f
       random_seed=1 unless random_seed
@@ -652,15 +622,25 @@ module Validation
           "' not found in dataset, features are: \n"+
           orig_dataset.features.keys.inspect unless orig_dataset.features.include?(prediction_feature)
       else
-        LOGGER.warn "no prediciton feature given, all features included in test dataset"
+        LOGGER.warn "no prediciton feature given, all features will be included in test dataset"
       end
-      if stratified
+      meta = { DC.creator => $url_provider.url_for('/training_test_split',:full) }
+
+      case stratified
+      when /true|super/
+        if stratified=="true"
+          raise OpenTox::BadRequestError.new "prediction feature required for stratified splits" unless prediction_feature
+          features = [prediction_feature]
+        else
+          LOGGER.warn "prediction feature is ignored for super-stratified splits" if prediction_feature
+          features = nil
+        end
         r_util = OpenTox::RUtil.new
-        split_sets = r_util.stratified_split( orig_dataset, "NA", df, split_ratio, random_seed )
+        train, test = r_util.stratified_split( orig_dataset, meta, "NA", split_ratio, @subjectid, random_seed, features )
         r_util.quit_r
-        result = {:training_dataset_uri => split_sets[0], :test_dataset_uri => split_sets[1]}
-      else
+        result = {:training_dataset_uri => train.uri, :test_dataset_uri => test.uri}
+      when "false"
         compounds = orig_dataset.compounds
         raise OpenTox::BadRequestError.new "Cannot split datset, num compounds in dataset < 2 ("+compounds.size.to_s+")" if compounds.size<2
         split = (compounds.size*split_ratio).to_i
@@ -674,22 +654,18 @@ module Validation
         test_compounds = compounds[(split+1)..-1]
         task.progress(33) if task
+        meta[DC.title] = "Training dataset split of "+orig_dataset.uri
         result = {}
         result[:training_dataset_uri] = orig_dataset.split( training_compounds,
-          orig_dataset.features.keys,
-          { DC.title => "Training dataset split of "+orig_dataset.title.to_s,
-            DC.creator => $url_provider.url_for('/training_test_split',:full) },
-          subjectid ).uri
+          orig_dataset.features.keys, meta, subjectid ).uri
         task.progress(66) if task
+        meta[DC.title] = "Test dataset split of "+orig_dataset.uri
         result[:test_dataset_uri] = orig_dataset.split( test_compounds,
-          orig_dataset.features.keys.dclone - [prediction_feature],
-          { DC.title => "Test dataset split of "+orig_dataset.title.to_s,
-            DC.creator => $url_provider.url_for('/training_test_split',:full) },
-          subjectid ).uri
+          orig_dataset.features.keys.dclone - [prediction_feature], meta, subjectid ).uri
        task.progress(100) if task
-      if !stratified and ENV['RACK_ENV'] =~ /test|debug/
+      if ENV['RACK_ENV'] =~ /test|debug/
         raise OpenTox::NotFoundError.new "Training dataset not found: '"+result[:training_dataset_uri].to_s+"'" unless Lib::DatasetCache.find(result[:training_dataset_uri],subjectid)
         test_data = Lib::DatasetCache.find result[:test_dataset_uri],subjectid
@@ -698,8 +674,9 @@ module Validation
           raise "Test dataset num coumpounds != "+(compounds.size-split-1).to_s+", instead: "+
             test_data.compounds.size.to_s+"\n"+test_data.to_yaml unless test_data.compounds.size==(compounds.size-1-split)
         end
-        LOGGER.debug "split done, training dataset: '"+result[:training_dataset_uri].to_s+"', test dataset: '"+result[:test_dataset_uri].to_s+"'"
+      else
+        raise OpenTox::BadRequestError.new "stratified != false|true|super, is #{stratified}"
       end
       result
     end
-- 
cgit v1.2.3


From 87bd2a6e7521eb27df7b47292d2d46fd5a45443e Mon Sep 17 00:00:00 2001
From: Andreas Maunz
Date: Wed, 11 Apr 2012 09:53:50 +0200
Subject: Fixed forced conversion to string

---
 lib/prediction_data.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/prediction_data.rb b/lib/prediction_data.rb
index 42da5fc..d387d24 100644
--- a/lib/prediction_data.rb
+++ b/lib/prediction_data.rb
@@ -270,7 +270,7 @@ module Lib
     def self.classification_vals(dataset, compound, feature, accept_values)
       v_indices = []
       values(dataset, compound, feature).each do |v|
-        i = accept_values.index(v.to_s)
+        i = accept_values.index(v)
         raise "illegal class_value of prediction (value is '"+v.to_s+"'), accept values are "+
           accept_values.inspect unless v==nil or i!=nil
         v_indices << i
@@ -294,4 +294,4 @@ module Lib
       v_mod
     end
   end
-end
\ No newline at end of file
+end
-- 
cgit v1.2.3


From 61552e6d87fcb3a4df951b9bcd2fcabd841e0a54 Mon Sep 17 00:00:00 2001
From: mguetlein
Date: Tue, 24 Apr 2012 09:32:52 +0200
Subject: add redis index to algorihtm_uris, fix stratified cv r-util import,
 explictly add self to access algorithm_params

---
 report/report_persistance.rb     |  1 +
 validation/validation_service.rb | 12 +++++++-----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/report/report_persistance.rb b/report/report_persistance.rb
index e02387f..ccdebad 100755
--- a/report/report_persistance.rb
+++ b/report/report_persistance.rb
@@ -200,6 +200,7 @@ module Reports
     index :report_type
     index :validation_uris
     index :crossvalidation_uris
+    index :algorithm_uris

     attr_accessor :subjectid

diff --git a/validation/validation_service.rb b/validation/validation_service.rb
index c433161..425c648 100755
--- a/validation/validation_service.rb
+++ b/validation/validation_service.rb
@@ -116,8 +116,8 @@ module Validation
       raise OpenTox::BadRequestError.new "no algorithm uri: '"+self.algorithm_uri.to_s+"'" if self.algorithm_uri==nil or self.algorithm_uri.to_s.size<1
       params = { :dataset_uri => self.training_dataset_uri, :prediction_feature => self.prediction_feature }
-      if (algorithm_params!=nil)
-        algorithm_params.split(";").each do |alg_params|
+      if (self.algorithm_params!=nil)
+        self.algorithm_params.split(";").each do |alg_params|
           alg_param = alg_params.split("=",2)
           raise OpenTox::BadRequestError.new "invalid algorithm param: '"+alg_params.to_s+"'" unless alg_param.size==2 or alg_param[0].to_s.size<1 or alg_param[1].to_s.size<1
           LOGGER.warn "algorihtm param contains empty space, encode? "+alg_param[1].to_s if alg_param[1] =~ /\s/
@@ -371,7 +371,7 @@ module Validation
     # executes the cross-validation (build models and validates them)
     def perform_cv_validations( task=nil )
-      LOGGER.debug "perform cv validations "+algorithm_params.inspect
+      LOGGER.debug "perform cv validations"
       i = 0
       task_step = 100 / self.num_folds.to_f;
       @tmp_validations.each do | val |
@@ -490,9 +490,11 @@ module Validation
         else
           features = nil
         end
-        train_datasets, test_datasets = stratified_k_fold_split(orig_dataset,meta,
+        r_util = OpenTox::RUtil.new
+        train_datasets, test_datasets = r_util.stratified_k_fold_split(orig_dataset,meta,
          "NA",self.num_folds.to_i,@subjectid,self.random_seed, features)
+        r_util.quit_r
-        train_dataset_uris = test_datasets.collect{|d| d.uri}
+        train_dataset_uris = train_datasets.collect{|d| d.uri}
         test_dataset_uris = test_datasets.collect{|d| d.uri}
       else
         raise OpenTox::BadRequestError.new
-- 
cgit v1.2.3


From 14ade1644b69da6229ef6f06f83fc32b2d1957ce Mon Sep 17 00:00:00 2001
From: mguetlein
Date: Tue, 24 Apr 2012 09:51:55 +0200
Subject: fix setting of alogrithm_params

---
 validation/validation_service.rb | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/validation/validation_service.rb b/validation/validation_service.rb
index 425c648..8c8b11f 100755
--- a/validation/validation_service.rb
+++ b/validation/validation_service.rb
@@ -508,7 +508,8 @@ module Validation
           :test_target_dataset_uri => self.dataset_uri,
           :crossvalidation_id => self.id, :crossvalidation_fold => (n+1),
           :prediction_feature => self.prediction_feature,
-          :algorithm_uri => self.algorithm_uri }
+          :algorithm_uri => self.algorithm_uri,
+          :algorithm_params => self.algorithm_params}
         @tmp_validations << tmp_validation
         task.progress( n / self.num_folds.to_f * 100 ) if task
       end
-- 
cgit v1.2.3


From ff53b7b3bf1ffe3447d107870eedbbd00ad71eef Mon Sep 17 00:00:00 2001
From: Andreas Maunz
Date: Tue, 24 Apr 2012 16:02:26 +0200
Subject: Creating R image commented out.

---
 lib/predictions.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/predictions.rb b/lib/predictions.rb
index 348ac44..d929f1a 100755
--- a/lib/predictions.rb
+++ b/lib/predictions.rb
@@ -863,7 +863,7 @@ module Lib
       rutil.r.eval "sum <- summary(fit)"
       puts "r r-square "+rutil.r.pull("sum$r.squared").to_s
       puts "r adjusted-r-square "+rutil.r.pull("sum$adj.r.squared").to_s
-      rutil.r.eval "save.image(\"/tmp/image.R\")"
+      #rutil.r.eval "save.image(\"/tmp/image.R\")"
       #rutil.r.eval "require(epiR)"
       #rutil.r.eval "tmp.ccc <- epi.ccc(v1,v2)"
       #puts "r ccc "+rutil.r.pull("tmp.ccc$rho.c$est").to_s
-- 
cgit v1.2.3


From 42ef331a9127f44559cdfad1d58dea69faed1168 Mon Sep 17 00:00:00 2001
From: rautenberg
Date: Fri, 13 Jul 2012 09:57:48 +0200
Subject: update README date

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4bbec5d..7005b22 100644
--- a/README.md
+++ b/README.md
@@ -6,4 +6,4 @@ OpenTox Validation
 [API documentation](http://rdoc.info/github/opentox/validation)

 --------------------------------------------------------------
-Copyright (c) 2009-2011 Martin Guetlein, Christoph Helma. See LICENSE for details.
+Copyright (c) 2009-2012 Martin Guetlein, Christoph Helma. See LICENSE for details.
-- 
cgit v1.2.3