diff options
author | mguetlein <martin.guetlein@gmail.com> | 2011-12-13 11:20:04 +0100 |
---|---|---|
committer | mguetlein <martin.guetlein@gmail.com> | 2011-12-13 11:20:04 +0100 |
commit | d02b54b2c58d2d71e29700bbedbb38768d6c9e35 (patch) | |
tree | f1605efcc90744581e450bea6e2587dd9e8d7511 /report | |
parent | cc5e2bb442a45351a191d1b69d03412991a20500 (diff) |
add filtering of validation reports
Diffstat (limited to 'report')
-rw-r--r-- | report/plot_factory.rb | 132 | ||||
-rwxr-xr-x | report/report_content.rb | 6 | ||||
-rwxr-xr-x | report/report_factory.rb | 34 | ||||
-rw-r--r-- | report/report_service.rb | 10 | ||||
-rw-r--r-- | report/statistical_test.rb | 22 | ||||
-rwxr-xr-x | report/validation_access.rb | 25 | ||||
-rwxr-xr-x | report/validation_data.rb | 26 |
7 files changed, 203 insertions, 52 deletions
diff --git a/report/plot_factory.rb b/report/plot_factory.rb index 6083d26..2d7946f 100644 --- a/report/plot_factory.rb +++ b/report/plot_factory.rb @@ -338,7 +338,6 @@ module Reports accept_values = validation_set.unique_feature_type=="classification" ? validation_set.get_accept_values : nil if (validation_set.size > 1) - names = []; performance = []; confidence = []; faint = [] sum_confidence_values = { :predicted_values => [], :actual_values => [], :confidence_values => []} @@ -378,19 +377,107 @@ module Reports end def self.demo_roc_plot -# roc_values = {:confidence_values => [0.1, 0.9, 0.5, 0.6, 0.6, 0.6], -# :predicted_values => [1, 0, 0, 1, 0, 1], -# :actual_values => [0, 1, 0, 0, 1, 1]} - roc_values = {:confidence_values => [0.9, 0.8, 0.7, 0.6, 0.5, 0.4], - :true_positives => [1, 1, 1, 0, 1, 0]} - tp_fp_rates = get_tp_fp_rates(roc_values) - labels = [] - tp_fp_rates[:youden].each do |point,confidence| - labels << ["confidence: "+confidence.to_s, point[0], point[1]] - end - + + seed = 831 #rand(1000) + puts seed + srand seed + plot_data = [] - plot_data << RubyPlot::LinePlotData.new(:name => "testname", :x_values => tp_fp_rates[:fp_rate], :y_values => tp_fp_rates[:tp_rate], :labels => labels) + n = 250 + a_cutoff = 0.5 + + a_real = [] + a_class = [] + n.times do |i| + a_real << rand + a_class << ( a_real[-1]>a_cutoff ? "a" : "b") + end + + puts a_real.to_csv + puts a_class.to_csv + + p_props = [[],[]] + p_classes = [] + + 2.times do |index| + + if (index==0) + p_noise = 0.15 + p_cutoff = 0.8 + else + p_noise = 0.5 + p_cutoff = 0.5 + end + + p_real = [] + p_class = [] + p_prop = [] + correct = [] + n.times do |i| + if rand<0.04 + p_real << rand + else + p_real << (a_real[i] + ((rand * p_noise) * (rand<0.5 ? 1 : -1))) + end + p_prop << ((p_cutoff-p_real[i]).abs) + p_class << ( p_real[-1]>p_cutoff ? "a" : "b") + correct << ((p_class[i]==a_class[i]) ? 1 : 0) + end + + puts "" + puts p_real.to_csv + puts p_class.to_csv + puts p_prop.to_csv + + p_prop_max = p_prop.max + p_prop_min = p_prop.min + p_prop_delta = p_prop_max - p_prop_min + n.times do |i| + p_prop[i] = (p_prop[i] - p_prop_min)/p_prop_delta.to_f + p_props[index][i] = p_prop[i] + end + + puts p_prop.to_csv + + p_classes << p_class + + (0..n-2).each do |i| + (i+1..n-1).each do |j| + if p_prop[i]<p_prop[j] + tmp = p_prop[i] + p_prop[i] = p_prop[j] + p_prop[j] = tmp + tmp = correct[i] + correct[i] = correct[j] + correct[j] = tmp + end + end + end + + puts p_prop.to_csv + puts correct.to_csv + puts "acc: "+(correct.sum/n.to_f).to_s + + roc_values = {:confidence_values => p_prop, + :true_positives => correct} + tp_fp_rates = get_tp_fp_rates(roc_values) + labels = [] + tp_fp_rates[:youden].each do |point,confidence| + labels << ["confidence: "+confidence.to_s, point[0], point[1]] + end + + plot_data << RubyPlot::LinePlotData.new(:name => "alg"+index.to_s, + :x_values => tp_fp_rates[:fp_rate], + :y_values => tp_fp_rates[:tp_rate]) + #,:labels => labels) + end + + puts "instance,class,prediction_1,propability_1,prediction_2,propability_2" + n.times do |i| + puts (i+1).to_s+","+a_class[i].to_s+","+p_classes[0][i].to_s+ + ","+p_props[0][i].to_s+ + ","+p_classes[1][i].to_s+","+p_props[1][i].to_s + end RubyPlot::plot_lines("/tmp/plot.png", "ROC-Plot", "False positive rate", @@ -424,7 +511,9 @@ module Reports conf.pop end if (predictions == nil) - predictions = Lib::Predictions.new([p[i]],[a[i]],[c[i]],feature_type, accept_values) + data = {:predicted_values => [p[i]],:actual_values => [a[i]], :confidence_values => [c[i]], + :feature_type => feature_type, :accept_values => accept_values} + predictions = Lib::Predictions.new(data) else predictions.update_stats(p[i], a[i], c[i]) end @@ -528,7 +617,20 @@ end #require "rubygems" #require "ruby-plot" -##Reports::PlotFactory::demo_ranking_plot +###Reports::PlotFactory::demo_ranking_plot +#class Array +# def sum +# inject( nil ) { |sum,x| sum ? sum+x : x } +# end +# +# def to_csv +# s = "" +# each do |x| +# s += (x.is_a?(Float) ? ("%.3f"%x) : (" "+x.to_s) )+", " +# end +# s +# end +#end #Reports::PlotFactory::demo_roc_plot #a = [1, 0, 1, 2, 3, 0, 2] diff --git a/report/report_content.rb b/report/report_content.rb index 61db340..3d92b52 100755 --- a/report/report_content.rb +++ b/report/report_content.rb @@ -22,6 +22,12 @@ class Reports::ReportContent @current_section = @xml_report.get_root_element end + def add_warning(warning) + sec = @xml_report.add_section(@current_section, "Warning") + @xml_report.add_paragraph(sec, warning) + end_section() + end + def add_paired_ttest_tables( validation_set, group_attribute, test_attributes, diff --git a/report/report_factory.rb b/report/report_factory.rb index 484cf12..2b978c5 100755 --- a/report/report_factory.rb +++ b/report/report_factory.rb @@ -63,14 +63,26 @@ module Reports::ReportFactory end end - def self.create_report_validation(validation_set, task=nil) + def self.add_filter_warning(report, filter_params) + msg = "The validation results for this report have been filtered." + msg += " Minimum confidence: "+ filter_params[:min_confidence].to_s if + filter_params[:min_confidence]!=nil + msg += " Minimum number of predictions (sorted with confidence): "+ filter_params[:min_num_predictions].to_s if + filter_params[:min_num_predictions]!=nil + msg += " Maximum number of predictions: "+ filter_params[:max_num_predictions].to_s if + filter_params[:max_num_predictions]!=nil + report.add_warning(msg) + end + + def self.create_report_validation(validation_set, params, task=nil) raise OpenTox::BadRequestError.new("num validations is not equal to 1") unless validation_set.size==1 val = validation_set.validations[0] pre_load_predictions( validation_set, OpenTox::SubTask.create(task,0,80) ) report = Reports::ReportContent.new("Validation report") - + add_filter_warning(report, validation_set.filter_params) if validation_set.filter_params!=nil + case val.feature_type when "classification" report.add_result(validation_set, [:validation_uri] + VAL_ATTR_TRAIN_TEST + VAL_ATTR_CLASS, "Results", "Results") @@ -109,8 +121,9 @@ module Reports::ReportFactory report end - def self.create_report_crossvalidation(validation_set, task=nil) + def self.create_report_crossvalidation(validation_set, params, task=nil) + raise OpenTox::BadRequestError.new "cv report not implemented for filter params" if validation_set.filter_params!=nil raise OpenTox::BadRequestError.new("num validations is not >1") unless validation_set.size>1 raise OpenTox::BadRequestError.new("crossvalidation-id not unique and != nil: "+ validation_set.get_values(:crossvalidation_id,false).inspect) if validation_set.unique_value(:crossvalidation_id)==nil @@ -119,7 +132,7 @@ module Reports::ReportFactory validation_set.unique_value(:num_folds).to_s+")") unless validation_set.unique_value(:num_folds).to_i==validation_set.size raise OpenTox::BadRequestError.new("num different folds is not equal to num validations") unless validation_set.num_different_values(:crossvalidation_fold)==validation_set.size raise OpenTox::BadRequestError.new("validations must have unique feature type, i.e. must be either all regression, "+ - "or all classification validations") unless validation_set.unique_feature_type + "or all classification validations") unless validation_set.unique_feature_type pre_load_predictions( validation_set, OpenTox::SubTask.create(task,0,80) ) validation_set.validations.sort! do |x,y| x.crossvalidation_fold.to_f <=> y.crossvalidation_fold.to_f @@ -138,13 +151,12 @@ module Reports::ReportFactory report.add_confusion_matrix(cv_set.validations[0]) report.add_section("Plots") [nil, :crossvalidation_fold].each do |split_attribute| - if (validation_set.get_accept_values.size == 2) if validation_set.get_true_accept_value!=nil report.add_roc_plot(validation_set, validation_set.get_true_accept_value,split_attribute) else - report.add_roc_plot(validation_set, validation_set.get_accept_values[0],split_attribute) - report.add_roc_plot(validation_set, validation_set.get_accept_values[1],split_attribute) + report.add_roc_plot(validation_set, validation_set.get_accept_values[0], split_attribute) + report.add_roc_plot(validation_set, validation_set.get_accept_values[1], split_attribute) report.align_last_two_images "ROC Plots" end end @@ -156,7 +168,8 @@ module Reports::ReportFactory end end report.end_section - report.add_result(validation_set, [:validation_uri, :validation_report_uri]+VAL_ATTR_CV+VAL_ATTR_CLASS-[:num_folds, :dataset_uri, :algorithm_uri], + report.add_result(validation_set, + [:validation_uri, :validation_report_uri]+VAL_ATTR_CV+VAL_ATTR_CLASS-[:num_folds, :dataset_uri, :algorithm_uri], "Results","Results") when "regression" report.add_result(cv_set, [:crossvalidation_uri]+VAL_ATTR_CV+VAL_ATTR_REGR-[:crossvalidation_fold],res_titel, res_titel, res_text) @@ -169,7 +182,9 @@ module Reports::ReportFactory report.add_confidence_plot(validation_set, :r_square, nil, :crossvalidation_fold) report.align_last_two_images "Confidence Plots Across Folds" report.end_section - report.add_result(validation_set, [:validation_uri, :validation_report_uri]+VAL_ATTR_CV+VAL_ATTR_REGR-[:num_folds, :dataset_uri, :algorithm_uri], "Results","Results") + report.add_result(validation_set, + [:validation_uri, :validation_report_uri]+VAL_ATTR_CV+VAL_ATTR_REGR-[:num_folds, :dataset_uri, :algorithm_uri], + "Results","Results") end task.progress(90) if task @@ -219,6 +234,7 @@ module Reports::ReportFactory pre_load_predictions( validation_set, OpenTox::SubTask.create(task,0,80) ) report = Reports::ReportContent.new("Algorithm comparison report") + add_filter_warning(report, validation_set.filter_params) if validation_set.filter_params!=nil if (validation_set.num_different_values(:dataset_uri)>1) all_merged = validation_set.merge([:algorithm_uri, :dataset_uri, :crossvalidation_id, :crossvalidation_uri]) diff --git a/report/report_service.rb b/report/report_service.rb index f299122..53a17ab 100644 --- a/report/report_service.rb +++ b/report/report_service.rb @@ -72,7 +72,15 @@ module Reports LOGGER.debug "identifier: '"+identifier.inspect+"'" raise "illegal num identifiers: "+identifier.size.to_s+" should be equal to num validation-uris ("+validation_uris.size.to_s+")" if identifier and identifier.size!=validation_uris.size - validation_set = Reports::ValidationSet.new(validation_uris, identifier, subjectid) + + filter_params = nil + [:min_confidence, :min_num_predictions, :max_num_predictions].each do |key| + if params[key] != nil + filter_params = {} unless filter_params + filter_params[key] = params[key].to_f + end + end + validation_set = Reports::ValidationSet.new(validation_uris, identifier, filter_params, subjectid) raise OpenTox::BadRequestError.new("cannot get validations from validation_uris '"+validation_uris.inspect+"'") unless validation_set and validation_set.size > 0 LOGGER.debug "loaded "+validation_set.size.to_s+" validation/s" task.progress(10) if task diff --git a/report/statistical_test.rb b/report/statistical_test.rb index 8d6bd62..4d85555 100644 --- a/report/statistical_test.rb +++ b/report/statistical_test.rb @@ -69,8 +69,8 @@ module Reports def self.paired_ttest( validations1, validations2, attribute, class_value, significance_level=0.95 ) - array1 = validations1.collect{ |v| (v.send(attribute).is_a?(Hash) ? v.send(attribute)[class_value] : v.send(attribute)) } - array2 = validations2.collect{ |v| (v.send(attribute).is_a?(Hash) ? v.send(attribute)[class_value] : v.send(attribute)) } + array1 = validations1.collect{ |v| (v.send(attribute).is_a?(Hash) ? v.send(attribute)[class_value].to_f : v.send(attribute).to_f) } + array2 = validations2.collect{ |v| (v.send(attribute).is_a?(Hash) ? v.send(attribute)[class_value].to_f : v.send(attribute).to_f) } LOGGER.debug "paired-t-testing "+attribute.to_s+" "+array1.inspect+" vs "+array2.inspect LIB::StatisticalTest.pairedTTest(array1, array2, significance_level) end @@ -83,12 +83,16 @@ module Reports end -#t1 = Time.new -#10.times do -# puts LIB::StatisticalTest.pairedTTest([1,2,3,4,5,12,4,2],[2,3,3,3,56,3,4,5]) -#end -#LIB::StatisticalTest.quitR -#t2 = Time.new -#puts t2-t1 +#x=["1.36840891838074", "2.89500403404236", "2.58440494537354", "1.96544003486633", "1.4017288684845", "1.68250012397766", "1.65089893341064", "2.24862003326416", "3.73909902572632", "2.36335206031799"] +#y=["1.9675121307373", "2.30981087684631", "2.59359288215637", "2.62243509292603", "1.98700189590454", "2.26789593696594", "2.03917217254639", "2.69466996192932", "1.96487307548523", "1.65820598602295"] +#puts LIB::StatisticalTest.pairedTTest(x,y) +# +##t1 = Time.new +##10.times do +# puts LIB::StatisticalTest.pairedTTest([1.01,2,3,4,5,12,4,2],[2,3,3,3,56,3,4,5]) +##end +#LIB::StatisticalTest.quit_r +##t2 = Time.new +##puts t2-t1 diff --git a/report/validation_access.rb b/report/validation_access.rb index 3b5335c..536923d 100755 --- a/report/validation_access.rb +++ b/report/validation_access.rb @@ -13,7 +13,7 @@ class Reports::ValidationDB self_uri.host == val_uri.host && self_uri.port == val_uri.port end - def resolve_cv_uris(validation_uris, identifier=nil, subjectid=nil) + def resolve_cv_uris(validation_uris, identifier, subjectid) res = {} count = 0 validation_uris.each do |u| @@ -47,8 +47,8 @@ class Reports::ValidationDB res end - def init_validation(validation, uri, subjectid=nil) - + def init_validation(validation, uri, filter_params, subjectid) + raise OpenTox::BadRequestError.new "not a validation uri: "+uri.to_s unless uri =~ /\/[0-9]+$/ validation_id = uri.split("/")[-1] raise OpenTox::BadRequestError.new "invalid validation id "+validation_id.to_s unless validation_id!=nil and @@ -63,6 +63,9 @@ class Reports::ValidationDB else v = YAML::load(OpenTox::RestClientWrapper.get uri, {:subjectid=>subjectid, :accept=>"application/serialize"}) end + v.filter_predictions(filter_params[:min_confidence], filter_params[:min_num_predictions], filter_params[:max_num_predictions]) if + filter_params + raise OpenTox::NotFoundError.new "validation with id "+validation_id.to_s+" not found" unless v raise OpenTox::BadRequestError.new "validation with id "+validation_id.to_s+" is not finished yet" unless v.finished (Validation::VAL_PROPS + Validation::VAL_CV_PROPS).each do |p| @@ -80,7 +83,7 @@ class Reports::ValidationDB end end - def init_validation_from_cv_statistics( validation, cv_uri, subjectid=nil ) + def init_validation_from_cv_statistics( validation, cv_uri, filter_params, subjectid ) raise OpenTox::BadRequestError.new "not a crossvalidation uri: "+cv_uri.to_s unless cv_uri.uri? and cv_uri =~ /crossvalidation.*\/[0-9]+$/ @@ -96,6 +99,9 @@ class Reports::ValidationDB cv = YAML::load(OpenTox::RestClientWrapper.get cv_uri, {:subjectid=>subjectid, :accept=>"application/serialize"}) v = YAML::load(OpenTox::RestClientWrapper.get cv_uri+"/statistics", {:subjectid=>subjectid, :accept=>"application/serialize"}) end + v.filter_predictions(filter_params[:min_confidence], filter_params[:min_num_predictions], filter_params[:max_num_predictions]) if + filter_params + (Validation::VAL_PROPS + Validation::VAL_CV_PROPS).each do |p| validation.send("#{p.to_s}=".to_sym, v.send(p)) end @@ -126,11 +132,14 @@ class Reports::ValidationDB end end - def get_predictions(validation, subjectid=nil, task=nil) - - Lib::OTPredictions.new( validation.feature_type, validation.test_dataset_uri, + def get_predictions(validation, filter_params, subjectid, task) + # we need compound info, cannot reuse stored prediction data + data = Lib::PredictionData.create( validation.feature_type, validation.test_dataset_uri, validation.test_target_dataset_uri, validation.prediction_feature, validation.prediction_dataset_uri, - validation.predicted_variable, validation.predicted_confidence, subjectid, task) + validation.predicted_variable, validation.predicted_confidence, subjectid, task ) + data = Lib::PredictionData.filter_data( data.data, data.compounds, + filter_params[:min_confidence], filter_params[:min_num_predictions], filter_params[:max_num_predictions] ) if filter_params!=nil + Lib::OTPredictions.new( data.data, data.compounds ) end def get_accept_values( validation, subjectid=nil ) diff --git a/report/validation_data.rb b/report/validation_data.rb index 61761ab..e91348d 100755 --- a/report/validation_data.rb +++ b/report/validation_data.rb @@ -86,18 +86,20 @@ module Reports VAL_ATTR_RANKING.collect{ |a| (a.to_s+"_ranking").to_sym } @@validation_attributes.each{ |a| attr_accessor a } - attr_reader :predictions, :subjectid + attr_reader :predictions, :subjectid, :filter_params attr_accessor :identifier, :validation_report_uri, :crossvalidation_report_uri - def initialize(uri = nil, subjectid = nil) - Reports.validation_access.init_validation(self, uri, subjectid) if uri + def initialize(uri = nil, filter_params=nil, subjectid = nil) + Reports.validation_access.init_validation(self, uri, filter_params, subjectid) if uri @subjectid = subjectid + raise unless filter_params==nil || filter_params.is_a?(Hash) + @filter_params = filter_params #raise "subjectid is nil" unless subjectid end - def self.from_cv_statistics( cv_uri, subjectid = nil ) - v = ReportValidation.new(nil, subjectid) - Reports.validation_access.init_validation_from_cv_statistics(v, cv_uri, subjectid) + def self.from_cv_statistics( cv_uri, filter_params, subjectid ) + v = ReportValidation.new(nil, filter_params, subjectid) + Reports.validation_access.init_validation_from_cv_statistics(v, cv_uri, filter_params, subjectid) v end @@ -116,7 +118,7 @@ module Reports task.progress(100) if task nil else - @predictions = Reports.validation_access.get_predictions( self, @subjectid, task ) + @predictions = Reports.validation_access.get_predictions( self, @filter_params, @subjectid, task ) end end end @@ -167,13 +169,13 @@ module Reports # class ValidationSet - def initialize(validation_uris=nil, identifier=nil, subjectid=nil) + def initialize(validation_uris=nil, identifier=nil, filter_params=nil, subjectid=nil) @unique_values = {} @validations = [] if validation_uris validation_uri_and_ids = ReportValidation.resolve_cv_uris(validation_uris, identifier, subjectid) validation_uri_and_ids.each do |u,id| - v = ReportValidation.new(u, subjectid) + v = ReportValidation.new(u, filter_params, subjectid) v.identifier = id if id ids = Reports.persistance.list_reports("validation",{:validation_uris=>v.validation_uri }) v.validation_report_uri = ReportService.instance.get_uri("validation",ids[-1]) if ids and ids.size>0 @@ -228,6 +230,10 @@ module Reports return false end + def filter_params + @validations.first.filter_params + end + # loads the attributes of the related crossvalidation into all validation objects # def load_cv_attributes @@ -424,7 +430,7 @@ module Reports new_set = ValidationSet.new grouping = Util.group(@validations, [:crossvalidation_id]) grouping.each do |g| - v = ReportValidation.from_cv_statistics(g[0].crossvalidation_uri, g[0].subjectid) + v = ReportValidation.from_cv_statistics(g[0].crossvalidation_uri, @validations.first.filter_params, g[0].subjectid) v.identifier = g.collect{|vv| vv.identifier}.uniq.join(";") new_set.validations << v end |