add confidence plots

author: mguetlein <martin.guetlein@gmail.com> 2011-05-16 14:46:50 +0200
committer: mguetlein <martin.guetlein@gmail.com> 2011-05-16 14:46:50 +0200
commit: eb5f8b5da9b247d62abc8a7b9eb2e44fe46a1c79 (patch)
tree: 99bbf52ad3b7495114ffe50194b8f9c606f5f248 /report
parent: 8afc018a179b254905f93ef8607338a7826baf4e (diff)
4 files changed, 224 insertions, 15 deletions
diff --git a/report/environment.rb b/report/environment.rb
index 19ea3a2..59465aa 100755
--- a/report/environment.rb
+++ b/report/environment.rb
@@ -4,7 +4,7 @@
   'rexml/document',  'ruby-plot', 'opentox-ruby' ].each do |g|
     require g
 end
-gem 'ruby-plot', "~>0.3.0"
+gem 'ruby-plot', "~>0.4.0"
 
 #R.quit
 
diff --git a/report/plot_factory.rb b/report/plot_factory.rb
index 5fd20bb..a4e415a 100644
--- a/report/plot_factory.rb
+++ b/report/plot_factory.rb
@@ -79,7 +79,7 @@ module Reports
       end
   
       raise "no predictions performed" if x.size==0 || x[0].size==0
-      RubyPlot::plot_points(out_file, "Regression plot", "Predicted values", "Actual values", names, x, y )
+      RubyPlot::regression_point_plot(out_file, "Regression plot", "Predicted values", "Actual values", names, x, y )
     end
     
     
@@ -102,7 +102,7 @@ module Reports
         tp_rates = []
         attribute_values.each do |value|
           begin
-            data = transform_predictions(validation_set.filter({split_set_attribute => value}), class_value, false)
+            data = transform_roc_predictions(validation_set.filter({split_set_attribute => value}), class_value, false)
             names << value.to_s
             fp_rates << data[:fp_rate][0]
             tp_rates << data[:tp_rate][0]
@@ -112,11 +112,50 @@ module Reports
         end
         RubyPlot::plot_lines(out_file, "ROC-Plot", "False positive rate", "True Positive Rate", names, fp_rates, tp_rates )
       else
-        data = transform_predictions(validation_set, class_value, show_single_curves)
+        data = transform_roc_predictions(validation_set, class_value, show_single_curves)
         RubyPlot::plot_lines(out_file, "ROC-Plot", "False positive rate", "True Positive Rate", data[:names], data[:fp_rate], data[:tp_rate], data[:faint] )
       end  
     end
     
+    
+    # Plots prediction performance against prediction confidence into out_file.
+    # With split_set_attribute, one curve is drawn per value of that attribute; otherwise
+    # show_single_curves is forwarded to transform_confidence_predictions for the whole set.
+    # Raises if validation_set.unique_feature_type is neither "classification" nor "regression".
+    def self.create_confidence_plot( out_file, validation_set, class_value, split_set_attribute=nil, show_single_curves=false )
+      LOGGER.debug "creating confidence plot for '"+validation_set.size.to_s+"' validations, out-file:"+out_file.to_s
+      if split_set_attribute
+        names = []
+        confidence = []
+        performance = []
+        validation_set.get_values(split_set_attribute).each do |value|
+          begin
+            data = transform_confidence_predictions(validation_set.filter({split_set_attribute => value}), class_value, false)
+            names << value.to_s
+            confidence << data[:confidence][0]
+            performance << data[:performance][0]
+          rescue => ex
+            # best effort: a value that cannot be plotted is skipped, the remaining curves are still drawn
+            LOGGER.warn "could not create confidence plot for "+value.to_s+": "+ex.message
+          end
+        end
+      else
+        data = transform_confidence_predictions(validation_set, class_value, show_single_curves)
+        names, confidence, performance = data[:names], data[:confidence], data[:performance]
+      end
+      feature_type = validation_set.unique_feature_type
+      case feature_type
+      when "classification"
+        RubyPlot::accuracy_confidence_plot(out_file, "Percent Correct vs Confidence Plot", "Confidence", "Percent Correct", names, confidence, performance)
+      when "regression"
+        # NOTE(review): the trailing 'true' is kept as-is; its meaning is defined by ruby-plot - confirm there
+        RubyPlot::accuracy_confidence_plot(out_file, "RMSE vs Confidence Plot", "Confidence", "RMSE", names, confidence, performance, true)
+      else
+        raise "cannot create confidence plot for feature type '"+feature_type.to_s+"'"
+      end
+    end
+    
+    
     def self.create_bar_plot( out_file, validation_set, title_attribute, value_attributes )
   
       LOGGER.debug "creating bar plot, out-file:"+out_file.to_s
@@ -128,7 +167,7 @@ module Reports
       validation_set.validations.each do |v|
         values = []
         value_attributes.each do |a|
-          validation_set.get_domain_for_attr(a).each do |class_value|
+          validation_set.get_accept_values_for_attr(a).each do |class_value|
             value = v.send(a)
             if value.is_a?(Hash)
               if class_value==nil
@@ -222,7 +261,7 @@ module Reports
     end
     
     private
-    def self.transform_predictions(validation_set, class_value, add_single_folds=false)
+    def self.transform_roc_predictions(validation_set, class_value, add_single_folds=false)
       
       if (validation_set.size > 1)
         
@@ -230,7 +269,7 @@ module Reports
         sum_roc_values = { :predicted_values => [], :actual_values => [], :confidence_values => []}
         
         (0..validation_set.size-1).each do |i|
-          roc_values = validation_set.get(i).get_predictions.get_roc_values(class_value)
+          roc_values = validation_set.get(i).get_predictions.get_prediction_values(class_value)
           sum_roc_values[:predicted_values] += roc_values[:predicted_values]
           sum_roc_values[:confidence_values] += roc_values[:confidence_values]
           sum_roc_values[:actual_values] += roc_values[:actual_values]
@@ -253,12 +292,51 @@ module Reports
         faint << false
         return { :names => names, :fp_rate => fp_rate, :tp_rate => tp_rate, :faint => faint }
       else
-        roc_values = validation_set.validations[0].get_predictions.get_roc_values(class_value)
+        roc_values = validation_set.validations[0].get_predictions.get_prediction_values(class_value)
         tp_fp_rates = get_tp_fp_rates(roc_values)
         return { :names => ["default"], :fp_rate => [tp_fp_rates[:fp_rate]], :tp_rate => [tp_fp_rates[:tp_rate]] }
       end
     end
     
+    # Turns the prediction values of validation_set into performance-vs-confidence curves.
+    # Returns parallel arrays :names, :performance, :confidence (one entry per curve); with more
+    # than one validation also :faint, which is true for per-fold curves and false for the summed one.
+    def self.transform_confidence_predictions(validation_set, class_value, add_single_folds=false)
+      feature_type = validation_set.unique_feature_type
+      if (validation_set.size > 1)
+        names = []; performance = []; confidence = []; faint = []
+        sum_confidence_values = { :predicted_values => [], :actual_values => [], :confidence_values => []}
+        (0..validation_set.size-1).each do |i|
+          confidence_values = validation_set.get(i).get_predictions.get_prediction_values(class_value)
+          sum_confidence_values[:predicted_values] += confidence_values[:predicted_values]
+          sum_confidence_values[:confidence_values] += confidence_values[:confidence_values]
+          sum_confidence_values[:actual_values] += confidence_values[:actual_values]
+          if add_single_folds
+            begin
+              # feature_type is required; without it an ArgumentError was rescued here and no fold curve was drawn
+              pref_conf_rates = get_performance_confidence_rates(confidence_values, feature_type)
+              names << "fold "+i.to_s
+              performance << pref_conf_rates[:performance]
+              confidence << pref_conf_rates[:confidence]
+              faint << true
+            rescue => ex
+              LOGGER.warn "could not get confidence vals for fold "+i.to_s+": "+ex.message
+            end
+          end
+        end
+        pref_conf_rates = get_performance_confidence_rates(sum_confidence_values, feature_type)
+        names << nil # "all"
+        performance << pref_conf_rates[:performance]
+        confidence << pref_conf_rates[:confidence]
+        faint << false
+        return { :names => names, :performance => performance, :confidence => confidence, :faint => faint }
+      else
+        confidence_values = validation_set.validations[0].get_predictions.get_prediction_values(class_value)
+        pref_conf_rates = get_performance_confidence_rates(confidence_values, feature_type)
+        return { :names => ["default"], :performance => [pref_conf_rates[:performance]], :confidence => [pref_conf_rates[:confidence]] }
+      end
+    end
+    
     def self.demo_rock_plot
       roc_values = {:confidence_values => [0.1, 0.9, 0.5, 0.6, 0.6, 0.6], 
                     :predicted_values =>  [1, 0, 0, 1, 0, 1],
@@ -271,6 +349,61 @@ module Reports
         "True Positive Rate", data[:names], data[:fp_rate], data[:tp_rate], data[:faint] )
     end
     
+    # Computes a cumulative performance-vs-confidence curve.
+    #
+    # roc_values:   hash with the parallel arrays :confidence_values, :predicted_values
+    #               and :actual_values (the arrays are only read, never reordered)
+    # feature_type: "classification" -> performance is the percentage of correct predictions,
+    #               "regression"     -> performance is the root mean squared error;
+    #               for any other value both result arrays stay empty
+    #
+    # Predictions are visited from highest to lowest confidence; each curve point holds the
+    # performance over all predictions visited so far. A prediction whose confidence is not
+    # at least 0.00001 below the previous one replaces the previous point instead of adding one.
+    #
+    # Returns {:performance => [...], :confidence => [...]}.
+    # Raises if there are no predicted values.
+    def self.get_performance_confidence_rates(roc_values, feature_type)
+
+      c = roc_values[:confidence_values]
+      p = roc_values[:predicted_values]
+      a = roc_values[:actual_values]
+      raise "no prediction values for confidence plot" if p.size==0
+
+      perf = []
+      conf = []
+      return {:performance => perf,:confidence => conf} unless ["classification","regression"].include?(feature_type)
+
+      # sort indices instead of swapping elements pairwise: O(n log n) and the caller's
+      # arrays keep their order (ties may come in any order, they collapse into one point)
+      order = (0..p.size-1).sort_by{ |i| -c[i] }
+
+      # running totals over the predictions visited so far
+      correct = 0
+      sum_squared_error = 0
+      order.each_with_index do |idx, pos|
+        count = pos+1
+        if feature_type=="classification"
+          correct += 1 if p[idx]==a[idx]
+          value = correct/count.to_f * 100
+        else
+          sum_squared_error += (p[idx]-a[idx])**2
+          value = Math.sqrt(sum_squared_error/count.to_f)
+        end
+        # (nearly) equal confidence as the previous point: keep only the latest,
+        # i.e. the value accumulated over the whole group of equal confidences
+        if pos>0 && (c[idx]>=conf[-1]-0.00001)
+          perf.pop
+          conf.pop
+        end
+        perf << value
+        conf << c[idx]
+      end
+
+      return {:performance => perf,:confidence => conf}
+    end
+    
+    
     def self.get_tp_fp_rates(roc_values)
       
       c = roc_values[:confidence_values]
diff --git a/report/report_content.rb b/report/report_content.rb
index 6c8148e..ca04f25 100755
--- a/report/report_content.rb
+++ b/report/report_content.rb
@@ -147,8 +147,8 @@ class Reports::ReportContent
                             image_caption=nil)
                             
     image_title = "Regression plot" unless image_title
-    
-    section_regr = @xml_report.add_section(@current_section, section_title)
+    #section_regr = @xml_report.add_section(@current_section, section_title)
+    section_regr = @current_section
     prediction_set = validation_set.collect{ |v| v.get_predictions }
         
     if prediction_set.size>0
@@ -178,7 +178,8 @@ class Reports::ReportContent
                             image_titles=nil,
                             image_captions=nil)
                             
-    section_roc = @xml_report.add_section(@current_section, section_title)
+    #section_roc = @xml_report.add_section(@current_section, section_title)
+    section_roc = @current_section
     prediction_set = validation_set.collect{ |v| v.get_predictions && v.get_predictions.confidence_values_available? }
         
     if prediction_set.size>0
@@ -213,6 +214,49 @@ class Reports::ReportContent
     
   end
   
+  # Adds a performance-vs-confidence plot for validation_set to the current report section.
+  # section_title is currently unused, as no sub-section is created for the plot.
+  # image_titles / image_captions may be a single value or an array; as only one plot is
+  # drawn, the first entry is used.
+  def add_confidence_plot( validation_set,
+                            split_set_attribute = nil,
+                            section_title="Confidence plots",
+                            section_text=nil,
+                            image_titles=nil,
+                            image_captions=nil)
+
+    section_conf = @current_section
+    prediction_set = validation_set.collect{ |v| v.get_predictions && v.get_predictions.confidence_values_available? }
+
+    if prediction_set.size>0
+      if prediction_set.size!=validation_set.size
+        # section_text defaults to nil, so it cannot be extended with '+=' directly
+        section_text = [section_text, "WARNING: plot information not available for all validation results"].compact.join("\n")
+        LOGGER.error "WARNING: plot information not available for all validation results:\n"+
+          "validation set size: "+validation_set.size.to_s+", prediction set size: "+prediction_set.size.to_s
+      end
+      @xml_report.add_paragraph(section_conf, section_text) if section_text
+
+      # there is no loop index in this method (the former 'image_titles[i]' raised a NameError)
+      image_title = [image_titles].flatten.first || "Percent Correct vs Confidence Plot"
+      image_caption = [image_captions].flatten.first
+      plot_file_name = "conf_plot"+@tmp_file_count.to_s+".png"
+      @tmp_file_count += 1
+      begin
+        plot_file_path = add_tmp_file(plot_file_name)
+        Reports::PlotFactory.create_confidence_plot( plot_file_path, prediction_set, nil, split_set_attribute, false )
+        @xml_report.add_imagefigure(section_conf, image_title, plot_file_name, "PNG", 100, image_caption)
+      rescue Exception => ex
+        msg = "WARNING could not create confidence plot: "+ex.message
+        LOGGER.error(msg)
+        rm_tmp_file(plot_file_name)
+        @xml_report.add_paragraph(section_conf, msg)
+      end
+    else
+      @xml_report.add_paragraph(section_conf, "No prediction-confidence info for confidence plot available.")
+    end
+  end
+  
   def add_ranking_plots( validation_set,
                             compare_attribute,
                             equal_attribute,
diff --git a/report/report_factory.rb b/report/report_factory.rb
index f48d11a..08d9418 100755
--- a/report/report_factory.rb
+++ b/report/report_factory.rb
@@ -68,11 +68,17 @@ module Reports::ReportFactory
     case val.feature_type
     when "classification"
       report.add_result(validation_set, [:validation_uri] + VAL_ATTR_TRAIN_TEST + VAL_ATTR_CLASS, "Results", "Results")
-      report.add_roc_plot(validation_set)
       report.add_confusion_matrix(val)
+      report.add_section("Plots")
+      report.add_roc_plot(validation_set)
+      report.add_confidence_plot(validation_set)
+      report.end_section
     when "regression"
       report.add_result(validation_set, [:validation_uri] + VAL_ATTR_TRAIN_TEST + VAL_ATTR_REGR, "Results", "Results")
+      report.add_section("Plots")
       report.add_regression_plot(validation_set, :model_uri)
+      report.add_confidence_plot(validation_set)
+      report.end_section
     end
     task.progress(90) if task
     
@@ -104,14 +110,22 @@ module Reports::ReportFactory
     case validation_set.unique_feature_type
     when "classification"
       report.add_result(merged, [:crossvalidation_uri]+VAL_ATTR_CV+VAL_ATTR_CLASS-[:crossvalidation_fold],"Mean Results","Mean Results")
-      report.add_roc_plot(validation_set, nil, "ROC Plots over all folds")
-      report.add_roc_plot(validation_set, :crossvalidation_fold)
       report.add_confusion_matrix(merged.validations[0])
+      report.add_section("Plots")
+      report.add_roc_plot(validation_set)
+      report.add_roc_plot(validation_set, :crossvalidation_fold)
+      report.add_confidence_plot(validation_set)
+      report.add_confidence_plot(validation_set, :crossvalidation_fold)
+      report.end_section
       report.add_result(validation_set, VAL_ATTR_CV+VAL_ATTR_CLASS-[:num_folds],
         "Results","Results",nil,"validation")
     when "regression"
       report.add_result(merged, [:crossvalidation_uri]+VAL_ATTR_CV+VAL_ATTR_REGR-[:crossvalidation_fold],"Mean Results","Mean Results")
+      report.add_section("Plots")
       report.add_regression_plot(validation_set, :crossvalidation_fold)
+      report.add_confidence_plot(validation_set)
+      report.add_confidence_plot(validation_set, :crossvalidation_fold)
+      report.end_section
       report.add_result(validation_set, VAL_ATTR_CV+VAL_ATTR_REGR-[:num_folds], "Results","Results")
     end
     task.progress(90) if task
@@ -194,7 +208,25 @@ module Reports::ReportFactory
       end
       
     when "regression"
-      raise OpenTox::BadRequestError.new("algorithm comparison for regression not yet implemented")
+      
+      attributes = VAL_ATTR_CV+VAL_ATTR_REGR-[:crossvalidation_fold]
+      attributes = ([ :dataset_uri ] + attributes).uniq
+      
+      dataset_grouping.each do |validations|
+      
+        set = Reports::ValidationSet.create(validations)
+        
+        dataset = validations[0].dataset_uri
+        merged = set.merge([:algorithm_uri, :dataset_uri, :crossvalidation_id, :crossvalidation_uri])
+        merged.sort(:dataset_uri)
+        
+        report.add_section("Dataset: "+dataset)
+        report.add_result(merged,attributes,
+          "Mean Results","Mean Results",nil,"crossvalidation")
+        report.add_paired_ttest_table(set, :algorithm_uri, :r_square)
+        report.end_section
+      end
+      
     end
     task.progress(100) if task
     report
author	mguetlein <martin.guetlein@gmail.com>	2011-05-16 14:46:50 +0200
committer	mguetlein <martin.guetlein@gmail.com>	2011-05-16 14:46:50 +0200
commit	eb5f8b5da9b247d62abc8a7b9eb2e44fe46a1c79 (patch)
tree	99bbf52ad3b7495114ffe50194b8f9c606f5f248 /report
parent	8afc018a179b254905f93ef8607338a7826baf4e (diff)