diff options
Diffstat (limited to 'lib/predictions.rb')
-rwxr-xr-x | lib/predictions.rb | 64 |
1 files changed, 42 insertions, 22 deletions
diff --git a/lib/predictions.rb b/lib/predictions.rb index 420790e..2409375 100755 --- a/lib/predictions.rb +++ b/lib/predictions.rb @@ -36,7 +36,7 @@ module Lib #puts "actual: "+actual_values.inspect #puts "confidence: "+confidence_values.inspect - raise "unknown feature_type: "+@feature_type.to_s unless + raise "unknown feature_type: '"+@feature_type.to_s+"'" unless @feature_type=="classification" || @feature_type=="regression" raise "no predictions" if @predicted_values.size == 0 num_info = "predicted:"+@predicted_values.size.to_s+ @@ -45,16 +45,6 @@ module Lib raise "illegal num confidence values "+num_info if @confidence_values.size != @predicted_values.size @confidence_values.each{ |c| raise "illegal confidence value: '"+c.to_s+"'" unless c==nil or (c.is_a?(Numeric) and c>=0 and c<=1) } - ## check if there is more than one different conf value - ## DEPRECATED? not sure anymore what this was about, - ## I am pretty sure this was for r-plot of roc curves - ## roc curvers are now plotted manually - #conf_val_tmp = {} - #@confidence_values.each{ |c| conf_val_tmp[c] = nil } - #if conf_val_tmp.keys.size<2 - # LOGGER.warn("prediction w/o confidence values"); - # @confidence_values=nil - #end case @feature_type when "classification" @@ -75,11 +65,13 @@ module Lib init_stats() (0..@predicted_values.size-1).each do |i| - update_stats( @predicted_values[i], @actual_values[i], (@confidence_values!=nil)?@confidence_values[i]:nil ) + update_stats( @predicted_values[i], @actual_values[i], @confidence_values[i] ) end end def init_stats + @conf_provided = false + @num_no_actual_value = 0 @num_with_actual_value = 0 @@ -134,6 +126,8 @@ module Lib else @num_predicted += 1 + @conf_provided |= confidence_value!=nil + case @feature_type when "classification" @confusion_matrix[actual_value][predicted_value] += 1 @@ -170,8 +164,8 @@ module Lib def percent_correct raise "no classification" unless @feature_type=="classification" - return 0 if @num_with_actual_value==0 - return 100 * 
@num_correct / (@num_with_actual_value - @num_unpredicted).to_f + pct = 100 * @num_correct / (@num_with_actual_value - @num_unpredicted).to_f + pct.nan? ? 0 : pct end def percent_incorrect @@ -181,10 +175,12 @@ module Lib end def accuracy - return percent_correct / 100.0 + acc = percent_correct / 100.0 + acc.nan? ? 0 : acc end def weighted_accuracy + return 0 unless confidence_values_available? raise "no classification" unless @feature_type=="classification" total = 0 correct = 0 @@ -250,10 +246,11 @@ module Lib return res end + # only takes the instances that are classified as <class-index> into account def area_under_roc(class_index=nil) return prediction_feature_value_map( lambda{ |i| area_under_roc(i) } ) if class_index==nil - return 0.0 if @confidence_values==nil + return 0 unless confidence_values_available? LOGGER.warn("TODO: implement approx computiation of AUC,"+ "so far Wilcoxon-Man-Whitney is used (exponential)") if @@ -427,8 +424,13 @@ module Lib return incorrect end + # Note: + # * (un-weighted) area under roc is computed with all __predicted__ instances for a certain class + # * weighted weights each auc with the number of __actual__ instances + # it's like that, because it's like that in Weka def weighted_area_under_roc - return weighted_measure( :area_under_roc ) + w_auc = weighted_measure( :area_under_roc ) + w_auc.nan? ? 0 : w_auc end def weighted_f_measure @@ -436,6 +438,7 @@ module Lib end private + # the <measure> is weighted with the number of instances for each actual class value def weighted_measure( measure ) sum_instances = 0 @@ -473,18 +476,35 @@ module Lib end def r_square - return sample_correlation_coefficient ** 2 + #return sample_correlation_coefficient ** 2 + + # see http://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions + # see http://web.maths.unsw.edu.au/~adelle/Garvan/Assays/GoodnessOfFit.html + ss_tot = total_sum_of_squares + return 0 if ss_tot==0 + r_2 = 1 - residual_sum_of_squares / ss_tot + ( r_2.infinite? 
|| r_2.nan? ) ? 0 : r_2 end def sample_correlation_coefficient # formula see http://en.wikipedia.org/wiki/Correlation_and_dependence#Pearson.27s_product-moment_coefficient - return ( @num_predicted * @sum_multiply - @sum_actual * @sum_predicted ) / + scc = ( @num_predicted * @sum_multiply - @sum_actual * @sum_predicted ) / ( Math.sqrt( [0, @num_predicted * @sum_squares_actual - @sum_actual**2].max ) * Math.sqrt( [0, @num_predicted * @sum_squares_predicted - @sum_predicted**2].max ) ) + ( scc.infinite? || scc.nan? ) ? 0 : scc end def total_sum_of_squares - return @variance_actual * ( @num_predicted - 1 ) + #return @variance_actual * ( @num_predicted - 1 ) + sum = 0 + @predicted_values.size.times do |i| + sum += (@actual_values[i]-@actual_mean)**2 if @predicted_values[i]!=nil + end + sum + end + + def residual_sum_of_squares + sum_squared_error end def target_variance_predicted @@ -500,7 +520,7 @@ module Lib def get_prediction_values(class_value) #puts "get_roc_values for class_value: "+class_value.to_s - raise "no confidence values" if @confidence_values==nil + raise "no confidence values" unless confidence_values_available? #raise "no class-value specified" if class_value==nil class_index = @accept_values.index(class_value) if class_value!=nil @@ -571,7 +591,7 @@ module Lib end def confidence_values_available? - return @confidence_values!=nil + @conf_provided end ################################################################################################################### |