cv statistics, ie accept header hack, alg comp report

author: Martin Gütlein <martin.guetlein@gmail.com> 2010-03-02 14:11:57 +0100
committer: Martin Gütlein <martin.guetlein@gmail.com> 2010-03-02 14:11:57 +0100
commit: b7efeaaf79233de8bbc173fa426e4561c458d44f (patch)
tree: 9326f1d711a0536f069266a4613e0e0c65e26b62
parent: e2b814301c323bc787ad9d75eceb786e3cb7dde9 (diff)
16 files changed, 447 insertions, 168 deletions
diff --git a/lib/merge.rb b/lib/merge.rb
new file mode 100644
index 0000000..f35198d
--- /dev/null
+++ b/lib/merge.rb
@@ -0,0 +1,141 @@
+# Global registry mapping an object to its merge count (read by Object#merge_count, written by Object#set_merge_count).
+$merge_count = {}
+
+class Array
+  # Merges all elements, left to right, into one object via merge_object.
+  # Returns nil for an empty array and the sole element for a one-element array.
+  def merge_array( merge_attributes, equal_attributes=nil )
+    # an Array's size is never nil, so emptiness has to be tested explicitly
+    return nil if empty?
+    return self[0] if size == 1
+    self[1..-1].inject(self[0]) do |merged, element|
+      merged.merge_object(element, merge_attributes, equal_attributes)
+    end
+  end
+end
+
+class Object
+  # Number of objects merged into this one, kept in the global $merge_count hash.
+  def merge_count()
+    $merge_count[self] = 1 if $merge_count[self].nil? # an unregistered object counts as 1
+    $merge_count[self]
+  end
+
+  # Stores merge_count as this object's entry in $merge_count.
+  def set_merge_count(merge_count)
+    $merge_count[self] = merge_count
+  end
+
+  # Updates a variance with a recursive formula after the n-th value was added
+  # ( see Tysiak, Folgen: explizit und rekursiv, ISSN: 0025-5866
+  #  http://www.frl.de/tysiakpapers/07_TY_Papers.pdf )
+  def self.compute_variance( old_variance, n, new_mean, old_mean, new_value )
+    scaled_old = n>1 ? old_variance * (n-2)/(n-1) : 0
+    mean_shift = (new_mean - old_mean)**2
+    spread = n>1 ? (new_value - new_mean)**2/(n-1) : 0
+    scaled_old + mean_shift + spread
+  end
+
+  # Merges value2 into value1, where value1 is weighted with weight1.
+  # Numerics give the weighted mean; Arrays and Hashes are merged element-wise (Hashes over
+  # the keys of value1); other values are compared via to_s and joined with "/" if different.
+  # Returns {:value=>.., :variance=>..}; :variance is nil unless compute_variance is set.
+  def self.merge_value( value1, weight1, compute_variance, variance1, value2 )
+    if value1.is_a?(Numeric) && value2.is_a?(Numeric)
+      value = (value1 * weight1 + value2) / (weight1 + 1).to_f
+      # called with arguments, compute_variance(...) is the class method, not the flag
+      variance = compute_variance( variance1.nil? ? 0 : variance1, weight1+1, value, value1, value2 ) if compute_variance
+    elsif value1.is_a?(Array) && value2.is_a?(Array)
+      raise "cannot merge arrays with unequal sizes" if value1.size!=value2.size
+      value = []
+      variance = []
+      value1.each_index do |i|
+        m = merge_value( value1[i], weight1, compute_variance, variance1.nil? ? nil : variance1[i], value2[i] )
+        value[i] = m[:value]
+        variance[i] = m[:variance] if compute_variance
+      end
+    elsif value1.is_a?(Hash) && value2.is_a?(Hash)
+      value = {}
+      variance = {}
+      value1.each_key do |k|
+        m = merge_value( value1[k], weight1, compute_variance, variance1.nil? ? nil : variance1[k], value2[k] )
+        value[k] = m[:value]
+        variance[k] = m[:variance] if compute_variance
+      end
+    else
+      value = value1.to_s != value2.to_s ? value1.to_s + "/" + value2.to_s : value2.to_s
+    end
+    {:value => value, :variance => (compute_variance ? variance : nil) }
+  end
+
+  # Returns a new object of the same class that combines self (weighted by its merge_count)
+  # with object. Attributes named *_variance are skipped, attributes listed in
+  # equal_attributes are copied from self, and an attribute with a <name>_variance accessor
+  # also gets its variance set. Raises if the classes differ or object has merge_count > 1.
+  def merge_object( object, merge_attributes, equal_attributes=nil )
+    raise "classes not equal" if object.class != self.class
+    raise "not supported, successivly add unmerged object to a merge object" if object.merge_count>1
+    new_object = self.class.new
+    merge_attributes.each do |variable|
+      next if variable.to_s =~ /_variance$/
+      setter = "#{variable.to_s}=".to_sym
+      if equal_attributes && !equal_attributes.index(variable).nil?
+        new_object.send(setter, send(variable))
+        next
+      end
+      variance_name = variable.to_s+"_variance"
+      compute_variance = self.respond_to?( variance_name.to_sym )
+      old_variance = compute_variance ? send(variance_name.to_sym) : nil
+      m = Object::merge_value( send(variable), self.merge_count, compute_variance, old_variance, object.send(variable) )
+      new_object.send(setter, m[:value])
+      new_object.send((variance_name+"=").to_sym, m[:variance]) if compute_variance
+    end
+    new_object.set_merge_count self.merge_count+1
+    new_object
+  end
+end
+
+class MergeTest
+  # Demo class for the merge helpers.
+  # Only :float has a matching *_variance accessor, so only it gets a variance when merged.
+  attr_accessor :string, :integer, :float, :hash_value, :float_variance 
+
+  # Renders every attribute as "name:value" (values via inspect), separated by blanks;
+  # "+-<variance>" is appended where a <name>_variance accessor exists.
+  def to_s
+    parts = [:string, :integer, :float, :hash_value].map do |var|
+      variance_name = var.to_s+"_variance"
+      suffix = respond_to?(variance_name.to_sym) ? "+-"+send(variance_name).inspect : ""
+      var.to_s+":"+send(var).inspect+suffix
+    end
+    parts.join(" ")
+  end
+
+  # Merges three hand-made samples with Array#merge_array and prints the result.
+  # The :mixed_key entries deliberately mix numbers and strings.
+  def self.demo
+    # one row per sample: string, integer, float, hash_value
+    rows = [
+      ["asdf", 39, 78.6, {:mixed_key=>80, :string_key=>"tu", :int_key=>70}],
+      ["jkl", 25, 35.6, {:mixed_key=>"bla", :string_key=>"iu", :int_key=>34}],
+      ["qwert", 100, 100, {:mixed_key=>45, :string_key=>"op", :int_key=>20}]
+    ]
+
+    to_merge = rows.map do |string, integer, float, hash_value|
+      sample = MergeTest.new
+      sample.string = string
+      sample.integer = integer
+      sample.float = float
+      sample.hash_value = hash_value
+      sample
+    end
+
+    # merge all four attributes of the samples into a single MergeTest
+    puts "merged: "+to_merge.merge_array([:string, :integer, :float, :hash_value]).to_s    
+  end
+
+end
+
+#MergeTest.demo
+
+
diff --git a/lib/ot_predictions.rb b/lib/ot_predictions.rb
index ee67c2c..3e11f2a 100644
--- a/lib/ot_predictions.rb
+++ b/lib/ot_predictions.rb
@@ -86,7 +86,7 @@ module Lib
     
       res = {}
       if @is_classification
-        (Lib::VAL_CLASS_PROPS).each{ |s| res[s] = send(s)}  
+        (Lib::VAL_CLASS_PROPS_EXTENDED).each{ |s| res[s] = send(s)}  
       else
         (Lib::VAL_REGR_PROPS).each{ |s| res[s] = send(s) }  
       end
diff --git a/lib/predictions.rb b/lib/predictions.rb
index f673f8c..259a990 100644
--- a/lib/predictions.rb
+++ b/lib/predictions.rb
@@ -35,6 +35,12 @@ module Lib
       raise "illegal num confidence values "+num_info if  @confidence_values.size != @predicted_values.size
       
       @confidence_values.each{ |c| raise "illegal confidence value: '"+c.to_s+"'" unless c==nil or (c.is_a?(Numeric) and c>=0 and c<=1) }
+      conf_val_tmp = {}
+      @confidence_values.each{ |c| conf_val_tmp[c] = nil }
+      if conf_val_tmp.keys.size<2
+        LOGGER.warn("prediction w/o confidence values");
+        @confidence_values=nil
+      end
       
       if @is_classification
         raise "prediction_feature_values missing while performing classification" unless @prediction_feature_values
@@ -54,13 +60,13 @@ module Lib
       
       init_stats()
       (0..@predicted_values.size-1).each do |i|
-        update_stats( @predicted_values[i], @actual_values[i], @confidence_values[i] )
+        update_stats( @predicted_values[i], @actual_values[i], (@confidence_values!=nil)?@confidence_values[i]:nil )
       end
     end
     
     def init_stats
       @num_no_actual_value = 0
-      @num_with_actual_value = 0
+      @num_with_actual_value = 0 
       
       @num_predicted = 0
       @num_unpredicted = 0
@@ -137,6 +143,10 @@ module Lib
       return 100 * @num_incorrect / @num_with_actual_value.to_f
     end
     
+    def accuracy # percent_correct rescaled by a factor of 1/100
+      percent_correct / 100.0
+    end
+    
     def percent_unpredicted
       return 0 if @num_with_actual_value==0
       return 100 * @num_unpredicted / @num_with_actual_value.to_f
@@ -188,6 +198,7 @@ module Lib
     
     def area_under_roc(class_index=nil)
       return prediction_feature_value_map( lambda{ |i| area_under_roc(i) } ) if class_index==nil
+      return 0.0 if @confidence_values==nil
       
       LOGGER.warn("TODO: implement approx computiation of AUC,"+
         "so far Wilcoxon-Man-Whitney is used (exponential)") if @predicted_values.size>1000
@@ -212,6 +223,7 @@ module Lib
           sum += 1 if tp>fp
         end
       end
+      
       return sum / (tp_conf.size * fp_conf.size).to_f
     end
     
@@ -378,7 +390,7 @@ module Lib
     # data for roc-plots ###################################################################################
     
     def get_roc_values(class_value)
-      
+      raise "no confidence values" if @confidence_values==nil
       class_index = @prediction_feature_values.index(class_value)
       raise "class not found "+class_value.to_s if class_index==nil and class_value!=nil
       
@@ -430,6 +442,10 @@ module Lib
       @is_classification
     end
     
+    def confidence_values_available? # true unless @confidence_values is nil
+      !@confidence_values.nil?
+    end
+    
     ###################################################################################################################
     
     private
diff --git a/lib/rdf_provider.rb b/lib/rdf_provider.rb
index fab0eaf..1715566 100644
--- a/lib/rdf_provider.rb
+++ b/lib/rdf_provider.rb
@@ -78,7 +78,7 @@ module Lib
           set_literal( k, v, node)
         elsif @rdf_provider.object_property?(k)
           add_object_property( k, v, node)
-        elsif [ :uri, :id, :finished ].index(k)!=nil
+        elsif [ :uri, :id ].index(k)!=nil
           #skip
         else
           raise "illegal value k:"+k.to_s+" v:"+v.to_s
diff --git a/lib/validation_db.rb b/lib/validation_db.rb
index ca4a7e0..01607ce 100644
--- a/lib/validation_db.rb
+++ b/lib/validation_db.rb
@@ -6,7 +6,7 @@ end
 module Lib
 
   VAL_PROPS = [ :id, :uri, :model_uri, :training_dataset_uri, :prediction_feature,
-                :test_dataset_uri, :prediction_dataset_uri, :finished, 
+                :test_dataset_uri, :prediction_dataset_uri,  
                 :created_at, :real_runtime, # :cpu_runtime, 
                 :num_instances, :num_without_class, :percent_without_class, :num_unpredicted, :percent_unpredicted ] 
   
@@ -21,13 +21,14 @@ module Lib
                                 :num_true_positives, :num_true_negatives, :precision, 
                                 :recall, :true_negative_rate, :true_positive_rate ]
   VAL_CLASS_PROPS = VAL_CLASS_PROPS_SINGLE + VAL_CLASS_PROPS_PER_CLASS + [ :confusion_matrix ]
+  VAL_CLASS_PROPS_EXTENDED = VAL_CLASS_PROPS + [:accuracy]
 
   # :regression_statistics
   VAL_REGR_PROPS = [ :root_mean_squared_error, :mean_absolute_error, :r_square ]
   
   CROSS_VAL_PROPS = [:algorithm_uri, :dataset_uri, :num_folds, :stratified, :random_seed]
   
-  ALL_PROPS = VAL_PROPS + VAL_CV_PROPS + VAL_CLASS_PROPS + VAL_REGR_PROPS + CROSS_VAL_PROPS
+  ALL_PROPS = VAL_PROPS + VAL_CV_PROPS + VAL_CLASS_PROPS_EXTENDED + VAL_REGR_PROPS + CROSS_VAL_PROPS
 
   class Validation
     include DataMapper::Resource 
@@ -39,7 +40,6 @@ module Lib
     property :test_dataset_uri, String, :length => 255
     property :prediction_dataset_uri, String, :length => 255
     property :prediction_feature, String, :length => 255
-    property :finished, Boolean, :default => false
     property :created_at, DateTime
     property :real_runtime, Float
     
@@ -65,7 +65,6 @@ module Lib
     property :num_folds, Integer, :default => 10
     property :stratified, Boolean, :default => false
     property :random_seed, Integer, :default => 1
-    property :finished, Boolean, :default => false
   end
 end
 
diff --git a/report/external/mimeparse.rb b/report/external/mimeparse.rb
index f572c64..553c431 100644
--- a/report/external/mimeparse.rb
+++ b/report/external/mimeparse.rb
@@ -214,3 +214,7 @@ if __FILE__ == $0
     end
   end
 end
+
+
+#puts MIMEParse::best_match(["text/xml","text/html","application/pdf"],
+#  'application/x-ms-application,image/jpeg, application/xaml+xml, image/gif, image/pjpeg, application/x-ms-xbap, */*')
diff --git a/report/plot_factory.rb b/report/plot_factory.rb
index d2884e3..c1a731f 100644
--- a/report/plot_factory.rb
+++ b/report/plot_factory.rb
@@ -34,8 +34,8 @@ module Reports
         fp_rates = []
         tp_rates = []
         attribute_values.each do |value|
-          names << value
           data = transform_predictions(validation_set.filter({split_set_attribute => value}), class_value, false)
+          names << value
           fp_rates << data[:fp_rate][0]
           tp_rates << data[:tp_rate][0]
         end
@@ -53,14 +53,21 @@ module Reports
       data = []
       validation_set.validations.each do |v|
         values = []
-        value_attributes.collect do |a|
+        value_attributes.each do |a|
           value = v.send(a)
           if value.is_a?(Hash)
-            raise "bar plot value is hash, but no entry for class-value ("+class_value.to_s+")" unless value.key?(class_value)
-            value = value[class_value]
+            if class_value==nil
+              avg_value = 0
+              value.values.each{ |val| avg_value+=val }
+              value = avg_value/value.values.size.to_f
+            else
+              raise "bar plot value is hash, but no entry for class-value ("+class_value.to_s+"); value for "+a.to_s+" -> "+value.inspect unless value.key?(class_value)
+              value = value[class_value]
+            end
           end
           values.push(value)
         end
+        
         data << [v.send(title_attribute).to_s] + values
       end
       
@@ -74,10 +81,10 @@ module Reports
     end
     
     
-    def self.create_ranking_plot( svg_out_file, validation_set, compare_attribute, equal_attribute, rank_attribute )
+    def self.create_ranking_plot( svg_out_file, validation_set, compare_attribute, equal_attribute, rank_attribute, class_value=nil )
 
       #compute ranks
-      rank_set = validation_set.compute_ranking([equal_attribute],rank_attribute)
+      rank_set = validation_set.compute_ranking([equal_attribute],rank_attribute,class_value)
       #puts rank_set.to_array([:algorithm_uri, :dataset_uri, :acc, :acc_ranking]).collect{|a| a.inspect}.join("\n")
 
       #compute avg ranks
@@ -85,7 +92,7 @@ module Reports
       #puts merge_set.to_array([:algorithm_uri, :dataset_uri, :acc, :acc_ranking]).collect{|a| a.inspect}.join("\n")
       
       comparables = merge_set.get_values(compare_attribute)
-      ranks = merge_set.get_values((rank_attribute.to_s+"_ranking").to_sym)
+      ranks = merge_set.get_values((rank_attribute.to_s+"_ranking").to_sym,false)
       
       plot_ranking( rank_attribute.to_s+" ranking",
                     comparables, 
diff --git a/report/prediction_util.rb b/report/prediction_util.rb
index f35d73a..fbe7531 100644
--- a/report/prediction_util.rb
+++ b/report/prediction_util.rb
@@ -14,6 +14,7 @@ module Reports::PredictionUtil
   
       res = []
       
+      
       validation_set.validations.each do |v|
         (0..v.get_predictions.num_instances-1).each do |i|
           a = []
@@ -22,15 +23,15 @@ module Reports::PredictionUtil
           a.push(v.get_predictions.actual_value(i).to_nice_s) 
           a.push(v.get_predictions.predicted_value(i).to_nice_s)
           a.push(v.get_predictions.classification_miss?(i)?"X":"") if v.get_predictions.classification?
-          a.push(v.get_predictions.confidence_value(i).to_nice_s)
+          a.push(v.get_predictions.confidence_value(i).to_nice_s) if v.get_predictions.confidence_values_available?
           res.push(a)
         end
       end
         
       #res = res.sort{|x,y| y[3] <=> x[3] }
       header = [ "compound", "actual value", "predicted value"]
-      header.push "missclassified" if validation_set.validations[0].get_predictions.classification?
-      header.push "confidence value"
+      header.push "missclassified" if validation_set.first.get_predictions.classification?
+      header.push "confidence value" if validation_set.first.get_predictions.confidence_values_available?
       res.insert(0, validation_attributes + header)
       #puts res.collect{|c| c.inspect}.join("\n")
       
diff --git a/report/report_application.rb b/report/report_application.rb
index c68df11..4346f7f 100644
--- a/report/report_application.rb
+++ b/report/report_application.rb
@@ -31,9 +31,16 @@ end
 
 get '/report/:type/:id' do
   perform do |rs| 
+    
+    accept_header = request.env['HTTP_ACCEPT']
+    if request.env['HTTP_USER_AGENT'] =~ /MSIE/
+      LOGGER.info "Changing MSIE accept-header to text/html"
+      accept_header = "text/html"
+    end
     #request.env['HTTP_ACCEPT'] = "application/pdf"
-    content_type Reports::ReportFormat.get_format(request.env['HTTP_ACCEPT'])
-    result = body(File.new( rs.get_report(params[:type],params[:id],request.env['HTTP_ACCEPT']) ))
+    
+    content_type Reports::ReportFormat.get_format(accept_header)
+    result = body(File.new( rs.get_report(params[:type],params[:id],accept_header) ))
   end
 end
 
diff --git a/report/report_factory.rb b/report/report_factory.rb
index e577d70..a522901 100644
--- a/report/report_factory.rb
+++ b/report/report_factory.rb
@@ -5,6 +5,7 @@ VAL_ATTR_TRAIN_TEST = [ :model_uri, :training_dataset_uri, :test_dataset_uri, :p
 VAL_ATTR_CV = [ :algorithm_uri, :dataset_uri, :num_folds, :crossvalidation_fold ]
 # selected attributes of interest when performing classification
 VAL_ATTR_CLASS = [ :area_under_roc, :percent_correct, :true_positive_rate, :true_negative_rate ]
+VAL_ATTR_BAR_PLOT_CLASS = [ :area_under_roc, :accuracy, :true_positive_rate, :true_negative_rate ]
 VAL_ATTR_REGR = [ :root_mean_squared_error, :mean_absolute_error, :r_square ]
 
 
@@ -81,7 +82,7 @@ module Reports::ReportFactory
     #puts merged.get_values(:percent_correct_variance, false).inspect
     report = Reports::ReportContent.new("Crossvalidation report")
     
-    if (validation_set.validations[0].percent_correct!=nil) #classification
+    if (validation_set.first.classification?)
       report.add_section_result(merged, VAL_ATTR_CV+VAL_ATTR_CLASS-[:crossvalidation_fold],"Mean Results","Mean Results")
       
       report.add_section_roc_plot(validation_set, nil, nil, "roc-plot.svg")
@@ -106,13 +107,52 @@ module Reports::ReportFactory
     raise Reports::BadRequest.new("num validations is not >1") unless validation_set.size>1
     raise Reports::BadRequest.new("validations must be either all regression, "+
       +"or all classification validations") unless validation_set.all_classification? or validation_set.all_regression?
+    raise Reports::BadRequest.new("number of different algorithms <2") if validation_set.num_different_values(:algorithm_uri)<2
       
     if validation_set.has_nil_values?(:crossvalidation_id)
-      raise Reports::BadRequest.new("so far, algorithm comparison is only supported for crossvalidation results")
+      if validation_set.num_different_values(:test_dataset_uri)>1
+        
+        # groups results into sets with equal test and training dataset
+        dataset_grouping = Reports::Util.group(validation_set.validations, [:test_dataset_uri, :training_dataset_uri])
+        # check if the same algorithms exist for each test and training dataset
+        Reports::Util.check_group_matching(dataset_grouping, [:algorithm_uri])
+        
+        #merged = validation_set.merge([:algorithm_uri, :dataset_uri])
+        report = Reports::ReportContent.new("Algorithm comparison report Many datasets")
+        
+        if (validation_set.first.classification?)
+          report.add_section_result(validation_set,[:algorithm_uri, :test_dataset_uri]+VAL_ATTR_CLASS,"Mean Results","Mean Results")
+          report.add_section_ranking_plots(validation_set, :algorithm_uri, :test_dataset_uri,
+            [:accuracy, :true_positive_rate, :true_negative_rate], "true")
+        else # regression
+          raise Reports::BadRequest.new("not implemented yet for regression")
+        end
+        return report
+      else
+        # this groups all validations into x different groups (arrays) according to their algorithm-uri
+        algorithm_grouping = Reports::Util.group(validation_set.validations, [:algorithm_uri])
+        # we check if there are corresponding validations in each group that have equal attributes (folds, num-folds,..)
+        Reports::Util.check_group_matching(algorithm_grouping, [:training_dataset_uri, :test_dataset_uri, :prediction_feature])
+        
+        report = Reports::ReportContent.new("Algorithm comparison report")
+        
+        if (validation_set.first.classification?)
+          report.add_section_bar_plot(validation_set,nil,:algorithm_uri,VAL_ATTR_BAR_PLOT_CLASS, "bar-plot.svg")   
+          report.add_section_roc_plot(validation_set,nil, :algorithm_uri, "roc-plot.svg")
+          #validation_set.validations[0].get_prediction_feature_values.each do |class_value|
+            #report.add_section_bar_plot(validation_set,class_value,:algorithm_uri,VAL_ATTR_CLASS, "bar-plot-"+class_value+".svg")   
+            #report.add_section_roc_plot(validation_set, class_value, :algorithm_uri, "roc-plot-"+class_value+".svg")
+          #end
+          report.add_section_result(validation_set,[:algorithm_uri]+VAL_ATTR_CLASS,"Results","Results")
+        else #regression
+          #report.add_section_result(merged, VAL_ATTR_CV+VAL_ATTR_REGR-[:crossvalidation_fold],"Mean Results","Mean Results")
+          #report.add_section_result(validation_set, VAL_ATTR_CV+VAL_ATTR_REGR-[:num_folds], "Results","Results")
+        end
+        return report
+      end
     else
       raise Reports::BadRequest.new("num different cross-validation-ids <2") if validation_set.num_different_values(:crossvalidation_id)<2
       validation_set.load_cv_attributes
-      raise Reports::BadRequest.new("number of different algorithms <2") if validation_set.num_different_values(:algorithm_uri)<2
       
       if validation_set.num_different_values(:dataset_uri)>1
         # groups results into sets with equal dataset 
@@ -128,9 +168,9 @@ module Reports::ReportFactory
         merged = validation_set.merge([:algorithm_uri, :dataset_uri])
         report = Reports::ReportContent.new("Algorithm comparison report - Many datasets")
         
-        if (validation_set.validations[0].percent_correct!=nil) #classification
+        if (validation_set.first.classification?)
           report.add_section_result(merged,VAL_ATTR_CV+VAL_ATTR_CLASS-[:crossvalidation_fold],"Mean Results","Mean Results")
-          report.add_section_ranking_plots(merged, :algorithm_uri, :dataset_uri, [:acc, :auc, :sens, :spec])
+          report.add_section_ranking_plots(merged, :algorithm_uri, :dataset_uri, [:acc, :auc, :sens, :spec], "true")
         else # regression
           report.add_section_result(merged,VAL_ATTR_CV+VAL_ATTR_REGR-[:crossvalidation_fold],"Mean Results","Mean Results")
         end
@@ -145,7 +185,7 @@ module Reports::ReportFactory
         
         report = Reports::ReportContent.new("Algorithm comparison report")
         
-        if (validation_set.validations[0].percent_correct!=nil) #classification
+        if (validation_set.first.classification?)
           validation_set.validations[0].get_prediction_feature_values.each do |class_value|
             report.add_section_bar_plot(merged,class_value,:algorithm_uri,VAL_ATTR_CLASS, "bar-plot-"+class_value+".svg")   
             report.add_section_roc_plot(validation_set, class_value, :algorithm_uri, "roc-plot-"+class_value+".svg")
@@ -204,7 +244,7 @@ class Reports::ReportContent
     vals = vals.collect{|a| a.collect{|v| v.to_s[0,66] }}
     #PENDING transpose values if there more than 4 columns, and there are more than columns than rows
     transpose = vals[0].size>4 && vals[0].size>vals.size
-    @xml_report.add_table(section_table, table_title, vals, !transpose, transpose)   
+    @xml_report.add_table(section_table, table_title, vals, !transpose, transpose)
   end
   
   def add_section_confusion_matrix(  validation, 
@@ -235,12 +275,16 @@ class Reports::ReportContent
     end
     
     section_roc = @xml_report.add_section(@xml_report.get_root_element, section_title)
-    if validation_set.first.get_predictions
+    
+    prediction_set = validation_set.collect{ |v| v.get_predictions && v.get_predictions.confidence_values_available? }
+        
+    if prediction_set.size>0
+      
+      section_text += "\nWARNING: roc plot information not available for all validation results" if prediction_set.size!=validation_set.size
       @xml_report.add_paragraph(section_roc, section_text) if section_text
-
       begin
         plot_file_path = add_tmp_file(plot_file_name)
-        Reports::PlotFactory.create_roc_plot( plot_file_path, validation_set, class_value, split_set_attribute, validation_set.size>1 )
+        Reports::PlotFactory.create_roc_plot( plot_file_path, prediction_set, class_value, split_set_attribute, prediction_set.size>1 )
         @xml_report.add_imagefigure(section_roc, image_title, plot_file_name, "SVG", image_caption)
       rescue RuntimeError => ex
         LOGGER.error("could not create roc plot: "+ex.message)
@@ -248,7 +292,7 @@ class Reports::ReportContent
         @xml_report.add_paragraph(section_roc, "could not create roc plot: "+ex.message)
       end  
     else
-      @xml_report.add_paragraph(section_roc, "No prediction info for roc plot available.")
+      @xml_report.add_paragraph(section_roc, "No prediction-confidence info for roc plot available.")
     end
     
   end
@@ -257,13 +301,14 @@ class Reports::ReportContent
                             compare_attribute,
                             equal_attribute,
                             rank_attributes,
+                            class_value,
                             section_title="Ranking Plots",
                             section_text="This section contains the ranking plots.")
     
     section_rank = @xml_report.add_section(@xml_report.get_root_element, section_title)
     @xml_report.add_paragraph(section_rank, section_text) if section_text
 
-    rank_attributes.each{|a| add_ranking_plot(section_rank, validation_set, compare_attribute, equal_attribute, a, a.to_s+"-ranking.svg")}
+    rank_attributes.each{|a| add_ranking_plot(section_rank, validation_set, compare_attribute, equal_attribute, a, class_value, a.to_s+"-ranking.svg")}
   end
   
   def add_ranking_plot( report_section, 
@@ -271,12 +316,13 @@ class Reports::ReportContent
                         compare_attribute,
                         equal_attribute,
                         rank_attribute,
+                        class_value=nil,
                         plot_file_name="ranking.svg", 
                         image_title="Ranking Plot",
                         image_caption=nil)
     
     plot_file_path = add_tmp_file(plot_file_name)
-    Reports::PlotFactory::create_ranking_plot(plot_file_path, validation_set, compare_attribute, equal_attribute, rank_attribute)
+    Reports::PlotFactory::create_ranking_plot(plot_file_path, validation_set, compare_attribute, equal_attribute, rank_attribute, class_value)
     @xml_report.add_imagefigure(report_section, image_title, plot_file_name, "SVG", image_caption)
     
   end
@@ -287,11 +333,16 @@ class Reports::ReportContent
                             value_attributes,
                             plot_file_name="bar-plot.svg", 
                             section_title="Bar Plot",
-                            section_text="This section contains the bar plot.",
+                            section_text=nil,
                             image_title=nil,
                             image_caption=nil)
-    image_title = "Bar Plot for class-value '"+class_value+"'" unless image_title
-  
+    if class_value
+      section_text = "This section contains the bar plot for class '"+class_value+"'." unless section_text
+      image_title = "Bar Plot for class-value '"+class_value+"'" unless image_title
+    else
+      section_text = "This section contains the bar plot." unless section_text
+      image_title = "Bar Plot for all classes" unless image_title
+    end                            
     section_bar = @xml_report.add_section(@xml_report.get_root_element, section_title)
     @xml_report.add_paragraph(section_bar, section_text) if section_text
     
diff --git a/report/report_test.rb b/report/report_test.rb
index 3e0d093..ed6b377 100644
--- a/report/report_test.rb
+++ b/report/report_test.rb
@@ -8,12 +8,19 @@ require 'rack/test'
 require "lib/test_util.rb"
 
 
-#class Reports::ApplicationTest < Test::Unit::TestCase
-#  include Rack::Test::Methods
-#
-#  def app
-#    Sinatra::Application
-#  end
+class Reports::ApplicationTest < Test::Unit::TestCase
+  include Rack::Test::Methods
+
+  def app
+    Sinatra::Application
+  end
+  
+  def test_nothing
+    
+    get '/report/validation/1'     
+    
+    puts last_response.body
+  end
 #
 #  def test_webservice
 #    
@@ -68,122 +75,122 @@ require "lib/test_util.rb"
 #    end
 #  end
 #  
-#end
-
-
-
-class Reports::ReportServiceTest < Test::Unit::TestCase
-  include Lib::TestUtil
+end
 
-  WS_VAL = @@config[:services]["opentox-validation"]
-  WS_DATA=@@config[:services]["opentox-dataset"]
-  FILE=File.new("data/hamster_carcinogenicity.owl","r")
-  
-  WS_CLASS_ALG=File.join(@@config[:services]["opentox-algorithm"],"lazar")
-  WS_FEATURE_ALG=File.join(@@config[:services]["opentox-algorithm"],"fminer")
-  
-  #WS_CLASS_ALG_2="localhost:4008/algorithm"
-  #WS_FEATURE_ALG_2=nil
 
-  def test_service_ot_webservice
 
-    begin
-      
-      rep = Reports::ReportService.new("http://some.location")
-      types = rep.get_report_types
-      assert types.is_a?(String)
-      assert types.split("\n").size == Reports::ReportFactory::REPORT_TYPES.size
-      #Reports::ReportFactory::REPORT_TYPES.each{|t| rep.get_all_reports(t)}
-      #assert_raise(Reports::NotFound){rep.get_all_reports("osterhase")}
-      
-      ### using ot_mock_layer (reporting component does not rely on ot validation webservice)
-      
-      #ENV['REPORT_VALIDATION_ACCESS'] = "mock_layer"
-      #Reports::Validation.reset_validation_access
-      
-#      create_report(rep, "validation_uri_1", "validation")
-#      assert_raise(Reports::BadRequest){create_report(rep, ["validation_uri_1","validation_uri_2"], "validation")}
+#class Reports::ReportServiceTest < Test::Unit::TestCase
+#  include Lib::TestUtil
+#
+#  WS_VAL = @@config[:services]["opentox-validation"]
+#  WS_DATA=@@config[:services]["opentox-dataset"]
+#  FILE=File.new("data/hamster_carcinogenicity.owl","r")
+#  
+#  WS_CLASS_ALG=File.join(@@config[:services]["opentox-algorithm"],"lazar")
+#  WS_FEATURE_ALG=File.join(@@config[:services]["opentox-algorithm"],"fminer")
+#  
+#  #WS_CLASS_ALG_2="localhost:4008/algorithm"
+#  #WS_FEATURE_ALG_2=nil
+#
+#  def test_service_ot_webservice
+#
+#    begin
 #      
-#      create_report(rep, "crossvalidation_uri_1", "crossvalidation")
-#      create_report(rep, ["validation_uri_1"]*Reports::OTMockLayer::NUM_FOLDS, "crossvalidation")
-#      assert_raise(Reports::BadRequest){create_report(rep, ["validation_uri_1"]*(Reports::OTMockLayer::NUM_FOLDS-1), "crossvalidation")}
+#      rep = Reports::ReportService.new("http://some.location")
+#      types = rep.get_report_types
+#      assert types.is_a?(String)
+#      assert types.split("\n").size == Reports::ReportFactory::REPORT_TYPES.size
+#      #Reports::ReportFactory::REPORT_TYPES.each{|t| rep.get_all_reports(t)}
+#      #assert_raise(Reports::NotFound){rep.get_all_reports("osterhase")}
 #      
-#      create_report(rep, ["crossvalidation_uri_1"]* (Reports::OTMockLayer::NUM_DATASETS * Reports::OTMockLayer::NUM_ALGS), "algorithm_comparison")
-#      create_report(rep, ["validation_uri_1"]* (Reports::OTMockLayer::NUM_DATASETS * Reports::OTMockLayer::NUM_ALGS * Reports::OTMockLayer::NUM_FOLDS), "algorithm_comparison")
-
-      ### using ot webservices (instead of mock layer)
-
-      #ENV['REPORT_VALIDATION_ACCESS'] = nil
-      #Reports::Validation.reset_validation_access
-      
-      #data_uri = upload_data WS_DATA,  FILE
-      #data_uri= File.join(WS_DATA,"1")
-      
-#      #val_uri = create_single_validation(data_uri)
-#      #val_uri = create_single_validation(data_uri, WS_CLASS_ALG_2, WS_FEATURE_ALG_2)
-#      val_uri = File.join(WS_VAL,"15")
-##      #add_resource val_uri
-#      create_report(rep, val_uri, "validation")
-        
-       #val_uri = create_cross_validation(data_uri, WS_CLASS_ALG_2, WS_FEATURE_ALG_2)
-       #val_uri = create_cross_validation(data_uri)
-       val_uri = File.join(WS_VAL,"crossvalidation/1")
-       #val_uri2 = "http://localhost:4007/crossvalidation/14"
-#       # add_resource val_uri
-       create_report(rep, val_uri, "crossvalidation")
-        
-#         #val_uri2 = create_cross_validation(data_uri, WS_CLASS_ALG_2, WS_FEATURE_ALG_2)
-#         #val_uri = ["http://localhost:4007/crossvalidation/6", "http://localhost:4007/crossvalidation/8"]
-         #val_uri = ["http://localhost:4007/crossvalidation/7", "http://localhost:4007/crossvalidation/8"]
-#         #add_resource val_uri
-         #create_report(rep, val_uri, "algorithm_comparison")
-      
-    ensure
-     # delete_resources
-    end
-  end
-  
-  private
-  def create_single_validation(data_uri, ws_class_alg=WS_CLASS_ALG, ws_feat_alg=WS_FEATURE_ALG)
-    puts "validating"
-    val_params = { 
-        :dataset_uri => data_uri, 
-        :algorithm_uri => ws_class_alg, 
-        :split_ratio=>0.7,
-        :prediction_feature => "classification",}
-    val_params[:feature_generation_uri] = ws_feat_alg if ws_feat_alg
-    begin
-      RestClient.post WS_VAL+"/validation/training_test_split", val_params
-    rescue => ex
-      raise "error validating "+WS_VAL+"/validation/training_test_split\n "+val_params.inspect+" \n -> "+ex.message
-    end
-  end
-  
-  def create_cross_validation(data_uri, ws_class_alg=WS_CLASS_ALG, ws_feat_alg=WS_FEATURE_ALG)
-    puts "cross-validating"
-    ext("curl -X POST -d num_folds=3 -d dataset_uri="+data_uri+" -d algorithm_uri="+ws_class_alg+" -d prediction_feature=classification"+
-        (ws_feat_alg ? " -d feature_generation_uri="+ws_feat_alg : "")+
-        " "+WS_VAL+"/crossvalidation",nil)
-  end
-  
-  def create_report(report_service, val_uri, type)
-    
-    Reports.reset_ot_access if ENV['USE_OT_MOCK_LAYER']
-    report_uri = report_service.create_report(type, val_uri)
-    assert type == report_service.parse_type(report_uri)
-    id = report_service.parse_id(report_uri)
-    
-    #puts "created report with id "+id.to_s
-    
-    #assert_raise(Reports::BadRequest){report_service.get_report(type, id, "weihnachtsmann")}
-    
-    report_service.get_report(type, id, "text/html")
-    #report_service.get_report(type, id, "application/pdf")
-    #assert_raise(Reports::NotFound){report_service.delete_report(type, 877658)}
-
-#      rep.delete_report(type, id)
-  end
-end
+#      ### using ot_mock_layer (reporting component does not rely on ot validation webservice)
+#      
+#      #ENV['REPORT_VALIDATION_ACCESS'] = "mock_layer"
+#      #Reports::Validation.reset_validation_access
+#      
+##      create_report(rep, "validation_uri_1", "validation")
+##      assert_raise(Reports::BadRequest){create_report(rep, ["validation_uri_1","validation_uri_2"], "validation")}
+##      
+##      create_report(rep, "crossvalidation_uri_1", "crossvalidation")
+##      create_report(rep, ["validation_uri_1"]*Reports::OTMockLayer::NUM_FOLDS, "crossvalidation")
+##      assert_raise(Reports::BadRequest){create_report(rep, ["validation_uri_1"]*(Reports::OTMockLayer::NUM_FOLDS-1), "crossvalidation")}
+##      
+##      create_report(rep, ["crossvalidation_uri_1"]* (Reports::OTMockLayer::NUM_DATASETS * Reports::OTMockLayer::NUM_ALGS), "algorithm_comparison")
+##      create_report(rep, ["validation_uri_1"]* (Reports::OTMockLayer::NUM_DATASETS * Reports::OTMockLayer::NUM_ALGS * Reports::OTMockLayer::NUM_FOLDS), "algorithm_comparison")
+#
+#      ### using ot webservices (instead of mock layer)
+#
+#      #ENV['REPORT_VALIDATION_ACCESS'] = nil
+#      #Reports::Validation.reset_validation_access
+#      
+#      #data_uri = upload_data WS_DATA,  FILE
+#      #data_uri= File.join(WS_DATA,"1")
+#      
+##      #val_uri = create_single_validation(data_uri)
+##      #val_uri = create_single_validation(data_uri, WS_CLASS_ALG_2, WS_FEATURE_ALG_2)
+##      val_uri = File.join(WS_VAL,"15")
+###      #add_resource val_uri
+##      create_report(rep, val_uri, "validation")
+#        
+#       #val_uri = create_cross_validation(data_uri, WS_CLASS_ALG_2, WS_FEATURE_ALG_2)
+#       #val_uri = create_cross_validation(data_uri)
+#       val_uri = File.join(WS_VAL,"crossvalidation/1")
+#       #val_uri2 = "http://localhost:4007/crossvalidation/14"
+##       # add_resource val_uri
+#       create_report(rep, val_uri, "crossvalidation")
+#        
+##         #val_uri2 = create_cross_validation(data_uri, WS_CLASS_ALG_2, WS_FEATURE_ALG_2)
+##         #val_uri = ["http://localhost:4007/crossvalidation/6", "http://localhost:4007/crossvalidation/8"]
+#         #val_uri = ["http://localhost:4007/crossvalidation/7", "http://localhost:4007/crossvalidation/8"]
+##         #add_resource val_uri
+#         #create_report(rep, val_uri, "algorithm_comparison")
+#      
+#    ensure
+#     # delete_resources
+#    end
+#  end
+#  
+#  private
+#  def create_single_validation(data_uri, ws_class_alg=WS_CLASS_ALG, ws_feat_alg=WS_FEATURE_ALG)
+#    puts "validating"
+#    val_params = { 
+#        :dataset_uri => data_uri, 
+#        :algorithm_uri => ws_class_alg, 
+#        :split_ratio=>0.7,
+#        :prediction_feature => "classification",}
+#    val_params[:feature_generation_uri] = ws_feat_alg if ws_feat_alg
+#    begin
+#      RestClient.post WS_VAL+"/validation/training_test_split", val_params
+#    rescue => ex
+#      raise "error validating "+WS_VAL+"/validation/training_test_split\n "+val_params.inspect+" \n -> "+ex.message
+#    end
+#  end
+#  
+#  def create_cross_validation(data_uri, ws_class_alg=WS_CLASS_ALG, ws_feat_alg=WS_FEATURE_ALG)
+#    puts "cross-validating"
+#    ext("curl -X POST -d num_folds=3 -d dataset_uri="+data_uri+" -d algorithm_uri="+ws_class_alg+" -d prediction_feature=classification"+
+#        (ws_feat_alg ? " -d feature_generation_uri="+ws_feat_alg : "")+
+#        " "+WS_VAL+"/crossvalidation",nil)
+#  end
+#  
+#  def create_report(report_service, val_uri, type)
+#    
+#    Reports.reset_ot_access if ENV['USE_OT_MOCK_LAYER']
+#    report_uri = report_service.create_report(type, val_uri)
+#    assert type == report_service.parse_type(report_uri)
+#    id = report_service.parse_id(report_uri)
+#    
+#    #puts "created report with id "+id.to_s
+#    
+#    #assert_raise(Reports::BadRequest){report_service.get_report(type, id, "weihnachtsmann")}
+#    
+#    report_service.get_report(type, id, "text/html")
+#    #report_service.get_report(type, id, "application/pdf")
+#    #assert_raise(Reports::NotFound){report_service.delete_report(type, 877658)}
+#
+##      rep.delete_report(type, id)
+#  end
+#end
 
     
 
diff --git a/report/validation_access.rb b/report/validation_access.rb
index 2a5ce51..e06c1f0 100644
--- a/report/validation_access.rb
+++ b/report/validation_access.rb
@@ -73,7 +73,7 @@ class Reports::ValidationDB < Reports::ValidationAccess
       validation.send("#{p.to_s}=".to_sym, v[p])
     end
     
-    {:classification_statistics => Lib::VAL_CLASS_PROPS, 
+    {:classification_statistics => Lib::VAL_CLASS_PROPS_EXTENDED, 
      :regression_statistics => Lib::VAL_REGR_PROPS}.each do |subset_name,subset_props|
       subset = v[subset_name]
       subset_props.each{ |prop| validation.send("#{prop.to_s}=".to_sym, subset[prop]) } if subset
@@ -148,7 +148,7 @@ class Reports::ValidationWebservice < Reports::ValidationAccess
     #validation.prediction_feature = model.get_prediction_feature
     
     {Lib::VAL_CV_PROP => Lib::VAL_CV_PROPS,
-     Lib::VAL_CLASS_PROP => Lib::VAL_CLASS_PROPS}.each do |subset_name,subset_props|
+     Lib::VAL_CLASS_PROP => Lib::VAL_CLASS_PROPS_EXTENDED}.each do |subset_name,subset_props|
       subset = data[subset_name]
       subset_props.each{ |prop| validation.send("#{prop}=".to_sym, subset[prop]) } if subset
     end
diff --git a/report/validation_data.rb b/report/validation_data.rb
index 13a5175..a2b8905 100644
--- a/report/validation_data.rb
+++ b/report/validation_data.rb
@@ -1,7 +1,7 @@
 
 # the variance is computed when merging results for these attributes 
-VAL_ATTR_VARIANCE = [ :area_under_roc, :percent_correct, :root_mean_squared_error, :mean_absolute_error, :r_square ]
-VAL_ATTR_RANKING = [ :area_under_roc, :percent_correct, :true_positive_rate, :true_negative_rate ]
+VAL_ATTR_VARIANCE = [ :area_under_roc, :percent_correct, :root_mean_squared_error, :mean_absolute_error, :r_square, :accuracy  ]
+VAL_ATTR_RANKING = [ :area_under_roc, :percent_correct, :true_positive_rate, :true_negative_rate, :accuracy ]
 
 class Object
   
@@ -25,6 +25,19 @@ class Object
   end
 end
 
+class Hash
+  # arithmetic mean of all values of this hash, returned as a Float; raises a RuntimeError if any value is nil or not Numeric
+  def mean_value
+    sum = 0
+    self.values.collect do |v|
+      raise "cannot compute mean of non-numeric values '"+self.inspect+"'" unless v!=nil and v.is_a?(Numeric)
+      sum+=v
+    end
+    sum/=self.values.size.to_f # the value of this assignment is the method's result (NaN for an empty hash: 0/0.0)
+  end
+  
+end
+
 
 module Reports
   
@@ -261,6 +274,18 @@ module Reports
       return new_set
     end
     
+    # returns a new set containing every validation for which the attached block returns a true value
+    # e.g. create set with predictions: collect{ |validation| validation.get_predictions!=nil }
+    #
+    # call-seq:
+    #   collect{ |validation| ... } => Reports::ValidationSet
+    # note: despite its name this filters (selects) validations, it does not map them to new values
+    def collect
+      new_set = Reports::ValidationSet.new
+      validations.each{ |v| new_set.validations.push(v) if yield(v) }
+      return new_set
+    end
+    
     # returns an array, with values for __attributes__, that can be use for a table
     # * first row is header row
     # * other rows are values
@@ -321,7 +346,7 @@ module Reports
     # call-seq:
     #   compute_ranking(equal_attributes, ranking_attribute) => array
     # 
-    def compute_ranking(equal_attributes, ranking_attribute)
+    def compute_ranking(equal_attributes, ranking_attribute, class_value=nil )
     
       new_set = Reports::ValidationSet.new
       (0..@validations.size-1).each do |i|
@@ -334,7 +359,16 @@ module Reports
         # put indices and ranking values for current group into hash
         rank_hash = {}
         (0..group.size-1).each do |i|
-          rank_hash[i] = group[i].send(ranking_attribute)
+          val = group[i].send(ranking_attribute)
+          if val.is_a?(Hash)
+            if class_value != nil
+              raise "no value for class value "+class_value.class.to_s+" "+class_value.to_s+" in hash "+val.inspect.to_s unless val.has_key?(class_value)
+              val = val[class_value]
+            else
+              val = val.mean_value
+            end
+          end
+          rank_hash[i] = val
         end
               
         # sort group accrording to second value (= ranking value)
diff --git a/validation/validation_application.rb b/validation/validation_application.rb
index d4e1a2e..eb3e4a4 100644
--- a/validation/validation_application.rb
+++ b/validation/validation_application.rb
@@ -4,6 +4,7 @@
 end
 
 require 'validation/validation_service.rb'
+require 'lib/merge.rb'
 
 
 # hack: store self in $sinatra to make url_for method accessible in validation_service
@@ -46,8 +47,6 @@ get '/crossvalidation/:id' do
   else
     halt 400, "MIME type '"+request.env['HTTP_ACCEPT'].to_s+"' not supported."
   end
-  
-  halt 202, result unless crossvalidation.finished
   result
 end
 
@@ -65,6 +64,22 @@ get '/crossvalidation/:id/validations' do
   Validation::Validation.all(:crossvalidation_id => params[:id]).collect{ |v| v.uri.to_s }.join("\n")+"\n"
 end
 
+
+get '/crossvalidation/:id/statistics' do
+  LOGGER.info "get merged validation-result for crossvalidation with id "+params[:id].to_s
+  halt 404, "Crossvalidation #{params[:id]} not found." unless crossvalidation = Validation::Crossvalidation.get(params[:id])
+  # merge the attributes listed below over all validations of this crossvalidation (Array#merge_array from lib/merge.rb)
+  to_merge = [:prediction_feature, :num_instances,:num_without_class,:percent_without_class,:num_unpredicted,:percent_unpredicted,
+    :classification_statistics,:regression_statistics,:crossvalidation_id]
+  v = Validation::Validation.all(:crossvalidation_id => params[:id]).merge_array(to_merge)
+  v.uri = nil # uri, created_at and id are reset on the merged object before it is serialized
+  v.created_at = nil
+  v.id = nil
+  content_type "text/x-yaml"
+  v.to_yaml
+end
+
+
 post '/crossvalidation/?' do
   OpenTox::Task.as_task do
     LOGGER.info "creating crossvalidation "+params.inspect
@@ -102,8 +117,6 @@ get '/:id' do
   else
     halt 400, "MIME type '"+request.env['HTTP_ACCEPT'].to_s+"' not supported."
   end
-  
-  halt 202, result unless validation.finished
   result
 end
 
diff --git a/validation/validation_service.rb b/validation/validation_service.rb
index 1e2c103..469a717 100644
--- a/validation/validation_service.rb
+++ b/validation/validation_service.rb
@@ -110,7 +110,6 @@ module Validation
       end
       
       update :prediction_dataset_uri => prediction_dataset_uri, 
-             :finished => true, 
              :real_runtime => benchmark.real,
              :num_instances => prediction.num_instances,
              :num_without_class => prediction.num_without_class,
diff --git a/validation/validation_test.rb b/validation/validation_test.rb
index 5cfd319..643ea81 100644
--- a/validation/validation_test.rb
+++ b/validation/validation_test.rb
@@ -190,16 +190,16 @@ class ValidationTest < Test::Unit::TestCase
     #get '/'     
 
     #get '/prepare_examples'
-    get '/test_examples'
+    #get '/test_examples'
 
     #get '/1',nil,'HTTP_ACCEPT' => "application/rdf+xml"
     #get '/1',nil,'HTTP_ACCEPT' => "text/x-yaml"
 
     
     #get '/crossvalidation/1',nil,'HTTP_ACCEPT' => "application/rdf+xml"
-    #get '/crossvalidation/1',nil,'HTTP_ACCEPT' => "text/x-yaml"
+    get '/crossvalidation/1/statistics',nil,'HTTP_ACCEPT' => "text/x-yaml"
     
-    #puts last_response.body
+    puts last_response.body
   end
   
 #  private
author	Martin Gütlein <martin.guetlein@gmail.com>	2010-03-02 14:11:57 +0100
committer	Martin Gütlein <martin.guetlein@gmail.com>	2010-03-02 14:11:57 +0100
commit	b7efeaaf79233de8bbc173fa426e4561c458d44f (patch)
tree	9326f1d711a0536f069266a4613e0e0c65e26b62
parent	e2b814301c323bc787ad9d75eceb786e3cb7dde9 (diff)