diff options
Diffstat (limited to 'report/plot_factory.rb')
-rw-r--r-- | report/plot_factory.rb | 228 |
1 files changed, 137 insertions, 91 deletions
diff --git a/report/plot_factory.rb b/report/plot_factory.rb index a4e415a..bf59960 100644 --- a/report/plot_factory.rb +++ b/report/plot_factory.rb @@ -52,10 +52,12 @@ module Reports module PlotFactory - def self.create_regression_plot( out_file, validation_set, name_attribute ) + def self.create_regression_plot( out_files, validation_set, name_attribute, logscale=true ) - LOGGER.debug "Creating regression plot, out-file:"+out_file.to_s + out_files = [out_files] unless out_files.is_a?(Array) + LOGGER.debug "Creating regression plot, out-file:"+out_files.to_s + omit_count = 0 names = [] x = [] y = [] @@ -63,23 +65,34 @@ module Reports x_i = v.get_predictions.predicted_values y_i = v.get_predictions.actual_values - # filter out nil-predictions - not_nil_indices = [] + # filter out nil-predictions and <=0 predictions if log-scale wanted + valid_indices = [] x_i.size.times do |i| - not_nil_indices << i if x_i[i]!=nil && y_i[i]!=nil + if x_i[i]!=nil and y_i[i]!=nil + if !logscale or (x_i[i]>0 and y_i[i]>0) + valid_indices << i + else + omit_count += 1 + end + end end - if not_nil_indices.size < x_i.size - x_i = not_nil_indices.collect{ |i| x_i[i] } - y_i = not_nil_indices.collect{ |i| y_i[i] } + if valid_indices.size < x_i.size + x_i = valid_indices.collect{ |i| x_i[i] } + y_i = valid_indices.collect{ |i| y_i[i] } end names << ( name_attribute==:crossvalidation_fold ? "fold " : "" ) + v.send(name_attribute).to_s x << x_i y << y_i end - - raise "no predictions performed" if x.size==0 || x[0].size==0 - RubyPlot::regression_point_plot(out_file, "Regression plot", "Predicted values", "Actual values", names, x, y ) + names = [""] if names.size==1 + + omit_str = omit_count>0 ? " ("+omit_count.to_s+" predictions omitted)" : "" + raise "no predictions performed"+omit_str if x.size==0 || x[0].size==0 + out_files.each do |out_file| + RubyPlot::regression_point_plot(out_file, "Regression plot", "Predicted values", "Actual values", names, x, y, logscale) + end + omit_count end @@ -91,36 +104,37 @@ module Reports # * the validation set is splitted into sets of validation_sets with equal attribute values # * each of theses validation sets is plotted as a roc-curve # - def self.create_roc_plot( out_file, validation_set, class_value, split_set_attribute=nil, show_single_curves=false ) + def self.create_roc_plot( out_files, validation_set, class_value, split_set_attribute=nil, + x_label="False positive rate", y_label="True Positive Rate" ) - LOGGER.debug "creating roc plot for '"+validation_set.size.to_s+"' validations, out-file:"+out_file.to_s + out_files = [out_files] unless out_files.is_a?(Array) + LOGGER.debug "creating roc plot for '"+validation_set.size.to_s+"' validations, out-files:"+out_files.inspect + data = [] if split_set_attribute attribute_values = validation_set.get_values(split_set_attribute) - names = [] - fp_rates = [] - tp_rates = [] attribute_values.each do |value| begin - data = transform_roc_predictions(validation_set.filter({split_set_attribute => value}), class_value, false) - names << value.to_s - fp_rates << data[:fp_rate][0] - tp_rates << data[:tp_rate][0] + data << transform_roc_predictions(validation_set.filter({split_set_attribute => value}), class_value, false ) + data[-1].name = split_set_attribute.to_s.nice_attr+" "+value.to_s rescue LOGGER.warn "could not create ROC plot for "+value.to_s end end - RubyPlot::plot_lines(out_file, "ROC-Plot", "False positive rate", "True Positive Rate", names, fp_rates, tp_rates ) else - data = transform_roc_predictions(validation_set, class_value, show_single_curves) - RubyPlot::plot_lines(out_file, "ROC-Plot", "False positive rate", "True Positive Rate", data[:names], data[:fp_rate], data[:tp_rate], data[:faint] ) + data << transform_roc_predictions(validation_set, class_value ) end + + out_files.each do |out_file| + RubyPlot::plot_lines(out_file, "ROC-Plot", x_label, y_label, data ) + end end - def self.create_confidence_plot( out_file, validation_set, class_value, split_set_attribute=nil, show_single_curves=false ) + def self.create_confidence_plot( out_files, validation_set, class_value, split_set_attribute=nil, show_single_curves=false ) - LOGGER.debug "creating confidence plot for '"+validation_set.size.to_s+"' validations, out-file:"+out_file.to_s + out_files = [out_files] unless out_files.is_a?(Array) + LOGGER.debug "creating confidence plot for '"+validation_set.size.to_s+"' validations, out-file:"+out_files.inspect if split_set_attribute attribute_values = validation_set.get_values(split_set_attribute) @@ -130,7 +144,7 @@ module Reports attribute_values.each do |value| begin data = transform_confidence_predictions(validation_set.filter({split_set_attribute => value}), class_value, false) - names << value.to_s + names << split_set_attribute.to_s.nice_attr+" "+value.to_s confidence << data[:confidence][0] performance << data[:performance][0] rescue @@ -138,27 +152,32 @@ module Reports end end #RubyPlot::plot_lines(out_file, "Percent Correct vs Confidence Plot", "Confidence", "Percent Correct", names, fp_rates, tp_rates ) - case validation_set.unique_feature_type - when "classification" - RubyPlot::accuracy_confidence_plot(out_file, "Percent Correct vs Confidence Plot", "Confidence", "Percent Correct", names, confidence, performance) - when "regression" - RubyPlot::accuracy_confidence_plot(out_file, "RMSE vs Confidence Plot", "Confidence", "RMSE", names, confidence, performance, true) + out_files.each do |out_file| + case validation_set.unique_feature_type + when "classification" + RubyPlot::accuracy_confidence_plot(out_file, "Percent Correct vs Confidence Plot", "Confidence", "Percent Correct", names, confidence, performance) + when "regression" + RubyPlot::accuracy_confidence_plot(out_file, "RMSE vs Confidence Plot", "Confidence", "RMSE", names, confidence, performance, true) + end end else data = transform_confidence_predictions(validation_set, class_value, show_single_curves) - case validation_set.unique_feature_type - when "classification" - RubyPlot::accuracy_confidence_plot(out_file, "Percent Correct vs Confidence Plot", "Confidence", "Percent Correct", data[:names], data[:confidence], data[:performance]) - when "regression" - RubyPlot::accuracy_confidence_plot(out_file, "RMSE vs Confidence Plot", "Confidence", "RMSE", data[:names], data[:confidence], data[:performance], true) + out_files.each do |out_file| + case validation_set.unique_feature_type + when "classification" + RubyPlot::accuracy_confidence_plot(out_file, "Percent Correct vs Confidence Plot", "Confidence", "Percent Correct", data[:names], data[:confidence], data[:performance]) + when "regression" + RubyPlot::accuracy_confidence_plot(out_file, "RMSE vs Confidence Plot", "Confidence", "RMSE", data[:names], data[:confidence], data[:performance], true) + end end end end - def self.create_bar_plot( out_file, validation_set, title_attribute, value_attributes ) + def self.create_bar_plot( out_files, validation_set, title_attribute, value_attributes ) - LOGGER.debug "creating bar plot, out-file:"+out_file.to_s + out_files = [out_files] unless out_files.is_a?(Array) + LOGGER.debug "creating bar plot, out-files:"+out_files.inspect data = [] titles = [] @@ -167,25 +186,35 @@ module Reports validation_set.validations.each do |v| values = [] value_attributes.each do |a| - validation_set.get_accept_values_for_attr(a).each do |class_value| - value = v.send(a) - if value.is_a?(Hash) - if class_value==nil - avg_value = 0 - value.values.each{ |val| avg_value+=val } - value = avg_value/value.values.size.to_f - else - raise "bar plot value is hash, but no entry for class-value ("+class_value.to_s+"); value for "+a.to_s+" -> "+value.inspect unless value.key?(class_value) - value = value[class_value] + + accept = validation_set.get_accept_values_for_attr(a) + if accept and accept.size>0 + accept.each do |class_value| + value = v.send(a) + if value.is_a?(Hash) + if class_value==nil + avg_value = 0 + value.values.each{ |val| avg_value+=val } + value = avg_value/value.values.size.to_f + else + raise "bar plot value is hash, but no entry for class-value ("+class_value.to_s+"); value for "+a.to_s+" -> "+value.inspect unless value.key?(class_value) + value = value[class_value] + end end + raise "value is nil\nattribute: "+a.to_s+"\nvalidation: "+v.inspect if value==nil + values.push(value) + labels.push(a.to_s.gsub("_","-") + ( class_value==nil ? "" : "("+class_value.to_s+")" )) end - raise "value is nil\nattribute: "+a.to_s+"\nvalidation: "+v.inspect if value==nil + else + value = v.send(a) values.push(value) - labels.push(a.to_s.gsub("_","-") + ( class_value==nil ? "" : "("+class_value.to_s+")" )) + labels.push(a.to_s.gsub("_","-")) end + end titles << v.send(title_attribute).to_s + raise "no title for '"+title_attribute.to_s+"' in validation: "+v.to_yaml if titles[-1].to_s.size==0 data << values end @@ -197,7 +226,9 @@ module Reports LOGGER.debug "bar plot labels: "+labels.inspect LOGGER.debug "bar plot data: "+data.inspect - RubyPlot::plot_bars('Bar plot', labels, data, out_file) + out_files.each do |out_file| + RubyPlot::plot_bars('Bar plot', labels, data, out_file) + end end @@ -261,43 +292,27 @@ module Reports end private - def self.transform_roc_predictions(validation_set, class_value, add_single_folds=false) - + def self.transform_roc_predictions(validation_set, class_value, add_label=true ) if (validation_set.size > 1) - - names = []; fp_rate = []; tp_rate = []; faint = [] - sum_roc_values = { :predicted_values => [], :actual_values => [], :confidence_values => []} - + values = { :predicted_values => [], :actual_values => [], :confidence_values => []} (0..validation_set.size-1).each do |i| roc_values = validation_set.get(i).get_predictions.get_prediction_values(class_value) - sum_roc_values[:predicted_values] += roc_values[:predicted_values] - sum_roc_values[:confidence_values] += roc_values[:confidence_values] - sum_roc_values[:actual_values] += roc_values[:actual_values] - if add_single_folds - begin - tp_fp_rates = get_tp_fp_rates(roc_values) - names << "fold "+i.to_s - fp_rate << tp_fp_rates[:fp_rate] - tp_rate << tp_fp_rates[:tp_rate] - faint << true - rescue - LOGGER.warn "could not get ROC vals for fold "+i.to_s - end - end + values[:predicted_values] += roc_values[:predicted_values] + values[:confidence_values] += roc_values[:confidence_values] + values[:actual_values] += roc_values[:actual_values] end - tp_fp_rates = get_tp_fp_rates(sum_roc_values) - names << nil # "all" - fp_rate << tp_fp_rates[:fp_rate] - tp_rate << tp_fp_rates[:tp_rate] - faint << false - return { :names => names, :fp_rate => fp_rate, :tp_rate => tp_rate, :faint => faint } else - roc_values = validation_set.validations[0].get_predictions.get_prediction_values(class_value) - tp_fp_rates = get_tp_fp_rates(roc_values) - return { :names => ["default"], :fp_rate => [tp_fp_rates[:fp_rate]], :tp_rate => [tp_fp_rates[:tp_rate]] } + values = validation_set.validations[0].get_predictions.get_prediction_values(class_value) end + tp_fp_rates = get_tp_fp_rates(values) + labels = [] + tp_fp_rates[:youden].each do |point,confidence| + labels << ["confidence: "+confidence.to_nice_s, point[0], point[1]] + end if add_label + RubyPlot::LinePlotData.new(:name => "", :x_values => tp_fp_rates[:fp_rate], :y_values => tp_fp_rates[:tp_rate], :labels => labels) end + def self.transform_confidence_predictions(validation_set, class_value, add_single_folds=false) if (validation_set.size > 1) @@ -333,20 +348,29 @@ module Reports else confidence_values = validation_set.validations[0].get_predictions.get_prediction_values(class_value) pref_conf_rates = get_performance_confidence_rates(confidence_values, validation_set.unique_feature_type) - return { :names => ["default"], :performance => [pref_conf_rates[:performance]], :confidence => [pref_conf_rates[:confidence]] } + return { :names => [""], :performance => [pref_conf_rates[:performance]], :confidence => [pref_conf_rates[:confidence]] } end end - def self.demo_rock_plot - roc_values = {:confidence_values => [0.1, 0.9, 0.5, 0.6, 0.6, 0.6], - :predicted_values => [1, 0, 0, 1, 0, 1], - :actual_values => [0, 1, 0, 0, 1, 1]} + def self.demo_roc_plot +# roc_values = {:confidence_values => [0.1, 0.9, 0.5, 0.6, 0.6, 0.6], +# :predicted_values => [1, 0, 0, 1, 0, 1], +# :actual_values => [0, 1, 0, 0, 1, 1]} + roc_values = {:confidence_values => [0.9, 0.8, 0.7, 0.6, 0.5, 0.4], + :predicted_values => [1, 1, 1, 1, 1, 1], + :actual_values => [1, 0, 1, 0, 1, 0]} tp_fp_rates = get_tp_fp_rates(roc_values) - data = { :names => ["default"], :fp_rate => [tp_fp_rates[:fp_rate]], :tp_rate => [tp_fp_rates[:tp_rate]] } + labels = [] + tp_fp_rates[:youden].each do |point,confidence| + labels << ["confidence: "+confidence.to_s, point[0], point[1]] + end + + plot_data = [] + plot_data << RubyPlot::LinePlotData.new(:name => "testname", :x_values => tp_fp_rates[:fp_rate], :y_values => tp_fp_rates[:tp_rate], :labels => labels) RubyPlot::plot_lines("/tmp/plot.png", "ROC-Plot", "False positive rate", - "True Positive Rate", data[:names], data[:fp_rate], data[:tp_rate], data[:faint] ) + "True Positive Rate", plot_data ) end def self.get_performance_confidence_rates(roc_values, feature_type) @@ -354,7 +378,7 @@ module Reports c = roc_values[:confidence_values] p = roc_values[:predicted_values] a = roc_values[:actual_values] - raise "no prediction values for roc-plot" if p.size==0 + raise "no prediction values for confidence plot" if p.size==0 (0..p.size-2).each do |i| ((i+1)..p.size-1).each do |j| @@ -462,21 +486,43 @@ module Reports w = w.compress_sum(c2) #puts tp_rate.inspect+"\n"+fp_rate.inspect+"\n"+w.inspect+"\n\n" + youden = [] + (0..tp_rate.size-1).each do |i| + tpr = tp_rate[i]/tp_rate[-1].to_f + fpr = fp_rate[i]/fp_rate[-1].to_f + youden << tpr + (1 - fpr) + #puts youden[-1].to_s+" ("+tpr.to_s+" "+fpr.to_s+")" + end + max = youden.max + youden_hash = {} + (0..tp_rate.size-1).each do |i| + if youden[i]==max and i>0 + youden_hash[i] = c2[i] + end + end + #puts youden.inspect+"\n"+youden_hash.inspect+"\n\n" + (0..tp_rate.size-1).each do |i| tp_rate[i] = tp_rate[-1]>0 ? tp_rate[i]/tp_rate[-1].to_f*100 : 100 fp_rate[i] = fp_rate[-1]>0 ? fp_rate[i]/fp_rate[-1].to_f*100 : 100 end #puts tp_rate.inspect+"\n"+fp_rate.inspect+"\n\n" - return {:tp_rate => tp_rate,:fp_rate => fp_rate} + youden_coordinates_hash = {} + youden_hash.each do |i,c| + youden_coordinates_hash[[fp_rate[i],tp_rate[i]]] = c + end + #puts youden_coordinates_hash.inspect+"\n\n" + + return {:tp_rate => tp_rate,:fp_rate => fp_rate,:youden => youden_coordinates_hash} end end end #require "rubygems" #require "ruby-plot" -#Reports::PlotFactory::demo_ranking_plot -#Reports::PlotFactory::demo_rock_plot +##Reports::PlotFactory::demo_ranking_plot +#Reports::PlotFactory::demo_roc_plot #a = [1, 0, 1, 2, 3, 0, 2] #puts a.compress_sum([100, 90, 70, 70, 30, 10, 0]).inspect |