summaryrefslogtreecommitdiff
path: root/lib/statistics.rb
blob: e14dc7ce0403da8776e906071e4fcdbe39812557 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
require "fileutils"

# Aggregates binary-classification results from a directory of
# cross-validation folds into confusion matrices and summary statistics.
#
# Layout read from and written to +dir+ (as used by the code below):
#
#   dir/<fold>/test/predictions                  CSV rows; column 2 holds the
#                                                predicted class, column 5 the
#                                                maximum similarity
#   dir/<fold>/test/dependent-variables          one measured class per line
#   dir/<fold>/train/similarity-thresholds       one float per line; the second
#                                                value is the cut-off between
#                                                low and high confidence
#   dir/confusion-matrices/<category>            written by #confusion_matrix
#   dir/summaries/<category>                     written by #summary
#
# A fold is any entry of +dir+ whose name starts with a digit.
# NOTE(review): the meaning of the other prediction columns and of the first
# similarity threshold is not visible here - confirm against the producer.
class ClassificationStatistics
  # Result categories, in the order their files are written.
  CATEGORIES = [:all, :high_confidence, :low_confidence].freeze

  # @param dir [String] directory containing the fold sub-directories
  def initialize(dir)
    @dir = dir
    @folds = Dir[File.join(@dir, "[0-9]*")]
    @confusion_matrix_dir = File.join(@dir, "confusion-matrices")
    @summaries_dir = File.join(@dir, "summaries")
  end

  # Counts true/false positives/negatives over all folds and writes one
  # confusion matrix file per category in the form
  #
  #   tp,fp
  #   fn,tn
  #
  # Rows without a prediction, and rows whose prediction or measurement is
  # neither 0 nor 1, are ignored. A prediction counts as high confidence when
  # its maximum similarity is strictly greater than the fold's second
  # similarity threshold.
  #
  # Files are written only after every fold has been read successfully, so a
  # failure in one fold never leaves partial matrices behind for #summary to
  # pick up. When there are no folds nothing is written.
  #
  # @return [Array<String>] the fold directories (unchanged legacy return value)
  def confusion_matrix
    matrices = empty_matrices
    @folds.each { |fold_dir| tally_fold(fold_dir, matrices) }
    write_matrices(matrices) unless @folds.empty?
    @folds
  end

  # Writes accuracy, true positive/negative rate and positive/negative
  # predictive value for every category to +dir/summaries/<category>+.
  # Missing confusion matrix files are generated first via #confusion_matrix.
  #
  # Ratios are computed with floats, so a zero denominator is reported as
  # +NaN+ instead of raising.
  #
  # @return [Array<Symbol>] the processed categories
  def summary
    CATEGORIES.each do |category|
      matrix_file = File.join(@confusion_matrix_dir, category.to_s)
      # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
      confusion_matrix unless File.exist?(matrix_file)

      # File layout is "tp,fp" on the first line and "fn,tn" on the second.
      (tp, fp), (fn, tn) = File.readlines(matrix_file).map do |row|
        row.chomp.split(",").map(&:to_f)
      end

      FileUtils.mkdir_p @summaries_dir
      File.open(File.join(@summaries_dir, category.to_s), "w+") do |f|
        f.puts "accuracy,#{(tp + tn) / (tp + fp + tn + fn)}"
        f.puts "true_positive_rate,#{tp / (tp + fn)}"
        f.puts "true_negative_rate,#{tn / (tn + fp)}"
        f.puts "positive_predictive_value,#{tp / (tp + fp)}"
        f.puts "negative_predictive_value,#{tn / (tn + fn)}"
      end
    end
  end

  private

  # Builds a zeroed confusion matrix for every category.
  def empty_matrices
    CATEGORIES.each_with_object({}) do |category, acc|
      acc[category] = { tp: 0, fp: 0, tn: 0, fn: 0 }
    end
  end

  # Adds the outcomes of a single fold to +matrices+.
  def tally_fold(fold_dir, matrices)
    test_dir = File.join(fold_dir, "test")
    rows = File.readlines(File.join(test_dir, "predictions")).map { |row| row.chomp.split(",") }
    measurements = File.readlines(File.join(test_dir, "dependent-variables")).map(&:to_i)
    thresholds = File.readlines(File.join(fold_dir, "train", "similarity-thresholds")).map { |v| v.chomp.to_f }
    confidence_threshold = thresholds[1]

    rows.each_with_index do |fields, i|
      prediction = fields[2]
      # String#split drops trailing empty fields, so a row without a
      # prediction may yield nil here rather than an empty string.
      next if prediction.nil? || prediction.empty?

      cell = outcome(prediction.to_i, measurements[i])
      next unless cell

      max_sim = fields[5].to_f
      confidence = max_sim > confidence_threshold ? :high_confidence : :low_confidence
      matrices[:all][cell] += 1
      matrices[confidence][cell] += 1
    end
  end

  # Maps a (prediction, measurement) pair to its confusion matrix cell, or
  # nil when either value is not 0 or 1.
  def outcome(prediction, measurement)
    case [prediction, measurement]
    when [1, 1] then :tp
    when [0, 0] then :tn
    when [1, 0] then :fp
    when [0, 1] then :fn
    end
  end

  # Writes every matrix to its own file inside the confusion matrix directory.
  def write_matrices(matrices)
    FileUtils.mkdir_p @confusion_matrix_dir
    matrices.each do |category, m|
      File.write(File.join(@confusion_matrix_dir, category.to_s),
                 "#{m[:tp]},#{m[:fp]}\n#{m[:fn]},#{m[:tn]}\n")
    end
  end
end