summaryrefslogtreecommitdiff
path: root/lib/validation.rb
blob: c2250de87f5bb57a4f532067bf212cf71c3de307 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
module OpenTox

  # Result of validating a model on a single training/test split.
  # Documents are stored in the "validations" collection.
  class Validation
    include OpenTox
    include Mongoid::Document
    include Mongoid::Timestamps
    store_in collection: "validations"

    field :prediction_dataset_id, type: BSON::ObjectId
    field :test_dataset_id, type: BSON::ObjectId
    field :nr_instances, type: Integer
    field :nr_unpredicted, type: Integer
    # Array rather than String: create assigns the prediction feature's list
    # of accept values (CrossValidation declares the same field as Array).
    field :accept_values, type: Array
    field :confusion_matrix, type: Array
    field :weighted_confusion_matrix, type: Array
    field :predictions, type: Array

    # TODO classification und regression in subclasses

    # Builds a Lazar model on training_set, predicts the compounds of
    # test_set and saves a Validation with the resulting statistics.
    #
    # @param model        model whose feature dataset determines how features are generated
    # @param training_set dataset used to build the validation model
    # @param test_set     dataset whose compounds are predicted; its first data entry
    #                     column is used as the measured activity
    # @return [Validation] the saved validation document
    def self.create model, training_set, test_set
      feature_dataset = Dataset.find model.feature_dataset_id
      if feature_dataset.is_a? FminerDataset
        features = Algorithm.run feature_dataset.training_algorithm, training_set, feature_dataset.training_parameters
      else
        # TODO search for descriptors
        # (features stays nil on this path)
      end
      validation_model = Model::Lazar.create training_set, features
      test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
      prediction_dataset = validation_model.predict test_set_without_activities
      accept_values = prediction_dataset.prediction_feature.accept_values
      nr_classes = accept_values.size
      # Square matrices of zeros; only the block form is used, because Array.new
      # ignores a default value when a block is given (and warns about it).
      confusion_matrix = Array.new(nr_classes) { Array.new(nr_classes, 0) }
      weighted_confusion_matrix = Array.new(nr_classes) { Array.new(nr_classes, 0) }
      predictions = []
      prediction_dataset.data_entries.each_with_index do |pe,i|
        # pe[0] is the predicted value, pe[1] the confidence; skip unpredicted entries
        next unless pe[0] and pe[1] and pe[1].numeric?
        prediction = pe[0]
        # TODO prediction_feature, convention??
        # TODO generalize for multiple classes
        activity = test_set.data_entries[i].first
        confidence = pe[1]
        predictions << [prediction_dataset.compound_ids[i], activity, prediction, confidence]
        # Row index = predicted class (only the first two accept values are handled)
        row = if prediction == accept_values[0]
                0
              elsif prediction == accept_values[1]
                1
              end
        next unless row
        # Correct predictions go on the diagonal, every other outcome is
        # counted in the opposite column of the binary matrix.
        column = prediction == activity ? row : 1 - row
        confusion_matrix[row][column] += 1
        weighted_confusion_matrix[row][column] += confidence
      end
      validation = self.new(
        :prediction_dataset_id => prediction_dataset.id,
        :test_dataset_id => test_set.id,
        :nr_instances => test_set.compound_ids.size,
        :nr_unpredicted => prediction_dataset.data_entries.count{|de| de.first.nil?},
        :accept_values => accept_values,
        :confusion_matrix => confusion_matrix,
        :weighted_confusion_matrix => weighted_confusion_matrix,
        :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
      )
      validation.save
      validation
    end

    # @return the dataset referenced by prediction_dataset_id
    def prediction_dataset
      Dataset.find prediction_dataset_id
    end

    # @return the dataset referenced by test_dataset_id
    def test_dataset
      Dataset.find test_dataset_id
    end

  end

  # Aggregated result of an n-fold crossvalidation.
  # Documents are stored in the "crossvalidations" collection.
  class CrossValidation
    include OpenTox
    include Mongoid::Document
    include Mongoid::Timestamps
    store_in collection: "crossvalidations"

    field :validation_ids, type: Array, default: []
    field :folds, type: Integer
    field :nr_instances, type: Integer
    field :nr_unpredicted, type: Integer
    field :accept_values, type: Array
    field :confusion_matrix, type: Array
    field :weighted_confusion_matrix, type: Array
    field :accuracy, type: Float
    field :weighted_accuracy, type: Float
    field :true_rate, type: Hash
    field :predictivity, type: Hash
    field :predictions, type: Array
    # TODO auc, f-measure (usability??)

    # Runs one Validation per fold of the model's training dataset, sums up
    # the per-fold statistics and saves them as a CrossValidation.
    #
    # @param model model to validate; its training dataset is split into folds
    # @param n     number of folds (default 10)
    # @return [CrossValidation] the saved crossvalidation document
    def self.create model, n=10
      validation_ids = []
      nr_instances = 0
      nr_unpredicted = 0
      accept_values = model.prediction_feature.accept_values
      nr_classes = accept_values.size
      # Square matrices of zeros; only the block form is used, because Array.new
      # ignores a default value when a block is given (and warns about it).
      confusion_matrix = Array.new(nr_classes) { Array.new(nr_classes, 0) }
      weighted_confusion_matrix = Array.new(nr_classes) { Array.new(nr_classes, 0) }
      predictions = []
      model.training_dataset.folds(n).each do |fold|
        # fold[0] is passed as training set, fold[1] as test set
        validation = Validation.create(model, fold[0], fold[1])
        validation_ids << validation.id
        nr_instances += validation.nr_instances
        nr_unpredicted += validation.nr_unpredicted
        validation.confusion_matrix.each_with_index do |r,i|
          r.each_with_index do |c,j|
            confusion_matrix[i][j] += c
            weighted_confusion_matrix[i][j] += validation.weighted_confusion_matrix[i][j]
          end
        end
        # concat (not <<) keeps one flat list of prediction rows, so that the
        # confidence sort below compares rows instead of whole per-fold arrays
        predictions.concat validation.predictions
      end
      true_rate = {}
      predictivity = {}
      # NOTE(review): Validation.create indexes matrix rows by the predicted
      # value, so the row-based ratio may actually be the predictivity and the
      # column-based ratio the true rate - confirm the intended convention.
      accept_values.each_with_index do |v,i|
        true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
        predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|row| row[i]}.reduce(:+).to_f
      end
      # Sum of all confidences that entered the weighted matrix
      confidence_sum = weighted_confusion_matrix.flatten.reduce(0, :+)
      cv = CrossValidation.new(
        :folds => n,
        :validation_ids => validation_ids,
        :nr_instances => nr_instances,
        :nr_unpredicted => nr_unpredicted,
        :accept_values => accept_values,
        :confusion_matrix => confusion_matrix,
        :weighted_confusion_matrix => weighted_confusion_matrix,
        # accuracies use the first two diagonal cells only (binary classification)
        :accuracy => (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
        :weighted_accuracy => (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
        :true_rate => true_rate,
        :predictivity => predictivity,
        :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
      )
      cv.save
      cv
    end

    #Average area under roc  0.646
    #Area under roc  0.646
    #F measure carcinogen: 0.769, noncarcinogen: 0.348

  end

end