lib/validation.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114

module OpenTox

  # Common base of the validation classes in this file. It declares the
  # fields they share and resolves the stored dataset ids to Dataset objects.
  # NOTE(review): `field` looks like the Mongoid field macro; the include
  # that provides it is not visible in this file -- confirm.
  class Validation
    # Ids of the datasets involved in the validation run.
    field :prediction_dataset_id, type: BSON::ObjectId
    field :test_dataset_id, type: BSON::ObjectId
    # Bookkeeping filled in by the subclasses' `create` methods.
    field :nr_instances, type: Integer
    field :nr_unpredicted, type: Integer
    field :predictions, type: Array

    # Looks up the dataset referenced by prediction_dataset_id.
    def prediction_dataset
      Dataset.find(prediction_dataset_id)
    end

    # Looks up the dataset referenced by test_dataset_id.
    def test_dataset
      Dataset.find(test_dataset_id)
    end
  end

  # Validation of a classification model on a separate test set.
  #
  # In addition to the fields inherited from Validation it records the class
  # labels (accept_values) and two square matrices: one counting predictions
  # and one summing their confidences.
  class ClassificationValidation < Validation
    # NOTE(review): `create` sizes and indexes this value like a list of
    # labels, yet the field is declared as String -- confirm whether
    # `type: Array` was intended before relying on the persisted value.
    field :accept_values, type: String
    field :confusion_matrix, type: Array
    field :weighted_confusion_matrix, type: Array

    # Builds a new model of the same class as +model+ from +training_set+,
    # predicts the compounds of +test_set+ and saves the resulting statistics.
    #
    # model        - only its class is used (to create the validation model)
    # training_set - dataset handed to the model class' `create`
    # test_set     - dataset whose compounds are predicted; the first value of
    #                each of its data entries is taken as the measured activity
    #
    # Returns the saved ClassificationValidation.
    def self.create model, training_set, test_set
      #feature_dataset = Dataset.find model.feature_dataset_id
      # TODO check and delegate to Algorithm
      #features = Algorithm.run feature_dataset.training_algorithm, training_set, feature_dataset.training_parameters
      validation_model = model.class.create training_set#, features
      test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
      prediction_dataset = validation_model.predict test_set_without_activities
      accept_values = prediction_dataset.prediction_feature.accept_values
      nr_classes = accept_values.size
      # The block form gives every row its own array; a default-value
      # argument next to a block is ignored by Ruby and only causes a warning.
      confusion_matrix = Array.new(nr_classes) { Array.new(nr_classes, 0) }
      weighted_confusion_matrix = Array.new(nr_classes) { Array.new(nr_classes, 0) }
      predictions = []
      nr_unpredicted = 0
      prediction_dataset.data_entries.each_with_index do |pe, i|
        prediction = pe[0]
        confidence = pe[1]
        unless prediction && confidence && confidence.numeric?
          # Only entries without any prediction count as unpredicted.
          nr_unpredicted += 1 if prediction.nil?
          next
        end
        # TODO prediction_feature, convention??
        activity = test_set.data_entries[i].first
        predictions << [prediction_dataset.compound_ids[i], activity, prediction, confidence]
        # TODO generalize for multiple classes
        # Row = position of the predicted class among the first two accept
        # values; predictions of any other value are not entered in the matrices.
        row = [0, 1].find { |idx| prediction == accept_values[idx] }
        next if row.nil?
        # A correct prediction goes on the diagonal, a wrong one into the
        # column of the other class (this is what limits it to two classes).
        column = prediction == activity ? row : 1 - row
        confusion_matrix[row][column] += 1
        weighted_confusion_matrix[row][column] += confidence
      end
      validation = self.new(
        :prediction_dataset_id => prediction_dataset.id,
        :test_dataset_id => test_set.id,
        :nr_instances => test_set.compound_ids.size,
        :nr_unpredicted => nr_unpredicted,
        :accept_values => accept_values,
        :confusion_matrix => confusion_matrix,
        :weighted_confusion_matrix => weighted_confusion_matrix,
        :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
      )
      validation.save
      validation
    end
  end

  # Validation of a regression model on a separate test set.
  class RegressionValidation < Validation
    # Trains a Model::LazarRegression on +training_set+, predicts the
    # compounds of +test_set+ and saves the collected predictions.
    #
    # model        - accepted for signature parity; not used by this method
    # training_set - dataset handed to Model::LazarRegression.create
    # test_set     - dataset whose compounds are predicted; the first value of
    #                each of its data entries is taken as the measured activity
    #
    # Returns the saved RegressionValidation.
    def self.create(model, training_set, test_set)
      validation_model = Model::LazarRegression.create(training_set)
      # just to be sure that activities cannot be used
      test_set_without_activities = Dataset.new(compound_ids: test_set.compound_ids)
      prediction_dataset = validation_model.predict(test_set_without_activities)

      measured = test_set.data_entries.map { |entry| entry.first }
      collected = []
      skipped = 0
      prediction_dataset.data_entries.each_with_index do |entry, idx|
        # An entry needs both a prediction and a numeric confidence to count.
        unless entry[0] && entry[1] && entry[1].numeric?
          skipped += 1
          next
        end
        observed = measured[idx]
        predicted = entry.first
        certainty = entry[1]
        collected << [prediction_dataset.compound_ids[idx], observed, predicted, certainty]
      end

      # sort according to confidence
      ranked = collected.sort { |left, right| right[3] <=> left[3] }
      validation = new(
        prediction_dataset_id: prediction_dataset.id,
        test_dataset_id: test_set.id,
        nr_instances: test_set.compound_ids.size,
        nr_unpredicted: skipped,
        predictions: ranked
      )
      validation.save
      validation
    end
  end

end