# lib/prediction_data.rb

module Lib
  
  
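  # Collects predicted values, actual values, and confidence values (plus the
  # corresponding compound URIs) for one or more test/prediction dataset pairs,
  # e.g. the folds of a crossvalidation. Instances are built via
  # PredictionData.create and can be narrowed down via PredictionData.filter_data.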
  class PredictionData
    
    CHECK_VALUES = true #ENV['RACK_ENV'] =~ /debug|test/
    
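    # Filters the prediction data by confidence: either keep every prediction with
    # confidence >= min_confidence (but at least min_num_predictions entries), or
    # keep the max_num_predictions most confident ones. Exactly one of
    # min_confidence / max_num_predictions must be given; prediction_index can
    # additionally restrict the data to one predicted class value.
    #
    # Hypothetical usage sketch (variable names are illustrative, not taken from
    # the calling code):
    #
    #   pd = Lib::PredictionData.create("classification", test_uri, feature_uri,
    #     prediction_uri, predicted_var, predicted_conf)
    #   top_ten     = Lib::PredictionData.filter_data(pd.data, pd.compounds, nil, nil, 10)
    #   at_least_90 = Lib::PredictionData.filter_data(pd.data, pd.compounds, 0.9, 5, nil)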
    def self.filter_data( data, compounds, min_confidence, min_num_predictions, max_num_predictions, prediction_index=nil )
      
      internal_server_error "cannot filter anything, no confidence values available" if data[:confidence_values][0]==nil
      bad_request_error "please specify either min_confidence XOR max_num_predictions" if 
        (min_confidence!=nil and max_num_predictions!=nil) || (min_confidence==nil and max_num_predictions==nil)
      bad_request_error "min_num_predictions only valid for min_confidence" if 
        (min_confidence==nil and min_num_predictions!=nil)
      min_num_predictions = 0 if min_num_predictions==nil
      
      $logger.debug("filtering predictions, conf:'"+min_confidence.to_s+"' min_num_predictions: '"+
        min_num_predictions.to_s+"' max_num_predictions: '"+max_num_predictions.to_s+"' ")
      #$logger.debug("to filter:\nconf: "+data[:confidence_values].inspect)
       
      orig_size = data[:predicted_values].size
      valid_indices = []
      data[:confidence_values].size.times do |i|
        next if prediction_index!=nil and prediction_index!=data[:predicted_values][i]
        valid = false
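        # min_confidence mode: the entries are typically already sorted by confidence
        # (descending, see create), so entries are accepted unconditionally as long as
        # at most min_num_predictions entries have been collected, afterwards only if
        # they meet min_confidence; max_num_predictions mode: accept the first
        # (i.e. most confident) max_num_predictions entries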
        if min_confidence!=nil
          valid = (valid_indices.size<=min_num_predictions or 
            (data[:confidence_values][i]!=nil and data[:confidence_values][i]>=min_confidence))
        else
          valid = valid_indices.size<max_num_predictions
        end
        valid_indices << i if valid
      end
      [ :predicted_values, :actual_values, :confidence_values ].each do |key|
        arr = []
        valid_indices.each{|i| arr << data[key][i]}
        data[key] = arr
      end
      new_compounds = nil
      if compounds!=nil
        new_compounds = []
        valid_indices.each{|i| new_compounds << compounds[i]}
      end
      $logger.debug("filtered predictions remaining: "+data[:predicted_values].size.to_s+"/"+orig_size.to_s)
      
      PredictionData.new(data, new_compounds)
    end
    
    def data
      @data
    end
    
    def compounds
      @compounds
    end
    
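    # Loads actual values from the test dataset(s) and predicted values plus
    # confidences from the prediction dataset(s) (one pair per fold), maps the
    # prediction compounds back to the test compounds, and returns a
    # PredictionData object; values are sorted by confidence (descending) when
    # confidences are available.
    #
    # Hypothetical call (URIs are placeholders, not real services):
    #
    #   pd = Lib::PredictionData.create( "regression",
    #     "http://host/dataset/test", "http://host/feature/LC50",
    #     "http://host/dataset/prediction", "http://host/feature/LC50_predicted",
    #     "http://host/feature/confidence" )
    #   pd.data[:predicted_values]  # => array of floats (nil where no prediction was made)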
    def self.create( feature_type, test_dataset_uris, prediction_feature, prediction_dataset_uris, 
      predicted_variables, predicted_confidences, task=nil )      
      
      test_dataset_uris = [test_dataset_uris] unless test_dataset_uris.is_a?(Array)
      prediction_dataset_uris = [prediction_dataset_uris] unless prediction_dataset_uris.is_a?(Array)
      predicted_variables = [predicted_variables] unless predicted_variables.is_a?(Array)
      predicted_confidences = [predicted_confidences] unless predicted_confidences.is_a?(Array)
      $logger.debug "loading prediction - test-dataset: "+test_dataset_uris.inspect+" - prediction-dataset: "+prediction_dataset_uris.inspect+" - predicted_variable: "+predicted_variables.inspect+" - predicted_confidence: "+predicted_confidences.inspect
      $logger.debug "loading prediction -- prediction_feature: "+prediction_feature.to_s
      internal_server_error "prediction_feature missing" unless prediction_feature
      
      all_compounds = []
      all_predicted_values = []
      all_actual_values = []
      all_confidence_values = []
      accept_values = nil
      
      if task
        task_step = 100 / (test_dataset_uris.size*2 + 1)
        task_status = 0
      end

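      # one iteration per fold: each test dataset is paired with its prediction dataset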
      test_dataset_uris.size.times do |i|
        
        test_dataset_uri = test_dataset_uris[i]
        prediction_dataset_uri = prediction_dataset_uris[i]
        predicted_variable = predicted_variables[i]
        predicted_confidence = predicted_confidences[i]
        
        predicted_variable=prediction_feature if predicted_variable==nil
      
        test_dataset = Lib::DatasetCache.find test_dataset_uri
        internal_server_error "test dataset not found: '"+test_dataset_uri.to_s+"'" unless test_dataset
      
        internal_server_error "prediction_feature not found in test_dataset\n"+
              "prediction_feature: '"+prediction_feature.to_s+"'\n"+
              "test_dataset: '"+test_dataset_uri.to_s+"'\n"+  
              "available features are: "+test_dataset.features.inspect if test_dataset.find_feature_uri(prediction_feature)==nil
        
        # $logger.debug "test dataset size: "+test_dataset.compounds.size.to_s
        internal_server_error "test dataset is empty "+test_dataset_uri.to_s unless test_dataset.compounds.size>0
        
        if feature_type=="classification"
          av = OpenTox::Feature.find(prediction_feature).accept_values
          internal_server_error "'"+RDF::OT.acceptValue.to_s+"' missing/invalid for feature '"+prediction_feature.to_s+"' in dataset '"+
            test_dataset_uri.to_s+"', acceptValues are: '"+av.inspect+"'" if av==nil or av.length<2
          if accept_values==nil
            accept_values=av
          else
            internal_server_error "accept values (in folds) differ "+av.inspect+" != "+accept_values.inspect if av!=accept_values
          end
        end
        
        actual_values = []
        test_dataset.compounds.size.times do |c_idx|
          case feature_type
          when "classification"
            actual_values << classification_val(test_dataset, c_idx, prediction_feature, accept_values)
          when "regression"
            actual_values << numeric_val(test_dataset, c_idx, prediction_feature)
          end
          #internal_server_error "WTF #{c_idx} #{test_dataset.compounds[c_idx]} #{actual_values[-1]} #{actual_values[-2]}" if c_idx>0 and test_dataset.compounds[c_idx]==test_dataset.compounds[c_idx-1] and actual_values[-1]!=actual_values[-2]
        end
        task.progress( task_status += task_step ) if task # loaded actual values
      
        prediction_dataset = Lib::DatasetCache.find prediction_dataset_uri
        internal_server_error "prediction dataset not found: '"+prediction_dataset_uri.to_s+"'" unless prediction_dataset
        
        # allow missing prediction feature if there are no compounds in the prediction dataset
        internal_server_error "predicted_variable not found in prediction_dataset\n"+
            "predicted_variable '"+predicted_variable.to_s+"'\n"+
            "prediction_dataset: '"+prediction_dataset_uri.to_s+"'\n"+
            "available features are: "+prediction_dataset.features.inspect if prediction_dataset.find_feature_uri(predicted_variable)==nil and prediction_dataset.compounds.size>0
        internal_server_error "predicted_confidence not found in prediction_dataset\n"+
                "predicted_confidence '"+predicted_confidence.to_s+"'\n"+
                "prediction_dataset: '"+prediction_dataset_uri.to_s+"'\n"+
                "available features are: "+prediction_dataset.features.inspect if predicted_confidence and prediction_dataset.find_feature_uri(predicted_confidence)==nil and prediction_dataset.compounds.size>0

        #internal_server_error "more predicted than test compounds, #test: "+test_dataset.compounds.size.to_s+" < #prediction: "+
        #  prediction_dataset.compounds.size.to_s+", test-dataset: "+test_dataset_uri.to_s+", prediction-dataset: "+
        #   prediction_dataset_uri if test_dataset.compounds.size < prediction_dataset.compounds.size
        if CHECK_VALUES
          prediction_dataset.compounds.size.times do |c_idx| 
            c = prediction_dataset.compounds[c_idx]
            internal_server_error "predicted compound #{c.uri}\nfrom prediction dataset #{prediction_dataset.uri}\nnot found in test-dataset #{test_dataset.uri}\ntest-compounds:\n"+
              test_dataset.compounds.collect{|c| c.uri}.join("\n") if prediction_dataset.data_entry_value(c_idx,predicted_variable)!=nil and !test_dataset.compounds.include?(c)
          end
        end
        
        predicted_values = []
        confidence_values = []
        
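        # for each test compound, look up the corresponding prediction
        # (nil is stored if the model did not predict the compound)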
        test_dataset.compounds.size.times do |test_c_idx|
          c = test_dataset.compounds[test_c_idx].uri

          # handle special case before mapping: the test compound might occur multiple times in the training dataset ..
          # .. and is therefore added multiple times without predicted variable into the prediction dataset
          pred_c_idx = prediction_dataset.compound_indices(c)
          if (pred_c_idx and pred_c_idx.size>1 and pred_c_idx.all?{|idx| prediction_dataset.data_entry_value(idx,predicted_variable)==nil})
            predicted_values << nil
            confidence_values << nil
          else
            # find single corresponding compound for test-compound in prediction-dataset
            pred_c_idx = prediction_dataset.compound_index(test_dataset,test_c_idx)
            if pred_c_idx==nil
              # mapping index = nil, make sure that compound-uri is really not included
              internal_server_error "internal error: mapping failed" if prediction_dataset.compounds.collect{|c| c.uri}.include?(c)
              predicted_values << nil
              confidence_values << nil
            else
              # mapping index != nil, make sure that compound-uri is really equal
              internal_server_error "internal error: mapping failed" unless c==prediction_dataset.compounds[pred_c_idx].uri  
              case feature_type
              when "classification"
                predicted_values << classification_val(prediction_dataset, pred_c_idx, predicted_variable, accept_values)
              when "regression"
                predicted_values << numeric_val(prediction_dataset, pred_c_idx, predicted_variable)
              end
              if predicted_confidence
                confidence_values << numeric_val(prediction_dataset, pred_c_idx, predicted_confidence)
              else
                confidence_values << nil
              end
            end
          end
        end
        all_compounds += test_dataset.compounds.collect{|c| c.uri}
        all_predicted_values += predicted_values
        all_actual_values += actual_values
        all_confidence_values += confidence_values
        
        task.progress( task_status += task_step ) if task # loaded predicted values and confidence
      end
      $logger.debug all_compounds.inspect
      $logger.debug all_predicted_values.inspect
      $logger.debug all_actual_values.inspect
      $logger.debug all_confidence_values.inspect
      
      #sort according to confidence if available
      if all_confidence_values.compact.size>0
        values = []
        all_predicted_values.size.times do |i|
          values << [all_predicted_values[i], all_actual_values[i], all_confidence_values[i], all_compounds[i]]
        end
        values = values.sort_by{ |v| v[2] || 0 }.reverse # sorting by confidence
        all_predicted_values = []
        all_actual_values = []
        all_confidence_values = []
        all_compounds = []
        values.each do |v|
          all_predicted_values << v[0]
          all_actual_values << v[1]
          all_confidence_values << v[2]
          all_compounds << v[3]
        end
      end
      
      internal_server_error "illegal num compounds "+all_compounds.size.to_s+" != "+all_predicted_values.size.to_s if 
        all_compounds.size != all_predicted_values.size
      task.progress(100) if task # done with the mathematics
      data = { :predicted_values => all_predicted_values, :actual_values => all_actual_values, :confidence_values => all_confidence_values,
        :feature_type => feature_type, :accept_values => accept_values }
        
      $logger.debug data.inspect
      PredictionData.new(data, all_compounds)
    end
    
    private
    def initialize( data, compounds )
      @data = data
      @compounds = compounds
    end
    
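    # Returns the numeric value of 'feature' for the compound at 'compound_index',
    # converting strings to floats; returns nil (with a warning) if the value
    # cannot be interpreted as a number.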
    def self.numeric_val(dataset, compound_index, feature)
      v = dataset.data_entry_value(compound_index, feature)
      begin
        # Float() raises for non-numeric strings (String#to_f would silently return 0.0)
        v = Float(v) unless v==nil or v.is_a?(Numeric)
        v
      rescue
        $logger.warn "no numeric value for feature '#{feature}' : '#{v}'"
        nil
      end
    end
    
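    # Returns the index of the compound's class value within accept_values
    # (nil if no value is set); raises if the value is not an accepted value.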
    def self.classification_val(dataset, compound_index, feature, accept_values)
      v = dataset.data_entry_value(compound_index, feature)
      i = accept_values.index(v)
      internal_server_error "illegal class_value of prediction (value is '"+v.to_s+"'), accept values are "+
        accept_values.inspect unless v==nil or i!=nil
      i
    end
    private_class_method :numeric_val, :classification_val
  end
end