1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
|
module Lib
class PredictionData
# Truthy (the match offset) when ENV['RACK_ENV'] contains "debug" or "test", nil otherwise.
# When set, create additionally verifies that the compounds of the involved datasets are consistent.
CHECK_VALUES = ENV['RACK_ENV'] =~ /debug|test/
# Reduces the prediction arrays in data (:predicted_values, :actual_values,
# :confidence_values) to a subset and wraps the result in a new PredictionData.
# The data hash passed in is modified in place.
#
# Exactly one selection mode has to be given:
#   min_confidence      - keep entries whose confidence is >= this value
#   max_num_predictions - keep the first max_num_predictions entries
# min_num_predictions (only valid together with min_confidence, default 0) lets
# entries pass regardless of their confidence until enough have been kept.
# prediction_index, if given, restricts the selection to entries whose predicted
# value equals it. compounds may be nil; otherwise it is filtered alongside.
#
# Raises RuntimeError if the first confidence value is nil and
# OpenTox::BadRequestError on an invalid parameter combination.
def self.filter_data( data, compounds, min_confidence, min_num_predictions, max_num_predictions, prediction_index=nil )
  raise "cannot filter anything, no confidence values available" if data[:confidence_values][0].nil?
  # either both modes or neither mode was requested
  if min_confidence.nil? == max_num_predictions.nil?
    raise OpenTox::BadRequestError.new "please specify either min_confidence or max_num_predictions"
  end
  if min_confidence.nil? && !min_num_predictions.nil?
    raise OpenTox::BadRequestError.new "min_num_predictions only valid for min_confidence"
  end
  min_num_predictions = 0 if min_num_predictions.nil?

  LOGGER.debug("filtering predictions, conf:'#{min_confidence}' min_num_predictions: '#{min_num_predictions}' max_num_predictions: '#{max_num_predictions}' ")
  size_before = data[:predicted_values].size

  kept = []
  data[:confidence_values].each_index do |idx|
    next unless prediction_index.nil? || prediction_index == data[:predicted_values][idx]
    keep =
      if min_confidence.nil?
        kept.size < max_num_predictions
      else
        confidence = data[:confidence_values][idx]
        # NOTE(review): because of `<=`, min_num_predictions+1 entries pass regardless of
        # their confidence (one entry even with the default of 0) -- confirm this is intended
        kept.size <= min_num_predictions || (!confidence.nil? && confidence >= min_confidence)
      end
    kept << idx if keep
  end

  [ :predicted_values, :actual_values, :confidence_values ].each do |key|
    data[key] = kept.map { |idx| data[key][idx] }
  end
  new_compounds = compounds.nil? ? nil : kept.map { |idx| compounds[idx] }

  LOGGER.debug("filtered predictions remaining: #{data[:predicted_values].size}/#{size_before}")
  PredictionData.new(data, new_compounds)
end
# Hash of prediction arrays and meta data handed to the constructor
# (create fills it with :predicted_values, :actual_values, :confidence_values,
# :feature_type and :accept_values).
def data
  @data
end
# Compounds handed to the constructor; create passes them in the same order
# as the value arrays in data. May be nil (filter_data accepts nil compounds).
def compounds
  @compounds
end
# Loads actual values, predicted values and confidences for one or more
# test/prediction dataset pairs and bundles them in a PredictionData object.
#
# Each dataset/variable argument may be a single value or an array; the i-th
# entries of all arrays belong together (presumably one entry per
# crossvalidation fold, judging by the "accept values (in folds) differ" check).
#
#   feature_type             - "classification" or "regression"
#   test_dataset_uris        - dataset(s) providing the test compounds
#   test_target_dataset_uris - dataset(s) holding the actual values; nil, blank or
#                              equal to the test dataset means "use the test dataset"
#   prediction_feature       - feature holding the actual values (required)
#   prediction_dataset_uris  - dataset(s) holding the predictions
#   predicted_variables      - feature(s) holding the predicted values; a nil entry
#                              falls back to prediction_feature
#   predicted_confidences    - feature(s) holding the confidences; a nil entry
#                              yields nil confidence values
#   subjectid                - passed through to Lib::DatasetCache.find
#   task                     - optional object that receives progress(percent) calls
#
# Returns a PredictionData whose data hash has the keys :predicted_values,
# :actual_values, :confidence_values, :feature_type and :accept_values. If at
# least one confidence value is available, all arrays (and the compounds) are
# ordered by descending confidence, nil confidences counting as 0.
#
# Raises RuntimeError if prediction_feature is missing, feature_type is unknown,
# a dataset or feature cannot be found, or the datasets are inconsistent.
def self.create( feature_type, test_dataset_uris, test_target_dataset_uris,
  prediction_feature, prediction_dataset_uris, predicted_variables, predicted_confidences,
  subjectid=nil, task=nil )

  # scalar arguments are treated as one-element lists
  test_dataset_uris = [test_dataset_uris] unless test_dataset_uris.is_a?(Array)
  test_target_dataset_uris = [test_target_dataset_uris] unless test_target_dataset_uris.is_a?(Array)
  prediction_dataset_uris = [prediction_dataset_uris] unless prediction_dataset_uris.is_a?(Array)
  predicted_variables = [predicted_variables] unless predicted_variables.is_a?(Array)
  predicted_confidences = [predicted_confidences] unless predicted_confidences.is_a?(Array)

  LOGGER.debug "loading prediction -- test-dataset: "+test_dataset_uris.inspect
  LOGGER.debug "loading prediction -- test-target-datset: "+test_target_dataset_uris.inspect
  LOGGER.debug "loading prediction -- prediction-dataset: "+prediction_dataset_uris.inspect
  LOGGER.debug "loading prediction -- predicted_variable: "+predicted_variables.inspect
  LOGGER.debug "loading prediction -- predicted_confidence: "+predicted_confidences.inspect
  LOGGER.debug "loading prediction -- prediction_feature: "+prediction_feature.to_s

  raise "prediction_feature missing" unless prediction_feature
  # Fail fast: with any other feature_type no actual/predicted values would be
  # collected below and the resulting arrays would be silently misaligned.
  unless ["classification", "regression"].include?(feature_type)
    raise "unknown feature_type '"+feature_type.to_s+"', expected 'classification' or 'regression'"
  end

  all_compounds = []
  all_predicted_values = []
  all_actual_values = []
  all_confidence_values = []
  accept_values = nil

  if task
    # two progress steps per dataset pair plus a final one;
    # integer division, so the step rounds down (to 0 for 50 or more pairs)
    task_step = 100 / (test_dataset_uris.size*2 + 1)
    task_status = 0
  end

  test_dataset_uris.each_with_index do |test_dataset_uri, i|
    test_target_dataset_uri = test_target_dataset_uris[i]
    prediction_dataset_uri = prediction_dataset_uris[i]
    predicted_variable = predicted_variables[i]
    predicted_variable = prediction_feature if predicted_variable.nil?
    predicted_confidence = predicted_confidences[i]

    test_dataset = Lib::DatasetCache.find test_dataset_uri,subjectid
    raise "test dataset not found: '"+test_dataset_uri.to_s+"'" unless test_dataset

    if test_target_dataset_uri.nil? || test_target_dataset_uri.strip.empty? || test_target_dataset_uri==test_dataset_uri
      # no separate target dataset: the actual values have to be part of the test dataset
      test_target_dataset_uri = test_dataset_uri
      test_target_dataset = test_dataset
      raise "prediction_feature not found in test_dataset, specify a test_target_dataset\n"+
        "prediction_feature: '"+prediction_feature.to_s+"'\n"+
        "test_dataset: '"+test_target_dataset_uri.to_s+"'\n"+
        "available features are: "+test_target_dataset.features.inspect unless test_target_dataset.features.keys.include?(prediction_feature)
    else
      test_target_dataset = Lib::DatasetCache.find test_target_dataset_uri,subjectid
      raise "test target datset not found: '"+test_target_dataset_uri.to_s+"'" unless test_target_dataset
      if CHECK_VALUES
        test_dataset.compounds.each do |c|
          raise "test compound not found on test class dataset "+c.to_s unless test_target_dataset.compounds.include?(c)
        end
      end
      raise "prediction_feature not found in test_target_dataset\n"+
        "prediction_feature: '"+prediction_feature.to_s+"'\n"+
        "test_target_dataset: '"+test_target_dataset_uri.to_s+"'\n"+
        "available features are: "+test_target_dataset.features.inspect unless test_target_dataset.features.keys.include?(prediction_feature)
    end

    compounds = test_dataset.compounds
    LOGGER.debug "test dataset size: "+compounds.size.to_s
    raise "test dataset is empty "+test_dataset_uri.to_s unless compounds.size>0

    if feature_type=="classification"
      av = test_target_dataset.accept_values(prediction_feature)
      raise "'"+OT.acceptValue.to_s+"' missing/invalid for feature '"+prediction_feature.to_s+"' in dataset '"+
        test_target_dataset_uri.to_s+"', acceptValues are: '"+av.inspect+"'" if av.nil? || av.length<2
      if accept_values.nil?
        accept_values = av
      elsif av!=accept_values
        # class values are stored as indices into accept_values, so all datasets must agree
        raise "accept values (in folds) differ "+av.inspect+" != "+accept_values.inspect
      end
    end

    actual_values = compounds.map do |c|
      case feature_type
      when "classification"
        classification_val(test_target_dataset, c, prediction_feature, accept_values)
      when "regression"
        regression_val(test_target_dataset, c, prediction_feature)
      end
    end
    task.progress( task_status += task_step ) if task # loaded actual values

    prediction_dataset = Lib::DatasetCache.find prediction_dataset_uri,subjectid
    raise "prediction dataset not found: '"+prediction_dataset_uri.to_s+"'" unless prediction_dataset
    predicted_compounds = prediction_dataset.compounds

    # allow missing prediction feature if there are no compounds in the prediction dataset
    raise "predicted_variable not found in prediction_dataset\n"+
      "predicted_variable '"+predicted_variable.to_s+"'\n"+
      "prediction_dataset: '"+prediction_dataset_uri.to_s+"'\n"+
      "available features are: "+prediction_dataset.features.inspect if !prediction_dataset.features.keys.include?(predicted_variable) && predicted_compounds.size>0
    raise "predicted_confidence not found in prediction_dataset\n"+
      "predicted_confidence '"+predicted_confidence.to_s+"'\n"+
      "prediction_dataset: '"+prediction_dataset_uri.to_s+"'\n"+
      "available features are: "+prediction_dataset.features.inspect if predicted_confidence && !prediction_dataset.features.keys.include?(predicted_confidence) && predicted_compounds.size>0

    raise "more predicted than test compounds, #test: "+compounds.size.to_s+" < #prediction: "+
      predicted_compounds.size.to_s+", test-dataset: "+test_dataset_uri.to_s+", prediction-dataset: "+
      prediction_dataset_uri.to_s if compounds.size < predicted_compounds.size
    if CHECK_VALUES
      predicted_compounds.each do |predicted|
        raise "predicted compound not found in test dataset:\n"+predicted.to_s+"\ntest-compounds:\n"+
          compounds.collect{|c| c.to_s}.join("\n") unless compounds.include?(predicted)
      end
    end

    predicted_values = []
    confidence_values = []
    compounds.each do |c|
      unless predicted_compounds.include?(c)
        # compound was not predicted: keep its slot so all arrays stay aligned
        predicted_values << nil
        confidence_values << nil
        next
      end
      case feature_type
      when "classification"
        predicted_values << classification_val(prediction_dataset, c, predicted_variable, accept_values)
      when "regression"
        predicted_values << regression_val(prediction_dataset, c, predicted_variable)
      end
      confidence_values << (predicted_confidence ? confidence_val(prediction_dataset, c, predicted_confidence) : nil)
    end

    all_compounds += compounds
    all_predicted_values += predicted_values
    all_actual_values += actual_values
    all_confidence_values += confidence_values
    task.progress( task_status += task_step ) if task # loaded predicted values and confidence
  end

  # sort according to confidence if available (highest first, nil counts as 0)
  unless all_confidence_values.compact.empty?
    rows = all_predicted_values.zip(all_actual_values, all_confidence_values, all_compounds)
    rows = rows.sort_by{ |row| row[2] || 0 }.reverse
    all_predicted_values, all_actual_values, all_confidence_values, all_compounds = rows.transpose
  end

  raise "illegal num compounds "+all_compounds.size.to_s+" != "+all_predicted_values.size.to_s if
    all_compounds.size != all_predicted_values.size
  task.progress(100) if task # done with the mathematics

  data = { :predicted_values => all_predicted_values, :actual_values => all_actual_values, :confidence_values => all_confidence_values,
    :feature_type => feature_type, :accept_values => accept_values }
  PredictionData.new(data, all_compounds)
end
private
# Stores the prediction data hash and the matching compounds without copying
# them. Instances are built by the class methods create and filter_data.
def initialize( data, compounds )
  @data, @compounds = data, compounds
end
# NOTE(review): a bare `private` only affects instance methods; the helpers below
# are defined with `def self.` and therefore stay publicly callable. Making them
# private would need private_class_method -- confirm no external caller relies on them first.
private
# Looks up the value of feature for compound in dataset and returns it as a number.
#
# Returns nil if there is no value, a Numeric unchanged, and anything else
# converted with Kernel#Float. Values that are not numeric (e.g. "abc") are
# logged and reported as nil; String#to_f would silently turn them into 0.0.
def self.regression_val(dataset, compound, feature)
  v = value(dataset, compound, feature)
  return v if v.nil? || v.is_a?(Numeric)
  begin
    Float(v)
  rescue StandardError
    LOGGER.warn "no numeric value for regression: '"+v.to_s+"'"
    nil
  end
end
# Looks up the confidence feature for compound in dataset and returns it as a number.
#
# Returns nil if there is no value, a Numeric unchanged, and anything else
# converted with Kernel#Float. Values that are not numeric are logged and
# reported as nil; String#to_f would silently turn them into a confidence of 0.0.
def self.confidence_val(dataset, compound, confidence)
  v = value(dataset, compound, confidence)
  return v if v.nil? || v.is_a?(Numeric)
  begin
    Float(v)
  rescue StandardError
    LOGGER.warn "no numeric value for confidence '"+v.to_s+"'"
    nil
  end
end
# Translates the class value of feature for compound into its position within
# accept_values. The lookup uses the string form of the value.
#
# Returns the index, or nil if the dataset holds no value.
# Raises RuntimeError if a value is present but not among accept_values.
def self.classification_val(dataset, compound, feature, accept_values)
  class_value = value(dataset, compound, feature)
  position = accept_values.index(class_value.to_s)
  if position.nil? && !class_value.nil?
    raise "illegal class_value of prediction (value is '"+class_value.to_s+"'), accept values are "+
      accept_values.inspect
  end
  position
end
# Reads the single value stored for compound and feature in dataset.data_entries.
# If feature is nil, the first feature entry of the compound is used.
#
# Entries are expected to be arrays. Duplicates are collapsed without touching
# the array stored in the dataset. Several distinct values cannot be resolved:
# they are logged and nil is returned.
#
# Returns the value, or nil if the compound or feature is missing, the entry is
# empty, the values conflict, or the value's string form is empty.
# Raises RuntimeError if the entry is not an Array or contains a nested Array.
def self.value(dataset, compound, feature)
  entry = dataset.data_entries[compound]
  return nil if entry.nil?
  values = feature.nil? ? entry.values[0] : entry[feature]
  return nil if values.nil?
  raise "no array "+values.class.to_s+" : '"+values.to_s+"'" unless values.is_a?(Array)

  distinct = values.uniq
  if distinct.size>1
    # log the conflicting values themselves before giving up on them
    LOGGER.warn "not yet implemented: multiple non-equal values "+compound.to_s+" "+distinct.inspect
    return nil
  end

  v = distinct[0] # nil for an empty array
  raise "array" if v.is_a?(Array)
  v = nil if v.to_s.empty?
  v
end
end
end
|