1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
|
require "lib/predictions.rb"
module Lib
class OTPredictions < Predictions
# Truthy (the match offset) when the RACK_ENV environment variable contains
# "debug" or "test", nil otherwise. Gates the extra per-compound consistency
# checks that initialize runs on the loaded datasets.
CHECK_VALUES = ENV['RACK_ENV'] =~ /debug|test/
# Identifier of the instance at +instance_index+; simply delegates to
# #compound, so the identifier is whatever is stored in the compound list.
def identifier(instance_index)
  compound(instance_index)
end
# Looks up the entry of @compounds stored at +instance_index+.
def compound(instance_index)
  @compounds[instance_index]
end
# Loads the actual values, predicted values and confidences of a validation
# from OpenTox datasets and hands them to the superclass constructor.
#
# feature_type:: "classification" or "regression" (the only two values the
#                case statements below handle)
# test_dataset_uri:: URI of the dataset whose compounds are evaluated
# test_target_dataset_uri:: URI of the dataset holding the actual values; when
#                           nil, blank or equal to test_dataset_uri the test
#                           dataset itself is used
# prediction_feature:: feature key of the actual values
# prediction_dataset_uri:: URI of the dataset holding the predictions
# predicted_variable:: feature key of the predicted values; defaults to
#                      prediction_feature when nil
# subjectid:: passed through to OpenTox::Dataset.find
#             (presumably an authorization token -- TODO confirm)
# task:: optional; receives progress(40), progress(80) and progress(100)
#
# Raises RuntimeError (plain string raise) when a dataset cannot be found, the
# prediction feature is missing, the test dataset is empty, classification
# accept values are missing or fewer than two, the prediction dataset holds
# more compounds than the test dataset, or the superclass ends up with a
# different number of predicted values than there are compounds.
def initialize(feature_type, test_dataset_uri, test_target_dataset_uri,
prediction_feature, prediction_dataset_uri, predicted_variable, subjectid=nil, task=nil)
# NOTE(review): the message below contains typos ("prediciton", "datset") and
# an unbalanced quote after prediction_feature; it is a runtime string, so it
# is left untouched here.
LOGGER.debug("loading prediciton via test-dataset:'"+test_dataset_uri.to_s+
"', test-target-datset:'"+test_target_dataset_uri.to_s+
"', prediction-dataset:'"+prediction_dataset_uri.to_s+
"', prediction_feature: '"+prediction_feature.to_s+"' "+
"', predicted_variable: '"+predicted_variable.to_s+"'")
# without an explicit predicted variable, predictions are looked up under the
# same feature key as the actual values
predicted_variable=prediction_feature if predicted_variable==nil
test_dataset = OpenTox::Dataset.find test_dataset_uri,subjectid
raise "test dataset not found: '"+test_dataset_uri.to_s+"'" unless test_dataset
raise "prediction_feature missing" unless prediction_feature
# Decide where the actual values come from: the test dataset itself, or a
# separate test target dataset.
if test_target_dataset_uri == nil || test_target_dataset_uri.strip.size==0 || test_target_dataset_uri==test_dataset_uri
test_target_dataset_uri = test_dataset_uri
test_target_dataset = test_dataset
raise "prediction_feature not found in test_dataset, specify a test_target_dataset\n"+
"prediction_feature: '"+prediction_feature.to_s+"'\n"+
"test_dataset: '"+test_target_dataset_uri.to_s+"'\n"+
"available features are: "+test_target_dataset.features.inspect if test_target_dataset.features.keys.index(prediction_feature)==nil
else
test_target_dataset = OpenTox::Dataset.find test_target_dataset_uri,subjectid
raise "test target datset not found: '"+test_target_dataset_uri.to_s+"'" unless test_target_dataset
# debug/test only: every test compound must also exist in the target dataset
if CHECK_VALUES
test_dataset.compounds.each do |c|
raise "test compound not found on test class dataset "+c.to_s unless test_target_dataset.compounds.include?(c)
end
end
raise "prediction_feature not found in test_target_dataset\n"+
"prediction_feature: '"+prediction_feature.to_s+"'\n"+
"test_target_dataset: '"+test_target_dataset_uri.to_s+"'\n"+
"available features are: "+test_target_dataset.features.inspect if test_target_dataset.features.keys.index(prediction_feature)==nil
end
# the compound order of the test dataset defines the instance order of all
# value lists built below
@compounds = test_dataset.compounds
LOGGER.debug "test dataset size: "+@compounds.size.to_s
raise "test dataset is empty "+test_dataset_uri.to_s unless @compounds.size>0
# classification needs the list of accept values (at least two) stored on the
# feature; regression works without accept values
if feature_type=="classification"
accept_values = test_target_dataset.features[prediction_feature][OT.acceptValue]
raise "'"+OT.acceptValue.to_s+"' missing/invalid for feature '"+prediction_feature.to_s+"' in dataset '"+
test_target_dataset_uri.to_s+"', acceptValues are: '"+accept_values.inspect+"'" if accept_values==nil or accept_values.length<2
else
accept_values=nil
end
# Collect one actual value per test compound.
# NOTE(review): a feature_type other than "classification"/"regression" adds
# nothing here, so actual_values stays empty -- confirm this is intended.
actual_values = []
@compounds.each do |c|
case feature_type
when "classification"
actual_values << classification_value(test_target_dataset, c, prediction_feature, accept_values)
when "regression"
actual_values << regression_value(test_target_dataset, c, prediction_feature)
end
end
task.progress(40) if task # loaded actual values
prediction_dataset = OpenTox::Dataset.find prediction_dataset_uri,subjectid
raise "prediction dataset not found: '"+prediction_dataset_uri.to_s+"'" unless prediction_dataset
# TODO: remove LAZAR_PREDICTION_DATASET_HACK
# If the predicted variable is not a feature of the prediction dataset, this
# is tolerated (with a warning) only when every test compound that has data
# entries has exactly one; nil is then passed as feature further down so the
# compound's first stored entry is used as its prediction.
no_prediction_feature = prediction_dataset.features.keys.index(predicted_variable)==nil
if no_prediction_feature
one_entry_per_compound = true
@compounds.each do |c|
if prediction_dataset.data_entries[c] and prediction_dataset.data_entries[c].size != 1
one_entry_per_compound = false
break
end
end
# NOTE(review): String#+ raises TypeError here if predicted_variable is not a
# String, which would hide the intended message.
msg = "prediction-feature not found: '"+predicted_variable+"' in prediction-dataset: "+prediction_dataset_uri.to_s+", available features: "+
prediction_dataset.features.keys.inspect
if one_entry_per_compound
LOGGER.warn msg
else
raise msg
end
end
# the prediction dataset may cover fewer compounds than the test dataset
# (missing ones become nil below), but never more
raise "more predicted than test compounds test:"+@compounds.size.to_s+" < prediction:"+
prediction_dataset.compounds.size.to_s if @compounds.size < prediction_dataset.compounds.size
# debug/test only: every predicted compound must be one of the test compounds
if CHECK_VALUES
prediction_dataset.compounds.each do |c|
# NOTE(review): "+c+" requires c to be a String; the inner block parameter
# also reuses the name c.
raise "predicted compound not found in test dataset:\n"+c+"\ntest-compounds:\n"+
@compounds.collect{|c| c.to_s}.join("\n") if @compounds.index(c)==nil
end
end
# Walk the test compounds in the same order as for actual_values. Compounds
# absent from the prediction dataset get nil for both prediction and confidence.
predicted_values = []
confidence_values = []
@compounds.each do |c|
if prediction_dataset.compounds.index(c)==nil
predicted_values << nil
confidence_values << nil
else
case feature_type
when "classification"
# TODO: remove LAZAR_PREDICTION_DATASET_HACK
predicted_values << classification_value(prediction_dataset, c, no_prediction_feature ? nil : predicted_variable, accept_values)
when "regression"
predicted_values << regression_value(prediction_dataset, c, no_prediction_feature ? nil : predicted_variable)
end
# TODO confidence_values << prediction_dataset.get_prediction_confidence(c, predicted_variable)
# Best effort: confidence defaults to 1 and is replaced by the OT.confidence
# entry found in the metadata of the feature that is the first key of the
# compound's data entries. Any StandardError is only logged.
conf = 1
begin
feature = prediction_dataset.data_entries[c].keys[0]
feature_data = prediction_dataset.features[feature]
conf = feature_data[OT.confidence] if feature_data[OT.confidence]!=nil
rescue
LOGGER.warn "could not get confidence"
end
confidence_values << conf
end
end
task.progress(80) if task # loaded predicted values and confidence
super(predicted_values, actual_values, confidence_values, feature_type, accept_values)
# num_info and @predicted_values are not defined in this file; presumably
# both come from the Predictions superclass -- verify.
raise "illegal num compounds "+num_info if @compounds.size != @predicted_values.size
task.progress(100) if task # done with the mathmatics
end
private
# Reads the value of +feature+ for +compound+ and converts it to a number.
#
# dataset, compound, feature:: passed unchanged to #value
#
# Returns nil when there is no value, the value itself when it already is a
# Numeric, and otherwise the result of a strict Float() conversion. If the
# conversion fails (e.g. for "abc"), a warning is logged and nil is returned.
#
# Float() is used instead of to_f because String#to_f never fails: it silently
# turned any non-numeric string into 0.0, so the warning below could not be
# reached for strings and bogus zeros ended up in the regression statistics.
def regression_value(dataset, compound, feature)
  raw = value(dataset, compound, feature)
  return raw if raw.nil? || raw.is_a?(Numeric)

  begin
    Float(raw)
  rescue ArgumentError, TypeError
    LOGGER.warn "no numeric value for regression: '"+raw.to_s+"'"
    nil
  end
end
# Reads the value of +feature+ for +compound+ and maps it to its position in
# +accept_values+.
#
# dataset, compound, feature:: passed unchanged to #value
# accept_values:: list of allowed class values; compared against value.to_s
#
# Returns the index of the value within accept_values, or nil when the
# compound has no value. Raises RuntimeError when a value is present but is
# not one of the accept values.
def classification_value(dataset, compound, feature, accept_values)
  raw = value(dataset, compound, feature)
  # A missing value must never be looked up: nil.to_s is "", which would match
  # an empty-string accept value and turn "no value" into a class index.
  return nil if raw.nil?

  class_index = accept_values.index(raw.to_s)
  if class_index.nil?
    raise "illegal class_value of prediction (value is '"+raw.to_s+"'), accept values are "+
      accept_values.inspect
  end
  class_index
end
# Extracts the single value stored for +compound+ in +dataset+.
#
# dataset:: object whose data_entries maps compound => { feature => [values] }
# compound:: key into dataset.data_entries
# feature:: feature key, or nil to use the first feature stored for the compound
#
# Returns nil when the compound has no entries, the feature has no value list,
# the list is empty, the list holds several different values (a warning is
# logged), or the value's string form is empty. Otherwise returns the value.
# Raises RuntimeError when the stored entry is not an Array, or when the single
# value is itself an Array.
def value(dataset, compound, feature)
  entries = dataset.data_entries[compound]
  return nil if entries.nil?

  values = feature.nil? ? entries.values[0] : entries[feature]
  return nil if values.nil?
  raise "no array "+values.class.to_s+" : '"+values.to_s+"'" unless values.is_a?(Array)

  # uniq rather than uniq!, so the value list stored in the dataset is not
  # modified as a side effect of reading it
  distinct = values.uniq
  if distinct.size > 1
    # log the conflicting values before giving up on them
    LOGGER.warn "not yet implemented: multiple non-equal values "+compound.to_s+" "+distinct.inspect
    return nil
  end

  single = distinct[0] # nil for an empty list
  raise "array" if single.is_a?(Array)
  single.to_s.empty? ? nil : single
end
public
# Builds a hash of statistic name => value for this prediction object.
# The names come from Validation::VAL_CLASS_PROPS for classification and
# Validation::VAL_REGR_PROPS for regression; each name is invoked on self via
# send. Any other @feature_type produces an empty hash.
def compute_stats
  property_names =
    case @feature_type
    when "classification" then Validation::VAL_CLASS_PROPS
    when "regression" then Validation::VAL_REGR_PROPS
    else []
    end

  stats = {}
  property_names.each do |name|
    stats[name] = send(name)
  end
  stats
end
# Table representation of this prediction object alone: wraps self in a
# one-element list and delegates to OTPredictions.to_array with its defaults.
def to_array
  OTPredictions.to_array([self])
end
# Flattens prediction objects into a table: an Array of rows preceded by a
# header row.
#
# predictions:: list of prediction objects
# add_pic:: when true, every row starts with an image link built from the
#           compound identifier and the header starts with "compound"
# format:: when true, actual, predicted and confidence values are rendered
#          with to_nice_s
#
# Columns: [compound,] actual value, predicted value, [missclassified,]
# [confidence value,] compound-uri. The "missclassified" cell is 1 or 0 and nil
# when either value is missing. The optional header columns are decided by the
# first prediction object; an empty +predictions+ list yields the header only.
#
# The image column is emitted only when add_pic is set, so rows and header
# always have matching columns. Previously the rows always carried it while
# the header did not unless add_pic was true.
def self.to_array( predictions, add_pic=false, format=false )
  rows = []
  predictions.each do |p|
    (0...p.num_instances).each do |i|
      actual = p.actual_value(i)
      predicted = p.predicted_value(i)

      row = []
      row << p.identifier(i)+"?media=image/png" if add_pic
      row << (format ? actual.to_nice_s : actual)
      row << (format ? predicted.to_nice_s : predicted)
      if p.feature_type=="classification"
        if predicted.nil? || actual.nil?
          row << nil
        else
          row << (p.classification_miss?(i) ? 1 : 0)
        end
      end
      if p.confidence_values_available?
        confidence = p.confidence_value(i)
        row << (format ? confidence.to_nice_s : confidence)
      end
      row << p.identifier(i)
      rows << row
    end
  end

  # the first prediction decides the optional columns; it is nil for an empty list
  first = predictions[0]
  header = []
  header << "compound" if add_pic
  header << "actual value"
  header << "predicted value"
  header << "missclassified" if first && first.feature_type=="classification"
  header << "confidence value" if first && first.confidence_values_available?
  header << "compound-uri"
  rows.insert(0, header)
  rows
end
end
end
|