1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
|
module OpenTox
class Validation
  # Stored attributes of a validation run.
  # NOTE(review): `field` is not defined in this section; presumably it is the
  # Mongoid field macro mixed in where the class is set up - confirm.
  field :model_id, type: BSON::ObjectId
  field :prediction_dataset_id, type: BSON::ObjectId
  field :crossvalidation_id, type: BSON::ObjectId
  field :test_dataset_id, type: BSON::ObjectId
  field :nr_instances, type: Integer
  field :nr_unpredicted, type: Integer
  # Array of [compound_id, measured_activities, prediction, confidence] rows,
  # as assembled by Validation.create.
  field :predictions, type: Array

  # @return the Dataset looked up by prediction_dataset_id
  def prediction_dataset
    Dataset.find prediction_dataset_id
  end

  # @return the Dataset looked up by test_dataset_id
  def test_dataset
    Dataset.find test_dataset_id
  end

  # @return the Model::Lazar looked up by model_id
  def model
    Model::Lazar.find model_id
  end

  # Builds a copy of +model+ on +training_set+, predicts the compounds of
  # +test_set+ with it and stores the outcome as a new validation record.
  #
  # @param model the model whose class and attributes are reused for the validation model
  # @param training_set dataset passed to the model class' +create+
  # @param test_set dataset providing the compounds to predict and their measured activities
  # @param crossvalidation optional owner; when given, its id is stored in crossvalidation_id
  # @return the saved validation object (an instance of the receiving class)
  def self.create(model, training_set, test_set, crossvalidation = nil)
    atts = model.attributes.dup # do not modify attributes from original model
    atts["_id"] = BSON::ObjectId.new
    atts[:training_dataset_id] = training_set.id
    validation_model = model.class.create training_set, atts
    validation_model.save

    cids = test_set.compound_ids
    # remove duplicates and make sure that activities cannot be used
    test_set_without_activities = Dataset.new(:compound_ids => cids.uniq)
    prediction_dataset = validation_model.predict test_set_without_activities

    # Index the test-set rows by compound once, instead of rescanning the
    # whole compound list for every predicted compound.
    rows_by_compound = cids.each_index.group_by { |row| cids[row] }
    measured_entries = test_set.data_entries
    predicted_compound_ids = prediction_dataset.compound_ids

    predictions = []
    nr_unpredicted = 0
    prediction_dataset.data_entries.each_with_index do |de, i|
      prediction = de[0]
      if prediction
        cid = predicted_compound_ids[i]
        # A compound may occur in several test-set rows; collect every measurement.
        activities = rows_by_compound.fetch(cid, []).collect { |row| measured_entries[row][0] }
        predictions << [cid, activities, prediction, de[1]]
      else
        nr_unpredicted += 1
      end
    end

    # Predictions are stored in prediction-dataset order (not sorted by confidence).
    validation = new(
      :model_id => validation_model.id,
      :prediction_dataset_id => prediction_dataset.id,
      :test_dataset_id => test_set.id,
      :nr_instances => cids.size,
      :nr_unpredicted => nr_unpredicted,
      :predictions => predictions
    )
    validation.crossvalidation_id = crossvalidation.id if crossvalidation
    validation.save
    validation
  end
end
# Validation subclass that defines no fields or methods of its own here.
# NOTE(review): presumably the record type for classification models - confirm.
class ClassificationValidation < Validation; end
class RegressionValidation < Validation
  # Computes error statistics over the stored predictions.
  #
  # Each prediction row is [compound_id, activities, prediction, confidence].
  # Rows without measured activities or without a prediction are reported via
  # +warnings+ and the debug log, and are left out of every statistic, so they
  # no longer inflate the denominators or break the median lookup.
  # Errors are differences of log10 values (prediction vs. median activity).
  # NOTE(review): +warnings+ and Array#median are not defined in this section;
  # they are assumed to be provided elsewhere in the project.
  #
  # @return [Hash] "R^2", "RMSE" and "MAE"; all three are nil when no row
  #   could be evaluated
  def statistics
    pairs = []
    predictions.each do |compound_id, activity, prediction, _confidence|
      if activity && !activity.empty? && prediction
        pairs << [activity.median, prediction]
      else
        message = "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
        warnings << message
        $logger.debug message
      end
    end

    # Nothing to evaluate: avoid dividing by zero and calling R on empty vectors.
    return { "R^2" => nil, "RMSE" => nil, "MAE" => nil } if pairs.empty?

    errors = pairs.collect { |measured, predicted| Math.log10(predicted) - Math.log10(measured) }
    count = errors.size
    mae = errors.inject(0.0) { |sum, error| sum + error.abs } / count
    rmse = Math.sqrt(errors.inject(0.0) { |sum, error| sum + error**2 } / count)

    R.assign "measurement", pairs.collect { |measured, _| measured }
    R.assign "prediction", pairs.collect { |_, predicted| predicted }
    R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
    r = R.eval("r").to_ruby
    # NOTE(review): the correlation may be undefined (e.g. a single pair);
    # assuming that comes back as nil from to_ruby - confirm.
    r_squared = r ? r**2 : nil

    { "R^2" => r_squared, "RMSE" => rmse, "MAE" => mae }
  end
end
end
|