1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
|
require_relative "setup.rb"
# Cross-validation and leave-one-out validation tests for lazar
# classification and regression models.
#
# Each test loads a CSV dataset from DATA_DIR, creates a model from it, runs a
# validation on that model and checks the reported statistics against fixed
# thresholds.
class ValidationTest < Minitest::Test

  # Cross-validation of an fminer classification model.
  # NOTE(review): skipped unconditionally; the reason is not recorded here.
  def test_fminer_crossvalidation
    skip
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    model = Model::LazarFminerClassification.create dataset
    cv = ClassificationCrossValidation.create model
    refute_empty cv.validation_ids
    assert_operator cv.accuracy, :>, 0.8, "Crossvalidation accuracy lower than 0.8"
    assert_operator cv.weighted_accuracy, :>, cv.accuracy, "Weighted accuracy (#{cv.weighted_accuracy}) should be larger than unweighted accuracy (#{cv.accuracy})"
  end

  # Cross-validation of a classification model created without extra parameters.
  def test_classification_crossvalidation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    model = Model::LazarClassification.create dataset
    cv = ClassificationCrossValidation.create model
    #p cv
    assert_operator cv.accuracy, :>, 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7"
    # Manual inspection of the confidence plot:
    #File.open("tmp.svg","w+"){|f| f.puts cv.confidence_plot}
    #`inkview tmp.svg`
    p cv.nr_unpredicted
    p cv.accuracy
    assert_operator cv.weighted_accuracy, :>, cv.accuracy, "Weighted accuracy (#{cv.weighted_accuracy}) should be larger than unweighted accuracy (#{cv.accuracy})"
  end

  # Cross-validation of a regression model created without extra parameters.
  def test_default_regression_crossvalidation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
    model = Model::LazarRegression.create dataset
    cv = RegressionCrossValidation.create model
    #cv = RegressionCrossValidation.find '561503262b72ed54fd000001'
    p cv
    # Manual inspection of the correlation and confidence plots:
    #File.open("tmp.svg","w+"){|f| f.puts cv.correlation_plot}
    #`inkview tmp.svg`
    #File.open("tmp.svg","w+"){|f| f.puts cv.confidence_plot}
    #`inkview tmp.svg`
    #puts cv.misclassifications.to_yaml
    p cv.rmse
    p cv.weighted_rmse
    assert_operator cv.rmse, :<, 1.5, "RMSE > 1.5"
    # Disabled: the weighted statistics are only printed, not asserted.
    #assert cv.weighted_rmse < cv.rmse, "Weighted RMSE (#{cv.weighted_rmse}) larger than unweighted RMSE(#{cv.rmse}) "
    p cv.mae
    p cv.weighted_mae
    assert_operator cv.mae, :<, 1
    #assert cv.weighted_mae < cv.mae
  end

  # Cross-validation of a regression model with explicit neighbor parameters;
  # also checks that each validation's model carries the requested parameters.
  def test_regression_crossvalidation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
    #dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.csv"
    params = {
      :prediction_algorithm => "OpenTox::Algorithm::Regression.weighted_average",
      :neighbor_algorithm => "fingerprint_neighbors",
      :neighbor_algorithm_parameters => {
        :type => "MACCS",
        :min_sim => 0.7,
      }
    }
    model = Model::LazarRegression.create dataset, params
    cv = RegressionCrossValidation.create model
    expected = params[:neighbor_algorithm_parameters]
    cv.validation_ids.each do |vid|
      # Separate local so the model under test is not overwritten in the loop.
      validation_model = Model::Lazar.find(Validation.find(vid).model_id)
      actual = validation_model[:neighbor_algorithm_parameters]
      assert_equal expected[:type], actual[:type]
      assert_equal expected[:min_sim], actual[:min_sim]
      # NOTE(review): the params literal above has no :training_dataset_id, so
      # unless Model::LazarRegression.create adds one to the hash it was given,
      # the left side is nil and this only checks that the validation model
      # reports a non-nil training dataset id.
      refute_equal expected[:training_dataset_id], actual[:training_dataset_id]
    end
    assert_operator cv.rmse, :<, 1.5, "RMSE (#{cv.rmse}) should be smaller than 1.5"
    assert_operator cv.mae, :<, 1
  end

  # Cross-validation of a regression model using local PLS regression as the
  # prediction algorithm.
  def test_pls_regression_crossvalidation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
    params = { :prediction_algorithm => "OpenTox::Algorithm::Regression.local_pls_regression", }
    model = Model::LazarRegression.create dataset, params
    cv = RegressionCrossValidation.create model
    p cv.nr_instances
    p cv.nr_unpredicted
    assert_operator cv.rmse, :<, 1.5, "RMSE > 1.5"
    assert_operator cv.mae, :<, 1
  end

  # Repeated cross-validation: every contained cross-validation has to meet
  # the accuracy thresholds.
  def test_repeated_crossvalidation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    model = Model::LazarClassification.create dataset
    repeated_cv = RepeatedCrossValidation.create model
    repeated_cv.crossvalidations.each do |cv|
      assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
      assert_operator cv.weighted_accuracy, :>, cv.accuracy
    end
  end

  # The neighbor algorithm parameters of the model must match those of every
  # validation model, apart from the training dataset id.
  def test_crossvalidation_parameters
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    params = {
      :neighbor_algorithm_parameters => {
        :min_sim => 0.3,
        :type => "FP3"
      }
    }
    model = Model::LazarClassification.create dataset, params
    model.save
    cv = ClassificationCrossValidation.create model
    expected_params = model.neighbor_algorithm_parameters
    expected_params.delete :training_dataset_id
    # The validation side is compared with string keys, so convert symbols.
    expected_params = Hash[expected_params.map { |key, value| [key.to_s, value] }]
    cv.validations.each do |validation|
      validation_params = validation.model.neighbor_algorithm_parameters
      validation_params.delete "training_dataset_id"
      assert_equal expected_params, validation_params
    end
  end

  # Cross-validation of a regression model that finds neighbors via scaled
  # physchem descriptors.
  # NOTE(review): skipped unconditionally; the reason is not recorded here.
  def test_physchem_regression_crossvalidation
    skip
    descriptors = OpenTox::Algorithm::Descriptor::OBDESCRIPTORS.keys
    refute_empty descriptors
    # UPLOAD DATA
    training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
    feature_dataset = Algorithm::Descriptor.physchem training_dataset, descriptors
    feature_dataset.save
    scaled_feature_dataset = feature_dataset.scale
    scaled_feature_dataset.save
    model = Model::LazarRegression.create training_dataset
    model.neighbor_algorithm = "physchem_neighbors"
    model.neighbor_algorithm_parameters = {
      :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.physchem",
      :descriptors => descriptors,
      :feature_dataset_id => scaled_feature_dataset.id,
      :min_sim => 0.3
    }
    model.save
    cv = RegressionCrossValidation.create model
    p cv
  end

  # Leave-one-out validation of a classification model.
  def test_classification_loo_validation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    model = Model::LazarClassification.create dataset
    loo = ClassificationLeaveOneOutValidation.create model
    assert_equal 14, loo.nr_unpredicted
    refute_empty loo.confusion_matrix
    assert_operator loo.accuracy, :>, 0.77
    assert_operator loo.weighted_accuracy, :>, 0.85
    assert_operator loo.accuracy, :<, loo.weighted_accuracy
  end

  # Leave-one-out validation of a regression model.
  def test_regression_loo_validation
    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
    model = Model::LazarRegression.create dataset
    loo = RegressionLeaveOneOutValidation.create model
    assert_equal 11, loo.nr_unpredicted
    assert_operator loo.weighted_mae, :<, loo.mae
    assert_operator loo.r_squared, :>, 0.34
    #assert_equal 14, loo.nr_unpredicted
    #p loo.confusion_matrix
    #p loo.accuracy
    #File.open("tmp.svg","w+"){|f| f.puts loo.correlation_plot}
    #`inkview tmp.svg`
  end
end
|