summaryrefslogtreecommitdiff
path: root/test/validation.rb
blob: 95f9bc04184c1981bf1c111a6604786eb2b36a5f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
require_relative "setup.rb"

# Validation tests for lazar models: crossvalidation, repeated
# crossvalidation and leave-one-out validation, run against the CSV
# datasets found under DATA_DIR.
#
# Inherits from Minitest::Test (the canonical constant in every minitest 5.x
# release); the legacy MiniTest alias is no longer defined by default in
# recent minitest versions.
class ValidationTest < Minitest::Test

  # Crossvalidates a fminer-based classification model on the hamster
  # carcinogenicity dataset. Currently skipped.
  def test_fminer_crossvalidation
    skip
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    model = Model::LazarFminerClassification.create dataset
    cv = ClassificationCrossValidation.create model
    refute_empty cv.validation_ids
    assert_operator cv.accuracy, :>, 0.8, "Crossvalidation accuracy lower than 0.8"
    assert_operator cv.weighted_accuracy, :>, cv.accuracy, "Weighted accuracy (#{cv.weighted_accuracy}) should be larger than unweighted accuracy (#{cv.accuracy})"
  end

  # Crossvalidates a default classification model on the hamster
  # carcinogenicity dataset and checks accuracy against fixed thresholds.
  def test_classification_crossvalidation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    model = Model::LazarClassification.create dataset#, features
    cv = ClassificationCrossValidation.create model
    #p cv
    assert_operator cv.accuracy, :>, 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7"
    # Debugging aid: render the confidence plot and open it in a viewer.
    #File.open("tmp.svg","w+"){|f| f.puts cv.confidence_plot}
    #`inkview tmp.svg`
    # Printed for manual inspection of the run.
    p cv.nr_unpredicted
    p cv.accuracy
    assert_operator cv.weighted_accuracy, :>, cv.accuracy, "Weighted accuracy (#{cv.weighted_accuracy}) should be larger than unweighted accuracy (#{cv.accuracy}) ."
  end

  # Crossvalidates a regression model created with default parameters on
  # EPAFHM.medi.csv and checks RMSE and MAE against fixed upper bounds.
  def test_default_regression_crossvalidation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
    model = Model::LazarRegression.create dataset
    cv = RegressionCrossValidation.create model
    #cv = RegressionCrossValidation.find '561503262b72ed54fd000001'
    #p cv.id
    # Debugging aid: render the plots and open them in a viewer.
    #File.open("tmp.svg","w+"){|f| f.puts cv.correlation_plot}
    #`inkview tmp.svg`
    #File.open("tmp.svg","w+"){|f| f.puts cv.confidence_plot}
    #`inkview tmp.svg`

    #puts cv.misclassifications.to_yaml
    # Printed for manual inspection; the weighted variants are not asserted
    # (their assertions below are disabled).
    p cv.rmse
    p cv.weighted_rmse
    assert_operator cv.rmse, :<, 1.5, "RMSE > 1.5"
    #assert cv.weighted_rmse < cv.rmse, "Weighted RMSE (#{cv.weighted_rmse}) larger than unweighted RMSE(#{cv.rmse}) "
    p cv.mae
    p cv.weighted_mae
    assert_operator cv.mae, :<, 1
    #assert cv.weighted_mae < cv.mae
  end

  # Crossvalidates a regression model created with explicit prediction and
  # neighbor parameters, and checks that every validation model carries the
  # requested neighbor parameters.
  def test_regression_crossvalidation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
    #dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.csv"
    params = {
      :prediction_algorithm => "OpenTox::Algorithm::Regression.weighted_average",
      :neighbor_algorithm => "fingerprint_neighbors",
      :neighbor_algorithm_parameters => {
        :type => "MACCS",
        :min_sim => 0.7,
      }
    }
    model = Model::LazarRegression.create dataset, params
    cv = RegressionCrossValidation.create model
    expected = params[:neighbor_algorithm_parameters]
    cv.validation_ids.each do |vid|
      # Use a dedicated local so the model under validation is not overwritten.
      validation_model = Model::Lazar.find(Validation.find(vid).model_id)
      actual = validation_model[:neighbor_algorithm_parameters]
      assert_equal expected[:type], actual[:type]
      assert_equal expected[:min_sim], actual[:min_sim]
      # NOTE(review): the params literal above has no :training_dataset_id, so
      # unless Model::LazarRegression.create adds one to this hash in place,
      # the left-hand side is nil and this check is only "not nil" -- confirm.
      refute_equal expected[:training_dataset_id], actual[:training_dataset_id]
    end

    assert_operator cv.rmse, :<, 1.5, "RMSE (#{cv.rmse}) should be smaller than 1.5"
    assert_operator cv.mae, :<, 1
  end

  # Runs a repeated crossvalidation of a classification model and checks the
  # accuracy of each contained crossvalidation.
  def test_repeated_crossvalidation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    model = Model::LazarClassification.create dataset
    repeated_cv = RepeatedCrossValidation.create model
    repeated_cv.crossvalidations.each do |cv|
      assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
      assert_operator cv.weighted_accuracy, :>, cv.accuracy
    end
  end

  # Checks that the neighbor parameters of a classification model are passed
  # on to the models of its validations (ignoring the training dataset id).
  def test_crossvalidation_parameters
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    params = {
      :neighbor_algorithm_parameters => {
        :min_sim => 0.3,
        :type => "FP3"
      }
    }
    model = Model::LazarClassification.create dataset, params
    model.save
    cv = ClassificationCrossValidation.create model
    expected_params = model.neighbor_algorithm_parameters
    expected_params.delete :training_dataset_id
    expected_params = Hash[expected_params.map{ |k, v| [k.to_s, v] }] # convert symbols to string

    cv.validations.each do |validation|
      validation_params = validation.model.neighbor_algorithm_parameters
      validation_params.delete "training_dataset_id"
      assert_equal expected_params, validation_params
    end
  end

  # Crossvalidates a regression model that uses physchem descriptor
  # neighbors on a scaled feature dataset. Currently skipped; it has no
  # assertions on the crossvalidation result yet.
  def test_physchem_regression_crossvalidation
    skip

    descriptors = OpenTox::Algorithm::Descriptor::OBDESCRIPTORS.keys
    refute_empty descriptors

    # UPLOAD DATA
    training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
    feature_dataset = Algorithm::Descriptor.physchem training_dataset, descriptors
    feature_dataset.save
    scaled_feature_dataset = feature_dataset.scale
    scaled_feature_dataset.save
    model = Model::LazarRegression.create training_dataset
    model.neighbor_algorithm = "physchem_neighbors"
    model.neighbor_algorithm_parameters = {
      :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.physchem",
      :descriptors => descriptors,
      :feature_dataset_id => scaled_feature_dataset.id,
      :min_sim => 0.3
    }
    model.save
    cv = RegressionCrossValidation.create model
    p cv
  end

  # Leave-one-out validation of a classification model on the hamster
  # carcinogenicity dataset, checked against fixed expected values.
  def test_classification_loo_validation
    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
    model = Model::LazarClassification.create dataset
    loo = ClassificationLeaveOneOutValidation.create model
    assert_equal 14, loo.nr_unpredicted
    refute_empty loo.confusion_matrix
    assert_operator loo.accuracy, :>, 0.77
    assert_operator loo.weighted_accuracy, :>, 0.85
    assert_operator loo.accuracy, :<, loo.weighted_accuracy
  end

  # Leave-one-out validation of a regression model on EPAFHM.medi.csv,
  # checked against fixed expected values.
  def test_regression_loo_validation
    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
    model = Model::LazarRegression.create dataset
    loo = RegressionLeaveOneOutValidation.create model
    assert_equal 11, loo.nr_unpredicted
    assert_operator loo.weighted_mae, :<, loo.mae
    assert_operator loo.r_squared, :>, 0.34
    #assert_equal 14, loo.nr_unpredicted
    #p loo.confusion_matrix
    #p loo.accuracy
    # Debugging aid: render the correlation plot and open it in a viewer.
    #File.open("tmp.svg","w+"){|f| f.puts loo.correlation_plot}
    #`inkview tmp.svg`
  end

end