From 8519274487166d75b3b9ae28e61f7a7be9f7e83c Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 27 Oct 2016 11:58:07 +0200 Subject: probability plot for classification validations --- lib/crossvalidation.rb | 18 +++++++---- lib/leave-one-out-validation.rb | 3 ++ lib/train-test-validation.rb | 14 +++++++++ lib/validation-statistics.rb | 64 +++++++++++++++++++++++---------------- test/validation-classification.rb | 2 ++ test/validation-regression.rb | 40 ++++++++++++++++++++++-- 6 files changed, 107 insertions(+), 34 deletions(-) diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 15d1031..4f779a2 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -64,14 +64,16 @@ module OpenTox field :weighted_accuracy, type: Float field :true_rate, type: Hash field :predictivity, type: Hash - field :confidence_plot_id, type: BSON::ObjectId + field :probability_plot_id, type: BSON::ObjectId end class RegressionCrossValidation < CrossValidation include RegressionStatistics - field :rmse, type: Float - field :mae, type: Float + field :rmse, type: Float, default:0 + field :mae, type: Float, default:0 field :r_squared, type: Float + field :within_prediction_interval, type: Integer, default:0 + field :out_of_prediction_interval, type: Integer, default:0 field :correlation_plot_id, type: BSON::ObjectId end @@ -93,6 +95,7 @@ module OpenTox crossvalidation_ids.collect{|id| CrossValidation.find(id)} end +=begin def correlation_plot format: "png" #unless correlation_plot_id feature = Feature.find(crossvalidations.first.model.prediction_feature) @@ -104,16 +107,18 @@ module OpenTox x = [] y = [] cv.predictions.each do |sid,p| - x << p["value"] - y << p["measurements"].median + x << p["measurements"].median + y << p["value"] end R.assign "measurement", x R.assign "prediction", y R.eval "all = c(measurement,prediction)" R.eval "range = c(min(all), max(all))" - R.eval "image#{i} = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" + R.eval "image#{i} = qplot(prediction,measurement,main='#{title} #{i}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" R.eval "image#{i} = image#{i} + geom_abline(intercept=0, slope=1)" images << "image#{i}" + + R.eval "ggsave(file='/home/ist/lazar/test/tmp#{i}.pdf', plot=image#{i})" end R.eval "pdf('#{tmpfile}')" R.eval "grid.arrange(#{images.join ","},ncol=#{images.size})" @@ -124,6 +129,7 @@ module OpenTox #end $gridfs.find_one(_id: correlation_plot_id).data end +=end end end diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index 59f43c5..538b7b3 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -5,6 +5,7 @@ module OpenTox class LeaveOneOut < Validation def self.create model + bad_request_error "Cannot create leave one out validation for models with supervised feature selection. Please use crossvalidation instead." if model.algorithms[:feature_selection] $logger.debug "#{model.name}: LOO validation started" t = Time.now model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOut : klass = RegressionLeaveOneOut @@ -48,6 +49,8 @@ module OpenTox field :rmse, type: Float, default: 0 field :mae, type: Float, default: 0 field :r_squared, type: Float + field :within_prediction_interval, type: Integer, default:0 + field :out_of_prediction_interval, type: Integer, default:0 field :correlation_plot_id, type: BSON::ObjectId end diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb index e3f5905..71abad2 100644 --- a/lib/train-test-validation.rb +++ b/lib/train-test-validation.rb @@ -44,10 +44,24 @@ module OpenTox class ClassificationTrainTest < TrainTest include ClassificationStatistics + field :accept_values, type: Array + field :confusion_matrix, type: Array + field :weighted_confusion_matrix, type: Array + field :accuracy, type: Float + field :weighted_accuracy, type: Float + field :true_rate, type: Hash + field :predictivity, type: Hash + field :probability_plot_id, type: BSON::ObjectId end class RegressionTrainTest < TrainTest include RegressionStatistics + field :rmse, type: Float, default:0 + field :mae, type: Float, default:0 + field :r_squared, type: Float + field :within_prediction_interval, type: Integer, default:0 + field :out_of_prediction_interval, type: Integer, default:0 + field :correlation_plot_id, type: BSON::ObjectId end end diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 3582c71..4ab4b13 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -65,43 +65,44 @@ module OpenTox } end - def confidence_plot - unless confidence_plot_id - tmpfile = "/tmp/#{id.to_s}_confidence.svg" + def probability_plot format: "pdf" + #unless probability_plot_id + tmpfile = "/tmp/#{id.to_s}_probability.#{format}" accuracies = [] - confidences = [] + probabilities = [] correct_predictions = 0 incorrect_predictions = 0 - predictions.each do |p| - p[:measurements].each do |db_act| - if p[:value] - p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1 - accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f - confidences << p[:confidence] - - end + pp = [] + predictions.values.select{|p| p["probabilities"]}.compact.each do |p| + p["measurements"].each do |m| + pp << [ p["probabilities"][p["value"]], p["value"] == m ] end end + pp.sort_by!{|p| 1-p.first} + pp.each do |p| + p[1] ? correct_predictions += 1 : incorrect_predictions += 1 + accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f + probabilities << p[0] + end R.assign "accuracy", accuracies - R.assign "confidence", confidences - R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()" + R.assign "probability", probabilities + R.eval "image = qplot(probability,accuracy)+ylab('Accumulated accuracy')+xlab('Prediction probability')+ylim(c(0,1))+scale_x_reverse()+geom_line()" R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.svg") plot_id = $gridfs.insert_one(file) - update(:confidence_plot_id => plot_id) - end - $gridfs.find_one(_id: confidence_plot_id).data + update(:probability_plot_id => plot_id) + #end + $gridfs.find_one(_id: probability_plot_id).data end end module RegressionStatistics def statistics - # TODO: predictions within prediction_interval self.rmse = 0 self.mae = 0 - #self.within_prediction_interval = 0 - #self.outside_prediction_interval = 0 + self.within_prediction_interval = 0 + self.out_of_prediction_interval = 0 x = [] y = [] predictions.each do |cid,pred| @@ -111,9 +112,13 @@ module OpenTox error = pred[:value]-pred[:measurements].median self.rmse += error**2 self.mae += error.abs - #if pred[:prediction_interval] - #if pred[:measurements] - #end + if pred[:prediction_interval] + if pred[:measurements].median >= pred[:prediction_interval][0] and pred[:measurements].median <= pred[:prediction_interval][1] + self.within_prediction_interval += 1 + else + self.out_of_prediction_interval += 1 + end + end else warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." @@ -128,16 +133,23 @@ module OpenTox $logger.debug "R^2 #{r_squared}" $logger.debug "RMSE #{rmse}" $logger.debug "MAE #{mae}" + $logger.debug "#{percent_within_prediction_interval.round(2)}% measurements within prediction interval" save { :mae => mae, :rmse => rmse, :r_squared => r_squared, + :within_prediction_interval => within_prediction_interval, + :out_of_prediction_interval => out_of_prediction_interval, } end + def percent_within_prediction_interval + 100*within_prediction_interval.to_f/(within_prediction_interval+out_of_prediction_interval) + end + def correlation_plot format: "png" - unless correlation_plot_id + #unless correlation_plot_id tmpfile = "/tmp/#{id.to_s}_correlation.#{format}" x = [] y = [] @@ -158,7 +170,7 @@ module OpenTox file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}") plot_id = $gridfs.insert_one(file) update(:correlation_plot_id => plot_id) - end + #end $gridfs.find_one(_id: correlation_plot_id).data end diff --git a/test/validation-classification.rb b/test/validation-classification.rb index b71e427..c93e71f 100644 --- a/test/validation-classification.rb +++ b/test/validation-classification.rb @@ -11,6 +11,8 @@ class ValidationClassificationTest < MiniTest::Test cv = ClassificationCrossValidation.create model assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7, this may occur due to an unfavorable training/test set split" assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy (#{cv.weighted_accuracy}) should be larger than accuracy (#{cv.accuracy})." + #p cv + #File.open("tmp.pdf","w+"){|f| f.puts cv.probability_plot} end # parameters diff --git a/test/validation-regression.rb b/test/validation-regression.rb index efce849..a0895f9 100644 --- a/test/validation-regression.rb +++ b/test/validation-regression.rb @@ -9,8 +9,9 @@ class ValidationRegressionTest < MiniTest::Test dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi_log10.csv" model = Model::Lazar.create training_dataset: dataset cv = RegressionCrossValidation.create model - assert cv.rmse < 1.5, "RMSE #{cv.rmse} should be smaller than 1.5, this may occur due to an unfavorable training/test set split" - assert cv.mae < 1, "MAE #{cv.mae} should be smaller than 1, this may occur due to an unfavorable training/test set split" + assert cv.rmse < 1.5, "RMSE #{cv.rmse} should be smaller than 1.5, this may occur due to unfavorable training/test set splits" + assert cv.mae < 1.1, "MAE #{cv.mae} should be smaller than 1.1, this may occur due to unfavorable training/test set splits" + assert cv.percent_within_prediction_interval > 80, "Only #{cv.percent_within_prediction_interval.round(2)}% of measurement within prediction interval. This may occur due to unfavorable training/test set splits" end # parameters @@ -54,4 +55,39 @@ class ValidationRegressionTest < MiniTest::Test assert loo.r_squared > 0.34, "R^2 (#{loo.r_squared}) should be larger than 0.034" end + def test_regression_loo_validation_with_feature_selection + dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv") + algorithms = { + :descriptors => { + :method => "calculate_properties", + :features => PhysChem.openbabel_descriptors, + }, + :similarity => { + :method => "Algorithm::Similarity.weighted_cosine", + :min => 0.5 + }, + :feature_selection => { + :method => "Algorithm::FeatureSelection.correlation_filter", + }, + } + model = Model::Lazar.create training_dataset: dataset, algorithms: algorithms + assert_raises OpenTox::BadRequestError do + loo = RegressionLeaveOneOut.create model + end + end + + # repeated CV + + def test_repeated_crossvalidation + dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi_log10.csv") + model = Model::Lazar.create training_dataset: dataset + repeated_cv = RepeatedCrossValidation.create model + repeated_cv.crossvalidations.each do |cv| + #assert cv.r_squared > 0.34, "R^2 (#{cv.r_squared}) should be larger than 0.034" + #assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split" + end + p repeated_cv + File.open("tmp.png","w+"){|f| f.puts repeated_cv.correlation_plot} + end + end -- cgit v1.2.3