From d3a4c309d48b794f2f60f44bb9a3d94f402cc82f Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 16 Sep 2015 13:11:45 +0200 Subject: repeated crossvalidations, improved experiment reports --- lib/crossvalidation.rb | 57 +++++++++++++++++++++-------------- lib/dataset.rb | 1 + lib/error.rb | 2 +- lib/experiment.rb | 81 +++++++++++++++++++++++++------------------------- lib/lazar.rb | 5 ++-- lib/model.rb | 3 -- test/experiment.rb | 62 +++++++++++++++++++++++++++++--------- test/validation.rb | 12 ++++++++ 8 files changed, 141 insertions(+), 82 deletions(-) diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 90c0d75..f480932 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -22,7 +22,9 @@ module OpenTox end def self.create model, n=10 - cv = self.new( + model.training_dataset.features.first.nominal? ? klass = ClassificationCrossValidation : klass = RegressionCrossValidation + bad_request_error "#{dataset.features.first} is neither nominal nor numeric." unless klass + cv = klass.new( name: model.name, model_id: model.id, folds: n @@ -55,6 +57,7 @@ module OpenTox nr_unpredicted: nr_unpredicted, predictions: predictions ) + cv.statistics cv end end @@ -70,14 +73,13 @@ module OpenTox field :predictivity, type: Hash # TODO auc, f-measure (usability??) - def self.create model, n=10 - cv = super model, n + def statistics accept_values = Feature.find(model.prediction_feature_id).accept_values confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} true_rate = {} predictivity = {} - cv.predictions.each do |pred| + predictions.each do |pred| compound_id,activity,prediction,confidence = pred if activity and prediction and confidence.numeric? if prediction == activity @@ -113,18 +115,16 @@ module OpenTox confidence_sum += c end end - cv.update_attributes( + update_attributes( accept_values: accept_values, confusion_matrix: confusion_matrix, weighted_confusion_matrix: weighted_confusion_matrix, - accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(cv.nr_instances-cv.nr_unpredicted).to_f, + accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f, weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f, true_rate: true_rate, predictivity: predictivity, finished_at: Time.now ) - cv.save - cv end #Average area under roc 0.646 @@ -142,8 +142,7 @@ module OpenTox field :correlation_plot_id, type: BSON::ObjectId field :confidence_plot_id, type: BSON::ObjectId - def self.create model, n=10 - cv = super model, n + def statistics rmse = 0 weighted_rmse = 0 rse = 0 @@ -153,7 +152,7 @@ module OpenTox rae = 0 weighted_rae = 0 confidence_sum = 0 - cv.predictions.each do |pred| + predictions.each do |pred| compound_id,activity,prediction,confidence = pred if activity and prediction error = Math.log10(prediction)-Math.log10(activity) @@ -163,24 +162,24 @@ module OpenTox weighted_mae += confidence*error.abs confidence_sum += confidence else - cv.warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{cv.model.training_dataset_id}." - $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{cv.model.training_dataset_id}." + warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." end end - x = cv.predictions.collect{|p| p[1]} - y = cv.predictions.collect{|p| p[2]} + x = predictions.collect{|p| p[1]} + y = predictions.collect{|p| p[2]} R.assign "measurement", x R.assign "prediction", y R.eval "r <- cor(-log(measurement),-log(prediction))" r = R.eval("r").to_ruby - mae = mae/cv.predictions.size + mae = mae/predictions.size weighted_mae = weighted_mae/confidence_sum - rmse = Math.sqrt(rmse/cv.predictions.size) + rmse = Math.sqrt(rmse/predictions.size) weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) # TODO check!! =begin - cv.predictions.sort! do |a,b| + predictions.sort! do |a,b| relative_error_a = (a[1]-a[2]).abs/a[1].to_f relative_error_a = 1/relative_error_a if relative_error_a < 1 relative_error_b = (b[1]-b[2]).abs/b[1].to_f @@ -188,15 +187,14 @@ module OpenTox [relative_error_b,b[3]] <=> [relative_error_a,a[3]] end =end - cv.update_attributes( + update_attributes( mae: mae, rmse: rmse, weighted_mae: weighted_mae, weighted_rmse: weighted_rmse, - r_squared: r**2 + r_squared: r**2, + finished_at: Time.now ) - cv.save - cv end def misclassifications n=nil @@ -277,5 +275,20 @@ module OpenTox end end + class RepeatedCrossValidation + field :crossvalidation_ids, type: Array, default: [] + def self.create model, folds=10, repeats=3 + repeated_cross_validation = self.new + repeats.times do + repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id + end + repeated_cross_validation.save + repeated_cross_validation + end + def crossvalidations + crossvalidation_ids.collect{|id| CrossValidation.find(id)} + end + end + end diff --git a/lib/dataset.rb b/lib/dataset.rb index 851fabd..d884716 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -47,6 +47,7 @@ module OpenTox @data_entries = Marshal.load(data_entry_file.data) bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size + # TODO: data_entries can be empty, poorly reproducible, mongo problem? bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size #$logger.debug "Retrieving data: #{Time.now-t}" end diff --git a/lib/error.rb b/lib/error.rb index 8fe8a1e..39b3c76 100644 --- a/lib/error.rb +++ b/lib/error.rb @@ -58,7 +58,7 @@ module OpenTox OpenTox.const_set error[:class],c # define global methods for raising errors, eg. bad_request_error - Object.send(:define_method, error[:method]) do |message,uri=nil,cause=nil| + Object.send(:define_method, error[:method]) do |message| raise c.new(message) end end diff --git a/lib/experiment.rb b/lib/experiment.rb index 2f51756..7849337 100644 --- a/lib/experiment.rb +++ b/lib/experiment.rb @@ -2,45 +2,22 @@ module OpenTox class Experiment field :dataset_ids, type: Array - field :model_algorithms, type: Array - field :model_ids, type: Array, default: [] - field :crossvalidation_ids, type: Array, default: [] - field :prediction_algorithms, type: Array - field :neighbor_algorithms, type: Array - field :neighbor_algorithm_parameters, type: Array + field :model_settings, type: Array + field :results, type: Hash, default: {} end - # TODO more sophisticated experimental design def run dataset_ids.each do |dataset_id| dataset = Dataset.find(dataset_id) - model_algorithms.each do |model_algorithm| - prediction_algorithms.each do |prediction_algorithm| - neighbor_algorithms.each do |neighbor_algorithm| - neighbor_algorithm_parameters.each do |neighbor_algorithm_parameter| - $logger.debug "Creating #{model_algorithm} model for dataset #{dataset.name}, with prediction_algorithm #{prediction_algorithm}, neighbor_algorithm #{neighbor_algorithm}, neighbor_algorithm_parameters #{neighbor_algorithm_parameter}." - model = Object.const_get(model_algorithm).create dataset - model.prediction_algorithm = prediction_algorithm - model.neighbor_algorithm = neighbor_algorithm - model.neighbor_algorithm_parameters = neighbor_algorithm_parameter - model.save - model_ids << model.id - cv = nil - if dataset.features.first.nominal - cv = ClassificationCrossValidation - elsif dataset.features.first.numeric - cv = RegressionCrossValidation - end - if cv - $logger.debug "Creating #{cv} for #{model_algorithm}, dataset #{dataset.name}, with prediction_algorithm #{prediction_algorithm}, neighbor_algorithm #{neighbor_algorithm}, neighbor_algorithm_parameters #{neighbor_algorithm_parameter}." - crossvalidation = cv.create model - self.crossvalidation_ids << crossvalidation.id - else - $logger.warn "#{dataset.features.first} is neither nominal nor numeric." - end - end - end - end + results[dataset_id.to_s] = [] + model_settings.each do |setting| + model = Object.const_get(setting[:algorithm]).create dataset + model.prediction_algorithm = setting[:prediction_algorithm] if setting[:prediction_algorithm] + model.neighbor_algorithm = setting[:neighbor_algorithm] if setting[:neighbor_algorithm] + model.neighbor_algorithm_parameters = setting[:neighbor_algorithm_parameter] if setting[:neighbor_algorithm_parameter] + model.save + repeated_crossvalidation = RepeatedCrossValidation.create model + results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id} end end save @@ -54,13 +31,37 @@ module OpenTox end def report - # TODO create ggplot2 report - self.crossvalidation_ids.each do |id| - cv = CrossValidation.find(id) - file = "/tmp/#{id}.svg" - File.open(file,"w+"){|f| f.puts cv.correlation_plot} - `inkview '#{file}'` + # TODO significances + report = {} + report[:name] = name + report[:experiment_id] = self.id.to_s + dataset_ids.each do |dataset_id| + dataset_name = Dataset.find(dataset_id).name + report[dataset_name] = [] + results[dataset_id.to_s].each do |result| + model = Model::Lazar.find(result[:model_id]) + repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id]) + crossvalidations = repeated_cv.crossvalidations + summary = {} + [:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key| + summary[key] = model[key] + end + summary[:nr_instances] = crossvalidations.first.nr_instances + summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted} + summary[:time] = crossvalidations.collect{|cv| cv.time} + if crossvalidations.first.is_a? ClassificationCrossValidation + summary[:accuracies] = crossvalidations.collect{|cv| cv.accuracy} + elsif crossvalidations.first.is_a? RegressionCrossValidation + summary[:r_squared] = crossvalidations.collect{|cv| cv.r_squared} + end + report[dataset_name] << summary + #p repeated_cv.crossvalidations.collect{|cv| cv.accuracy} + #file = "/tmp/#{id}.svg" + #File.open(file,"w+"){|f| f.puts cv.correlation_plot} + #`inkview '#{file}'` + end end + report end end diff --git a/lib/lazar.rb b/lib/lazar.rb index decbe69..9b02053 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -15,7 +15,8 @@ ENV["MONGOID_ENV"] ||= "development" # TODO remove config files, change default via ENV or directly in Mongoid class Mongoid.load!("#{File.expand_path(File.join(File.dirname(__FILE__),'..','mongoid.yml'))}") Mongoid.raise_not_found_error = false # return nil if no document is found -$mongo = Mongoid.default_client +$mongo = Mongo::Client.new('mongodb://127.0.0.1:27017/opentox') +#$mongo = Mongoid.default_client $gridfs = $mongo.database.fs # R setup @@ -42,7 +43,7 @@ ENV['FMINER_SILENT'] = 'true' ENV['FMINER_NR_HITS'] = 'true' # OpenTox classes and includes -CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","Experiment"]# Algorithm and Models are modules +CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules [ # be aware of the require sequence as it affects class/method overwrites "overwrite.rb", diff --git a/lib/model.rb b/lib/model.rb index 0155fc8..ddb69e4 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -28,9 +28,6 @@ module OpenTox field :neighbor_algorithm, type: String field :neighbor_algorithm_parameters, type: Hash - #attr_accessor :prediction_dataset - #attr_accessor :training_dataset - # Create a lazar model from a training_dataset and a feature_dataset # @param [OpenTox::Dataset] training_dataset # @return [OpenTox::Model::Lazar] Regression or classification model diff --git a/test/experiment.rb b/test/experiment.rb index c465d7b..cad4fa7 100644 --- a/test/experiment.rb +++ b/test/experiment.rb @@ -4,27 +4,61 @@ class ExperimentTest < MiniTest::Test def test_regression_experiment datasets = [ - "EPAFHM.csv", - "FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv", + "EPAFHM.medi.csv", + #"EPAFHM.csv", + #"FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv", "LOAEL_mmol_corrected_smiles.csv" + ] + experiment = Experiment.create( + :name => "Default regression for datasets #{datasets}.", + :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id}, + :model_settings => [ + { + :algorithm => "OpenTox::Model::LazarRegression", + } ] - model_algorithms = ["OpenTox::Model::LazarRegression"] - neighbor_algorithms = ["OpenTox::Algorithm::Neighbor.fingerprint_similarity"] - prediction_algorithms = ["OpenTox::Algorithm::Regression.weighted_average"] - neighbor_algorithm_parameters = [{:min_sim => 0.7}] + ) + experiment.run + puts experiment.report.to_yaml + assert_equal datasets.size, experiment.results.size + experiment.results.each do |dataset_id, result| + assert_equal 1, result.size + result.each do |r| + assert_kind_of BSON::ObjectId, r[:model_id] + assert_kind_of BSON::ObjectId, r[:repeated_crossvalidation_id] + end + end + end + + def test_classification_experiment + + datasets = [ "hamster_carcinogenicity.csv" ] experiment = Experiment.create( - :name => "Regression for datasets #{datasets}.", + :name => "Fminer vs fingerprint classification for datasets #{datasets}.", :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id}, - :model_algorithms => model_algorithms, - :neighbor_algorithms => neighbor_algorithms, - :neighbor_algorithm_parameters => neighbor_algorithm_parameters, - :prediction_algorithms => prediction_algorithms, + :model_settings => [ + { + :algorithm => "OpenTox::Model::LazarClassification", + },{ + :algorithm => "OpenTox::Model::LazarClassification", + :neighbor_algorithm_parameter => {:min_sim => 0.3} + }, + #{ + #:algorithm => "OpenTox::Model::LazarFminerClassification", + #} + ] ) experiment.run =begin - p experiment - experiment.report + experiment = Experiment.find "55f944a22b72ed7de2000000" =end - refute_empty experiment.crossvalidation_ids + puts experiment.report.to_yaml + experiment.results.each do |dataset_id, result| + assert_equal 2, result.size + result.each do |r| + assert_kind_of BSON::ObjectId, r[:model_id] + assert_kind_of BSON::ObjectId, r[:repeated_crossvalidation_id] + end + end end end diff --git a/test/validation.rb b/test/validation.rb index a4c3d80..dfa2c81 100644 --- a/test/validation.rb +++ b/test/validation.rb @@ -33,4 +33,16 @@ class ValidationTest < MiniTest::Test #assert cv.weighted_mae < cv.mae end + def test_repeated_crossvalidation + dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" + model = Model::LazarClassification.create dataset + repeated_cv = RepeatedCrossValidation.create model + p repeated_cv + repeated_cv.crossvalidations.each do |cv| + p cv + p cv.accuracy + assert cv.accuracy > 0.7 + end + end + end -- cgit v1.2.3