From a8368dda776c05331474adf7eaf9a6e413a3b1eb Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 13 Apr 2016 15:15:51 +0200 Subject: validation tests pass --- lib/compound.rb | 2 +- lib/crossvalidation.rb | 109 +++------------------------------------- lib/dataset.rb | 40 +++++++-------- lib/lazar.rb | 3 +- lib/leave-one-out-validation.rb | 108 +++++++-------------------------------- lib/model.rb | 23 +++++---- lib/validation.rb | 62 ++++------------------- test/classification.rb | 6 ++- test/validation.rb | 6 +-- 9 files changed, 78 insertions(+), 281 deletions(-) diff --git a/lib/compound.rb b/lib/compound.rb index 84d8891..757ba1a 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -341,7 +341,7 @@ module OpenTox {'$sort' => {'tanimoto' => -1}} ] - $mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]} + $mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]} end diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index b7cd7bf..f93a04c 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -6,7 +6,7 @@ module OpenTox field :folds, type: Integer field :nr_instances, type: Integer field :nr_unpredicted, type: Integer - field :predictions, type: Array, default: [] + field :predictions, type: Hash, default: {} field :finished_at, type: Time def time @@ -32,7 +32,7 @@ module OpenTox cv.save # set created_at nr_instances = 0 nr_unpredicted = 0 - predictions = [] + predictions = {} training_dataset = Dataset.find model.training_dataset_id training_dataset.folds(n).each_with_index do |fold,fold_nr| #fork do # parallel execution of validations @@ -42,12 +42,12 @@ module OpenTox $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" #end end - #Process.waitall + Process.waitall cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id) cv.validations.each do |validation| 
nr_instances += validation.nr_instances nr_unpredicted += validation.nr_unpredicted - predictions += validation.predictions + predictions.merge! validation.predictions end cv.update_attributes( nr_instances: nr_instances, @@ -73,61 +73,8 @@ module OpenTox # TODO auc, f-measure (usability??) def statistics - accept_values = Feature.find(model.prediction_feature_id).accept_values - confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} - weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} - true_rate = {} - predictivity = {} - predictions.each do |pred| - compound_id,activities,prediction,confidence = pred - if activities and prediction #and confidence.numeric? - if activities.uniq.size == 1 - activity = activities.uniq.first - if prediction == activity - if prediction == accept_values[0] - confusion_matrix[0][0] += 1 - #weighted_confusion_matrix[0][0] += confidence - elsif prediction == accept_values[1] - confusion_matrix[1][1] += 1 - #weighted_confusion_matrix[1][1] += confidence - end - elsif prediction != activity - if prediction == accept_values[0] - confusion_matrix[0][1] += 1 - #weighted_confusion_matrix[0][1] += confidence - elsif prediction == accept_values[1] - confusion_matrix[1][0] += 1 - #weighted_confusion_matrix[1][0] += confidence - end - end - end - else - nr_unpredicted += 1 if prediction.nil? 
- end - end - true_rate = {} - predictivity = {} - accept_values.each_with_index do |v,i| - true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f - predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f - end - confidence_sum = 0 - #weighted_confusion_matrix.each do |r| - #r.each do |c| - #confidence_sum += c - #end - #end - update_attributes( - accept_values: accept_values, - confusion_matrix: confusion_matrix, - #weighted_confusion_matrix: weighted_confusion_matrix, - accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f, - #weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f, - true_rate: true_rate, - predictivity: predictivity, - finished_at: Time.now - ) - $logger.debug "Accuracy #{accuracy}" + stat = ValidationStatistics.classification(predictions, Feature.find(model.prediction_feature_id).accept_values) + update_attributes(stat) end def confidence_plot @@ -169,48 +116,8 @@ module OpenTox field :correlation_plot_id, type: BSON::ObjectId def statistics - rmse = 0 - mae = 0 - x = [] - y = [] - predictions.each do |pred| - compound_id,activity,prediction,confidence = pred - if activity and prediction - unless activity == [nil] - x << -Math.log10(activity.median) - y << -Math.log10(prediction) - error = Math.log10(prediction)-Math.log10(activity.median) - rmse += error**2 - #weighted_rmse += confidence*error**2 - mae += error.abs - #weighted_mae += confidence*error.abs - #confidence_sum += confidence - end - else - warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." - $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." 
- end - end - R.assign "measurement", x - R.assign "prediction", y - R.eval "r <- cor(measurement,prediction,use='complete')" - r = R.eval("r").to_ruby - - mae = mae/predictions.size - #weighted_mae = weighted_mae/confidence_sum - rmse = Math.sqrt(rmse/predictions.size) - #weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) - update_attributes( - mae: mae, - rmse: rmse, - #weighted_mae: weighted_mae, - #weighted_rmse: weighted_rmse, - r_squared: r**2, - finished_at: Time.now - ) - $logger.debug "R^2 #{r**2}" - $logger.debug "RMSE #{rmse}" - $logger.debug "MAE #{mae}" + stat = ValidationStatistics.regression predictions + update_attributes(stat) end def misclassifications n=nil diff --git a/lib/dataset.rb b/lib/dataset.rb index 5c04382..25307c9 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -5,9 +5,6 @@ module OpenTox class Dataset - # associations like has_many, belongs_to deteriorate performance - #field :feature_ids, type: Array, default: [] - #field :substance_ids, type: Array, default: [] field :data_entries, type: Hash, default: {} # Readers @@ -24,7 +21,7 @@ module OpenTox # Get all features def features - @features ||= data_entries.collect{|cid,f| f.keys}.flatten.uniq.collect{|id| OpenTox::Feature.find(id)} + @features ||= data_entries.collect{|cid,f| f.first}.flatten.uniq.collect{|id| OpenTox::Feature.find(id)} @features end @@ -33,7 +30,7 @@ module OpenTox # @param feature [OpenTox::Feature] OpenTox Feature object # @return [Array] Data entry values def values(compound, feature) - data_entries[compound.id,feature.id] + data_entries[compound.id.to_s][feature.id.to_s] end # Writers @@ -68,15 +65,14 @@ module OpenTox training_idxs = indices-test_idxs training_cids = training_idxs.collect{|i| substance_ids[i]} chunk = [training_cids,test_cids].collect do |cids| - new_cids = [] - new_data_entries = [] + new_data_entries = {} cids.each do |cid| - data_entries[cid].each do |de| - new_cids << cid - new_data_entries << de + data_entries[cid].each do |f,v| 
+ new_data_entries[cid] ||= {} + new_data_entries[cid][f] = v end end - dataset = self.class.new(:data_entries => data_entries, :source => self.id ) + dataset = self.class.new(:data_entries => new_data_entries, :source => self.id ) dataset.compounds.each do |compound| compound.dataset_ids << dataset.id compound.save @@ -213,9 +209,6 @@ module OpenTox next end - #substance_ids << compound.id - #table.first.size == 0 ? self.data_entries[compound.id] = Array.new(0) : self.data_entries[compound.id] = Array.new(table.first.size-1) - vals.each_with_index do |v,j| if v.blank? warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})." @@ -228,10 +221,8 @@ module OpenTox self.data_entries[compound.id.to_s] ||= {} self.data_entries[compound.id.to_s][@features[j].id.to_s] ||= [] self.data_entries[compound.id.to_s][@features[j].id.to_s] << v - #i = compound.feature_ids.index feature_ids[j] - #TODO - #compound.features[feature_ids[j].to_s] ||= [] - #compound.features[feature_ids[j].to_s] << v + compound.features[@features[j].id.to_s] ||= [] + compound.features[@features[j].id.to_s] << v compound.save end end @@ -251,14 +242,23 @@ module OpenTox end # Dataset for lazar predictions - class LazarPrediction < Dataset + class LazarPrediction #< Dataset field :creator, type: String - field :prediction_feature_id, type: String + field :prediction_feature_id, type: BSON::ObjectId + field :predictions, type: Hash, default: {} def prediction_feature Feature.find prediction_feature_id end + def compounds + substances.select{|s| s.is_a? 
Compound} + end + + def substances + predictions.keys.collect{|id| Substance.find id} + end + end end diff --git a/lib/lazar.rb b/lib/lazar.rb index 2bcecc5..a1ad551 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -62,7 +62,7 @@ suppressPackageStartupMessages({ # OpenTox classes and includes #CLASSES = ["Feature","Substance::Compound","Substance::Nanoparticle","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules -CLASSES = ["Feature","Substance","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules +CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules [ # be aware of the require sequence as it affects class/method overwrites "overwrite.rb", @@ -81,6 +81,7 @@ CLASSES = ["Feature","Substance","Dataset","Validation","CrossValidation","Leave "validation.rb", "crossvalidation.rb", "leave-one-out-validation.rb", + "validation-statistics.rb", "experiment.rb", "import.rb", ].each{ |f| require_relative f } diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index 2cd13db..10fbe85 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -6,20 +6,26 @@ module OpenTox field :dataset_id, type: BSON::ObjectId field :nr_instances, type: Integer field :nr_unpredicted, type: Integer - field :predictions, type: Array + field :predictions, type: Hash field :finished_at, type: Time def self.create model model.training_dataset.features.first.nominal? ? 
klass = ClassificationLeaveOneOutValidation : klass = RegressionLeaveOneOutValidation loo = klass.new :model_id => model.id, :dataset_id => model.training_dataset_id - compound_ids = model.training_dataset.compound_ids predictions = model.predict model.training_dataset.compounds - predictions = predictions.each_with_index {|p,i| p[:compound_id] = compound_ids[i]} - predictions.select!{|p| p[:database_activities] and !p[:database_activities].empty?} + predictions.each{|cid,p| p.delete(:neighbors)} + nr_unpredicted = 0 + predictions.each do |cid,prediction| + if prediction[:value] + prediction[:measured] = model.training_dataset.data_entries[cid][prediction[:prediction_feature_id].to_s] + else + nr_unpredicted += 1 + end + predictions.delete(cid) unless prediction[:value] and prediction[:measured] + end loo.nr_instances = predictions.size - predictions.select!{|p| p[:value]} # remove unpredicted - loo.predictions = predictions#.sort{|a,b| b[:confidence] <=> a[:confidence]} - loo.nr_unpredicted = loo.nr_instances - loo.predictions.size + loo.nr_unpredicted = nr_unpredicted + loo.predictions = predictions loo.statistics loo.save loo @@ -42,53 +48,8 @@ module OpenTox field :confidence_plot_id, type: BSON::ObjectId def statistics - accept_values = Feature.find(model.prediction_feature_id).accept_values - confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} - weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} - predictions.each do |pred| - pred[:database_activities].each do |db_act| - if pred[:value] - if pred[:value] == db_act - if pred[:value] == accept_values[0] - confusion_matrix[0][0] += 1 - weighted_confusion_matrix[0][0] += pred[:confidence] - elsif pred[:value] == accept_values[1] - confusion_matrix[1][1] += 1 - weighted_confusion_matrix[1][1] += pred[:confidence] - end - else - if pred[:value] == accept_values[0] - confusion_matrix[0][1] += 1 - weighted_confusion_matrix[0][1] += 
pred[:confidence] - elsif pred[:value] == accept_values[1] - confusion_matrix[1][0] += 1 - weighted_confusion_matrix[1][0] += pred[:confidence] - end - end - end - end - end - accept_values.each_with_index do |v,i| - true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f - predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f - end - confidence_sum = 0 - weighted_confusion_matrix.each do |r| - r.each do |c| - confidence_sum += c - end - end - update_attributes( - accept_values: accept_values, - confusion_matrix: confusion_matrix, - weighted_confusion_matrix: weighted_confusion_matrix, - accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f, - weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f, - true_rate: true_rate, - predictivity: predictivity, - finished_at: Time.now - ) - $logger.debug "Accuracy #{accuracy}" + stat = ValidationStatistics.classification(predictions, Feature.find(model.prediction_feature_id).accept_values) + update_attributes(stat) end def confidence_plot @@ -132,43 +93,10 @@ module OpenTox field :correlation_plot_id, type: BSON::ObjectId field :confidence_plot_id, type: BSON::ObjectId + def statistics - confidence_sum = 0 - predicted_values = [] - measured_values = [] - predictions.each do |pred| - pred[:database_activities].each do |activity| - if pred[:value] - predicted_values << pred[:value] - measured_values << activity - error = Math.log10(pred[:value])-Math.log10(activity) - self.rmse += error**2 - #self.weighted_rmse += pred[:confidence]*error**2 - self.mae += error.abs - #self.weighted_mae += pred[:confidence]*error.abs - #confidence_sum += pred[:confidence] - end - end - if pred[:database_activities].empty? - warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." 
- $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." - end - end - R.assign "measurement", measured_values - R.assign "prediction", predicted_values - R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')" - r = R.eval("r").to_ruby - - self.mae = self.mae/predictions.size - #self.weighted_mae = self.weighted_mae/confidence_sum - self.rmse = Math.sqrt(self.rmse/predictions.size) - #self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum) - self.r_squared = r**2 - self.finished_at = Time.now - save - $logger.debug "R^2 #{r**2}" - $logger.debug "RMSE #{rmse}" - $logger.debug "MAE #{mae}" + stat = ValidationStatistics.regression predictions + update_attributes(stat) end def correlation_plot diff --git a/lib/model.rb b/lib/model.rb index 1f9942b..5140d5a 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -90,33 +90,36 @@ module OpenTox end # make predictions - predictions = [] - predictions = compounds.collect{|c| predict_compound c} + predictions = {} + compounds.each do |c| + predictions[c.id.to_s] = predict_compound c + predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id + end # serialize result case object.class.to_s when "OpenTox::Compound" - prediction = predictions.first + prediction = predictions[compounds.first.id.to_s] prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity - return prediction + return predictions when "Array" return predictions when "OpenTox::Dataset" + predictions.each{|cid,p| p.delete(:neighbors)} # prepare prediction dataset measurement_feature = Feature.find prediction_feature_id - prediction_feature = OpenTox::NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" ) + prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" ) prediction_dataset = LazarPrediction.new( :name => "Lazar prediction for 
#{prediction_feature.name}", :creator => __FILE__, :prediction_feature_id => prediction_feature.id ) - confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Model RMSE" ) - warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings") - prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ] - prediction_dataset.compounds = compounds - prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:rmse] , p[:dataset_activities].to_s, p[:warning]]} + + compounds.each_with_index do |c,i| + prediction_dataset.predictions[c.id.to_s] = predictions[i] + end prediction_dataset.save return prediction_dataset end diff --git a/lib/validation.rb b/lib/validation.rb index b72d273..484e22e 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -8,7 +8,7 @@ module OpenTox field :test_dataset_id, type: BSON::ObjectId field :nr_instances, type: Integer field :nr_unpredicted, type: Integer - field :predictions, type: Array + field :predictions, type: Hash def prediction_dataset Dataset.find prediction_dataset_id @@ -29,30 +29,22 @@ module OpenTox atts[:training_dataset_id] = training_set.id validation_model = model.class.create training_set, atts validation_model.save - cids = test_set.compound_ids - - test_set_without_activities = Dataset.new(:compound_ids => cids.uniq) # remove duplicates and make sure that activities cannot be used - prediction_dataset = validation_model.predict test_set_without_activities - predictions = [] + predictions = validation_model.predict test_set.compounds + predictions.each{|cid,p| p.delete(:neighbors)} nr_unpredicted = 0 - activities = test_set.data_entries.collect{|de| de.first} - prediction_dataset.data_entries.each_with_index do |de,i| - if de[0] #and de[1] - cid = prediction_dataset.compound_ids[i] - rows = cids.each_index.select{|r| cids[r] == cid } - activities = rows.collect{|r| test_set.data_entries[r][0]} - prediction = de.first - 
confidence = de[1] - predictions << [prediction_dataset.compound_ids[i], activities, prediction, de[1]] + predictions.each do |cid,prediction| + if prediction[:value] + prediction[:measured] = test_set.data_entries[cid][prediction[:prediction_feature_id].to_s] else nr_unpredicted += 1 end + predictions.delete(cid) unless prediction[:value] and prediction[:measured] end validation = self.new( :model_id => validation_model.id, - :prediction_dataset_id => prediction_dataset.id, + #:prediction_dataset_id => prediction_dataset.id, :test_dataset_id => test_set.id, - :nr_instances => test_set.compound_ids.size, + :nr_instances => test_set.compounds.size, :nr_unpredicted => nr_unpredicted, :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence ) @@ -67,42 +59,6 @@ module OpenTox end class RegressionValidation < Validation - - def statistics - rmse = 0 - weighted_rmse = 0 - rse = 0 - weighted_rse = 0 - mae = 0 - weighted_mae = 0 - confidence_sum = 0 - predictions.each do |pred| - compound_id,activity,prediction,confidence = pred - if activity and prediction - error = Math.log10(prediction)-Math.log10(activity.median) - rmse += error**2 - weighted_rmse += confidence*error**2 - mae += error.abs - weighted_mae += confidence*error.abs - confidence_sum += confidence - else - warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." - $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." 
- end - end - x = predictions.collect{|p| p[1].median} - y = predictions.collect{|p| p[2]} - R.assign "measurement", x - R.assign "prediction", y - R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')" - r = R.eval("r").to_ruby - - mae = mae/predictions.size - weighted_mae = weighted_mae/confidence_sum - rmse = Math.sqrt(rmse/predictions.size) - weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) - { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae } - end end end diff --git a/test/classification.rb b/test/classification.rb index bedbe14..af23db6 100644 --- a/test/classification.rb +++ b/test/classification.rb @@ -33,8 +33,10 @@ class LazarClassificationTest < MiniTest::Test prediction = model.predict compound_dataset assert_equal compound_dataset.compounds, prediction.compounds - assert_equal "Could not find similar compounds with experimental data in the training dataset.", prediction.data_entries[7][3] - assert_equal "1 compounds have been removed from neighbors, because they have the same structure as the query compound.", prediction.data_entries[14][3] + cid = prediction.compounds[7].id.to_s + assert_equal "Could not find similar compounds with experimental data in the training dataset.", prediction.predictions[cid][:warning] + cid = prediction.compounds[9].id.to_s + assert_equal "1 compounds have been removed from neighbors, because they have the same structure as the query compound.", prediction.predictions[cid][:warning] # cleanup [training_dataset,model,compound_dataset].each{|o| o.delete} end diff --git a/test/validation.rb b/test/validation.rb index d8eea59..e702278 100644 --- a/test/validation.rb +++ b/test/validation.rb @@ -8,15 +8,15 @@ class ValidationTest < MiniTest::Test dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" model = Model::LazarClassification.create dataset cv = ClassificationCrossValidation.create model - assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7" + assert 
cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7, this may occur due to an unfavorable training/test set split" end def test_default_regression_crossvalidation dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv" model = Model::LazarRegression.create dataset cv = RegressionCrossValidation.create model - assert cv.rmse < 1.5, "RMSE > 1.5" - assert cv.mae < 1 + assert cv.rmse < 1.5, "RMSE #{cv.rmse} should be smaller than 1.5, this may occur due to an unfavorable training/test set split" + assert cv.mae < 1, "MAE #{cv.mae} should be smaller than 1, this may occur due to an unfavorable training/test set split" end # parameters -- cgit v1.2.3