summaryrefslogtreecommitdiff
path: root/lib/leave-one-out-validation.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/leave-one-out-validation.rb')
-rw-r--r--lib/leave-one-out-validation.rb238
1 files changed, 46 insertions, 192 deletions
diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb
index 2cd13db..538b7b3 100644
--- a/lib/leave-one-out-validation.rb
+++ b/lib/leave-one-out-validation.rb
@@ -1,205 +1,59 @@
module OpenTox
- class LeaveOneOutValidation
-
- field :model_id, type: BSON::ObjectId
- field :dataset_id, type: BSON::ObjectId
- field :nr_instances, type: Integer
- field :nr_unpredicted, type: Integer
- field :predictions, type: Array
- field :finished_at, type: Time
-
- def self.create model
- model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOutValidation : klass = RegressionLeaveOneOutValidation
- loo = klass.new :model_id => model.id, :dataset_id => model.training_dataset_id
- compound_ids = model.training_dataset.compound_ids
- predictions = model.predict model.training_dataset.compounds
- predictions = predictions.each_with_index {|p,i| p[:compound_id] = compound_ids[i]}
- predictions.select!{|p| p[:database_activities] and !p[:database_activities].empty?}
- loo.nr_instances = predictions.size
- predictions.select!{|p| p[:value]} # remove unpredicted
- loo.predictions = predictions#.sort{|a,b| b[:confidence] <=> a[:confidence]}
- loo.nr_unpredicted = loo.nr_instances - loo.predictions.size
- loo.statistics
- loo.save
- loo
- end
-
- def model
- Model::Lazar.find model_id
- end
- end
-
- class ClassificationLeaveOneOutValidation < LeaveOneOutValidation
-
- field :accept_values, type: Array
- field :confusion_matrix, type: Array, default: []
- field :weighted_confusion_matrix, type: Array, default: []
- field :accuracy, type: Float
- field :weighted_accuracy, type: Float
- field :true_rate, type: Hash, default: {}
- field :predictivity, type: Hash, default: {}
- field :confidence_plot_id, type: BSON::ObjectId
-
- def statistics
- accept_values = Feature.find(model.prediction_feature_id).accept_values
- confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
- weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
- predictions.each do |pred|
- pred[:database_activities].each do |db_act|
- if pred[:value]
- if pred[:value] == db_act
- if pred[:value] == accept_values[0]
- confusion_matrix[0][0] += 1
- weighted_confusion_matrix[0][0] += pred[:confidence]
- elsif pred[:value] == accept_values[1]
- confusion_matrix[1][1] += 1
- weighted_confusion_matrix[1][1] += pred[:confidence]
- end
- else
- if pred[:value] == accept_values[0]
- confusion_matrix[0][1] += 1
- weighted_confusion_matrix[0][1] += pred[:confidence]
- elsif pred[:value] == accept_values[1]
- confusion_matrix[1][0] += 1
- weighted_confusion_matrix[1][0] += pred[:confidence]
- end
- end
+ module Validation
+
+ class LeaveOneOut < Validation
+
+ def self.create model
+ bad_request_error "Cannot create leave one out validation for models with supervised feature selection. Please use crossvalidation instead." if model.algorithms[:feature_selection]
+ $logger.debug "#{model.name}: LOO validation started"
+ t = Time.now
+ model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOut : klass = RegressionLeaveOneOut
+ loo = klass.new :model_id => model.id
+ predictions = model.predict model.training_dataset.substances
+ predictions.each{|cid,p| p.delete(:neighbors)}
+ nr_unpredicted = 0
+ predictions.each do |cid,prediction|
+ if prediction[:value]
+ prediction[:measurements] = model.training_dataset.values(cid, prediction[:prediction_feature_id])
+ else
+ nr_unpredicted += 1
end
+ predictions.delete(cid) unless prediction[:value] and prediction[:measurements]
end
+ predictions.select!{|cid,p| p[:value] and p[:measurements]}
+ loo.nr_instances = predictions.size
+ loo.nr_unpredicted = nr_unpredicted
+ loo.predictions = predictions
+ loo.statistics
+ $logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds"
+ loo
end
- accept_values.each_with_index do |v,i|
- true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
- predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
- end
- confidence_sum = 0
- weighted_confusion_matrix.each do |r|
- r.each do |c|
- confidence_sum += c
- end
- end
- update_attributes(
- accept_values: accept_values,
- confusion_matrix: confusion_matrix,
- weighted_confusion_matrix: weighted_confusion_matrix,
- accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
- weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
- true_rate: true_rate,
- predictivity: predictivity,
- finished_at: Time.now
- )
- $logger.debug "Accuracy #{accuracy}"
- end
-
- def confidence_plot
- unless confidence_plot_id
- tmpfile = "/tmp/#{id.to_s}_confidence.svg"
- accuracies = []
- confidences = []
- correct_predictions = 0
- incorrect_predictions = 0
- predictions.each do |p|
- p[:database_activities].each do |db_act|
- if p[:value]
- p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1
- accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
- confidences << p[:confidence]
- end
- end
- end
- R.assign "accuracy", accuracies
- R.assign "confidence", confidences
- R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
- R.eval "ggsave(file='#{tmpfile}', plot=image)"
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
- plot_id = $gridfs.insert_one(file)
- update(:confidence_plot_id => plot_id)
- end
- $gridfs.find_one(_id: confidence_plot_id).data
end
- end
-
-
- class RegressionLeaveOneOutValidation < LeaveOneOutValidation
-
- field :rmse, type: Float, default: 0.0
- field :mae, type: Float, default: 0
- #field :weighted_rmse, type: Float, default: 0
- #field :weighted_mae, type: Float, default: 0
- field :r_squared, type: Float
- field :correlation_plot_id, type: BSON::ObjectId
- field :confidence_plot_id, type: BSON::ObjectId
-
- def statistics
- confidence_sum = 0
- predicted_values = []
- measured_values = []
- predictions.each do |pred|
- pred[:database_activities].each do |activity|
- if pred[:value]
- predicted_values << pred[:value]
- measured_values << activity
- error = Math.log10(pred[:value])-Math.log10(activity)
- self.rmse += error**2
- #self.weighted_rmse += pred[:confidence]*error**2
- self.mae += error.abs
- #self.weighted_mae += pred[:confidence]*error.abs
- #confidence_sum += pred[:confidence]
- end
- end
- if pred[:database_activities].empty?
- warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- end
- end
- R.assign "measurement", measured_values
- R.assign "prediction", predicted_values
- R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
- r = R.eval("r").to_ruby
-
- self.mae = self.mae/predictions.size
- #self.weighted_mae = self.weighted_mae/confidence_sum
- self.rmse = Math.sqrt(self.rmse/predictions.size)
- #self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum)
- self.r_squared = r**2
- self.finished_at = Time.now
- save
- $logger.debug "R^2 #{r**2}"
- $logger.debug "RMSE #{rmse}"
- $logger.debug "MAE #{mae}"
+ class ClassificationLeaveOneOut < LeaveOneOut
+ include ClassificationStatistics
+ field :accept_values, type: Array
+ field :confusion_matrix, type: Array, default: []
+ field :weighted_confusion_matrix, type: Array, default: []
+ field :accuracy, type: Float
+ field :weighted_accuracy, type: Float
+ field :true_rate, type: Hash, default: {}
+ field :predictivity, type: Hash, default: {}
+ field :confidence_plot_id, type: BSON::ObjectId
end
-
- def correlation_plot
- unless correlation_plot_id
- tmpfile = "/tmp/#{id.to_s}_correlation.svg"
- predicted_values = []
- measured_values = []
- predictions.each do |pred|
- pred[:database_activities].each do |activity|
- if pred[:value]
- predicted_values << pred[:value]
- measured_values << activity
- end
- end
- end
- attributes = Model::Lazar.find(self.model_id).attributes
- attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
- attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
- R.assign "measurement", measured_values
- R.assign "prediction", predicted_values
- R.eval "all = c(-log(measurement),-log(prediction))"
- R.eval "range = c(min(all), max(all))"
- R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
- R.eval "image = image + geom_abline(intercept=0, slope=1)"
- R.eval "ggsave(file='#{tmpfile}', plot=image)"
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg")
- plot_id = $gridfs.insert_one(file)
- update(:correlation_plot_id => plot_id)
- end
- $gridfs.find_one(_id: correlation_plot_id).data
+
+ class RegressionLeaveOneOut < LeaveOneOut
+ include RegressionStatistics
+ field :rmse, type: Float, default: 0
+ field :mae, type: Float, default: 0
+ field :r_squared, type: Float
+ field :within_prediction_interval, type: Integer, default:0
+ field :out_of_prediction_interval, type: Integer, default:0
+ field :correlation_plot_id, type: BSON::ObjectId
end
+
end
end