diff options
Diffstat (limited to 'lib/crossvalidation.rb')
-rw-r--r-- | lib/crossvalidation.rb | 388 |
1 files changed, 111 insertions, 277 deletions
module OpenTox

  module Validation

    # k-fold crossvalidation of a lazar model.
    #
    # Each fold is evaluated with TrainTest.create; only the ids of the
    # resulting validations are stored here. The `statistics` method called
    # by `create` is supplied by the subclasses' included modules.
    # NOTE(review): the nr_instances, nr_unpredicted, model_id, name and
    # finished_at attributes used below are not declared in this class --
    # presumably they come from the Validation superclass; confirm.
    class CrossValidation < Validation
      field :validation_ids, type: Array, default: []
      field :folds, type: Integer, default: 10

      # Run an n-fold crossvalidation for a model and persist the result.
      #
      # @param model [Model::LazarClassification, Model::LazarRegression]
      # @param n [Integer] number of folds
      # @return [ClassificationCrossValidation, RegressionCrossValidation]
      #   the saved crossvalidation, with statistics computed and
      #   finished_at set
      def self.create(model, n = 10)
        $logger.debug model.algorithms
        # Regression is tested first so that it wins if a model ever matches
        # both classes (same precedence as two consecutive assignments).
        klass = case model
                when Model::LazarRegression then RegressionCrossValidation
                when Model::LazarClassification then ClassificationCrossValidation
                end
        bad_request_error "Unknown model class #{model.class}." unless klass

        cv = klass.new(name: model.name, model_id: model.id, folds: n)
        cv.save # set created_at

        training_dataset = model.training_dataset
        training_dataset.folds(n).each_with_index do |fold, fold_nr|
          # Folds run sequentially on purpose: parallel execution of
          # validations (fork) can lead to Rserve and memory problems.
          $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
          t = Time.now
          validation = TrainTest.create(model, fold[0], fold[1])
          cv.validation_ids << validation.id
          # to_i keeps the accumulation working when a counter is still nil.
          cv.nr_instances = cv.nr_instances.to_i + validation.nr_instances
          cv.nr_unpredicted = cv.nr_unpredicted.to_i + validation.nr_unpredicted
          $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
        end
        cv.save
        # Report the accumulated counter, not a never-updated local.
        $logger.debug "Nr unpredicted: #{cv.nr_unpredicted}"
        cv.statistics
        cv.update_attributes(finished_at: Time.now)
        cv
      end

      # Duration of the crossvalidation run.
      #
      # @return [Float, nil] finished_at - created_at, or nil while either
      #   timestamp is missing
      def time
        return unless finished_at && created_at

        finished_at - created_at
      end

      # @return [Array] the validations of the individual folds, loaded with
      #   TrainTest.find
      def validations
        validation_ids.map { |vid| TrainTest.find(vid) }
      end

      # Predictions of all folds merged into a single hash.
      # On duplicate keys the entry of the later fold wins (Hash#merge!).
      #
      # @return [Hash]
      def predictions
        validations.each_with_object({}) { |validation, merged| merged.merge!(validation.predictions) }
      end
    end

    # Crossvalidation of a classification model.
    # Only the result fields are declared here; ClassificationStatistics is
    # mixed in (its definition is outside this file).
    class ClassificationCrossValidation < CrossValidation
      include ClassificationStatistics
      field :accept_values, type: Array
      field :confusion_matrix, type: Array
      field :weighted_confusion_matrix, type: Array
      field :accuracy, type: Float
      field :weighted_accuracy, type: Float
      field :true_rate, type: Hash
      field :predictivity, type: Hash
      field :probability_plot_id, type: BSON::ObjectId
    end

    # Crossvalidation of a regression model.
    # Only the result fields are declared here; RegressionStatistics is
    # mixed in (its definition is outside this file).
    class RegressionCrossValidation < CrossValidation
      include RegressionStatistics
      field :rmse, type: Float, default: 0
      field :mae, type: Float, default: 0
      field :r_squared, type: Float
      field :within_prediction_interval, type: Integer, default: 0
      field :out_of_prediction_interval, type: Integer, default: 0
      field :correlation_plot_id, type: BSON::ObjectId
    end

    # A series of independent crossvalidations of the same model.
    class RepeatedCrossValidation < Validation
      field :crossvalidation_ids, type: Array, default: []
      field :correlation_plot_id, type: BSON::ObjectId

      # Plot formats accepted by #correlation_plot. Each name is also the
      # name of the R graphics device function used to write the file.
      PLOT_FORMATS = %w[png pdf jpeg svg].freeze

      # Run `repeats` crossvalidations with `folds` folds each.
      #
      # @param model [Model::LazarClassification, Model::LazarRegression]
      # @param folds [Integer] number of folds per crossvalidation
      # @param repeats [Integer] number of crossvalidations
      # @return [RepeatedCrossValidation] the saved instance
      def self.create(model, folds = 10, repeats = 3)
        repeated_cross_validation = new
        repeats.times do |n|
          $logger.debug "Crossvalidation #{n+1} for #{model.name}"
          repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
        end
        repeated_cross_validation.save
        repeated_cross_validation
      end

      # @return [Array] the crossvalidations, loaded with CrossValidation.find
      def crossvalidations
        crossvalidation_ids.map { |id| CrossValidation.find(id) }
      end

      # Draw one measurement/prediction scatter plot per crossvalidation,
      # arrange them side by side, store the image in GridFS and return it.
      #
      # The plot is regenerated and stored again on every call; the stored
      # correlation_plot_id is overwritten, not reused.
      #
      # @param format [String] one of PLOT_FORMATS
      # @return [String] image data read back from GridFS
      # @raise [ArgumentError] if format is not in PLOT_FORMATS
      def correlation_plot(format: "png")
        format = format.to_s
        # Whitelisting also keeps arbitrary text out of the R code below.
        unless PLOT_FORMATS.include?(format)
          raise ArgumentError, "Unsupported plot format '#{format}' (supported: #{PLOT_FORMATS.join(', ')})."
        end

        cvs = crossvalidations # load once; every call queries the database
        # NOTE(review): find is given model.prediction_feature, not an id
        # named *_id -- confirm that this is what Feature.find expects.
        feature = Feature.find(cvs.first.model.prediction_feature)
        title = feature.name
        title += "[#{feature.unit}]" unless feature.unit.blank?
        tmpfile = "/tmp/#{id}_correlation.#{format}"

        images = cvs.each_with_index.map do |cv, i|
          measured = []
          predicted = []
          cv.predictions.each_value do |p|
            measured << p["measurements"].median
            predicted << p["value"]
          end
          # Distinct R variable names per panel: each plot keeps its own data
          # even if the vectors are only evaluated when the figure is drawn.
          R.assign "measurement#{i}", measured
          R.assign "prediction#{i}", predicted
          # Passed as data rather than spliced into R source, so quotes in
          # the feature name cannot break the R expression.
          R.assign "title#{i}", "#{title} #{i}"
          R.eval "all#{i} = c(measurement#{i},prediction#{i})"
          R.eval "range#{i} = c(min(all#{i}), max(all#{i}))"
          R.eval "image#{i} = qplot(prediction#{i},measurement#{i},main=title#{i},xlab='Prediction',ylab='Measurement',asp=1,xlim=range#{i}, ylim=range#{i})"
          R.eval "image#{i} = image#{i} + geom_abline(intercept=0, slope=1)"
          "image#{i}"
        end

        begin
          # Open the device that matches the requested format, so the file
          # content agrees with its extension.
          R.eval "#{format}('#{tmpfile}')"
          R.eval "grid.arrange(#{images.join ","},ncol=#{images.size})"
          R.eval "dev.off()"
          # binread: image data must not pass through text-mode decoding.
          file = Mongo::Grid::File.new(File.binread(tmpfile), :filename => "#{id}_correlation_plot.#{format}")
        ensure
          File.delete(tmpfile) if File.exist?(tmpfile)
        end
        plot_id = $gridfs.insert_one(file)
        update(:correlation_plot_id => plot_id)
        $gridfs.find_one(_id: plot_id).data
      end
    end
  end

end