From 84222bae2bbb9fb3e0ce3e65de1be8e7f94d2147 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 12 Apr 2016 12:37:37 +0200 Subject: new dataset structure --- lib/crossvalidation.rb | 1 - 1 file changed, 1 deletion(-) (limited to 'lib/crossvalidation.rb') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 15dfb21..b7cd7bf 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -297,5 +297,4 @@ module OpenTox end end - end -- cgit v1.2.3 From a8368dda776c05331474adf7eaf9a6e413a3b1eb Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 13 Apr 2016 15:15:51 +0200 Subject: validation tests pass --- lib/crossvalidation.rb | 109 ++++--------------------------------------------- 1 file changed, 8 insertions(+), 101 deletions(-) (limited to 'lib/crossvalidation.rb') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index b7cd7bf..f93a04c 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -6,7 +6,7 @@ module OpenTox field :folds, type: Integer field :nr_instances, type: Integer field :nr_unpredicted, type: Integer - field :predictions, type: Array, default: [] + field :predictions, type: Hash, default: {} field :finished_at, type: Time def time @@ -32,7 +32,7 @@ module OpenTox cv.save # set created_at nr_instances = 0 nr_unpredicted = 0 - predictions = [] + predictions = {} training_dataset = Dataset.find model.training_dataset_id training_dataset.folds(n).each_with_index do |fold,fold_nr| #fork do # parallel execution of validations @@ -42,12 +42,12 @@ module OpenTox $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" #end end - #Process.waitall + Process.waitall cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id) cv.validations.each do |validation| nr_instances += validation.nr_instances nr_unpredicted += validation.nr_unpredicted - predictions += validation.predictions + predictions.merge! validation.predictions end cv.update_attributes( nr_instances: nr_instances, @@ -73,61 +73,8 @@ module OpenTox # TODO auc, f-measure (usability??) def statistics - accept_values = Feature.find(model.prediction_feature_id).accept_values - confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} - weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)} - true_rate = {} - predictivity = {} - predictions.each do |pred| - compound_id,activities,prediction,confidence = pred - if activities and prediction #and confidence.numeric? - if activities.uniq.size == 1 - activity = activities.uniq.first - if prediction == activity - if prediction == accept_values[0] - confusion_matrix[0][0] += 1 - #weighted_confusion_matrix[0][0] += confidence - elsif prediction == accept_values[1] - confusion_matrix[1][1] += 1 - #weighted_confusion_matrix[1][1] += confidence - end - elsif prediction != activity - if prediction == accept_values[0] - confusion_matrix[0][1] += 1 - #weighted_confusion_matrix[0][1] += confidence - elsif prediction == accept_values[1] - confusion_matrix[1][0] += 1 - #weighted_confusion_matrix[1][0] += confidence - end - end - end - else - nr_unpredicted += 1 if prediction.nil? - end - end - true_rate = {} - predictivity = {} - accept_values.each_with_index do |v,i| - true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f - predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f - end - confidence_sum = 0 - #weighted_confusion_matrix.each do |r| - #r.each do |c| - #confidence_sum += c - #end - #end - update_attributes( - accept_values: accept_values, - confusion_matrix: confusion_matrix, - #weighted_confusion_matrix: weighted_confusion_matrix, - accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f, - #weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f, - true_rate: true_rate, - predictivity: predictivity, - finished_at: Time.now - ) - $logger.debug "Accuracy #{accuracy}" + stat = ValidationStatistics.classification(predictions, Feature.find(model.prediction_feature_id).accept_values) + update_attributes(stat) end def confidence_plot @@ -169,48 +116,8 @@ module OpenTox field :correlation_plot_id, type: BSON::ObjectId def statistics - rmse = 0 - mae = 0 - x = [] - y = [] - predictions.each do |pred| - compound_id,activity,prediction,confidence = pred - if activity and prediction - unless activity == [nil] - x << -Math.log10(activity.median) - y << -Math.log10(prediction) - error = Math.log10(prediction)-Math.log10(activity.median) - rmse += error**2 - #weighted_rmse += confidence*error**2 - mae += error.abs - #weighted_mae += confidence*error.abs - #confidence_sum += confidence - end - else - warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." - $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." - end - end - R.assign "measurement", x - R.assign "prediction", y - R.eval "r <- cor(measurement,prediction,use='complete')" - r = R.eval("r").to_ruby - - mae = mae/predictions.size - #weighted_mae = weighted_mae/confidence_sum - rmse = Math.sqrt(rmse/predictions.size) - #weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum) - update_attributes( - mae: mae, - rmse: rmse, - #weighted_mae: weighted_mae, - #weighted_rmse: weighted_rmse, - r_squared: r**2, - finished_at: Time.now - ) - $logger.debug "R^2 #{r**2}" - $logger.debug "RMSE #{rmse}" - $logger.debug "MAE #{mae}" + stat = ValidationStatistics.regression predictions + update_attributes(stat) end def misclassifications n=nil -- cgit v1.2.3 From 8aab046eb1ad39aaf10c5a8596102c35c7b2ee0b Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 15 Apr 2016 11:01:16 +0200 Subject: data_entries removed from datasets. datasets are now just containers for compounds and features, feature values have to be retrieved from substances. --- lib/crossvalidation.rb | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'lib/crossvalidation.rb') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index f93a04c..752d393 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -22,8 +22,10 @@ module OpenTox end def self.create model, n=10 - model.training_dataset.features.first.nominal? ? klass = ClassificationCrossValidation : klass = RegressionCrossValidation - bad_request_error "#{dataset.features.first} is neither nominal nor numeric." unless klass + klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification + klass = RegressionCrossValidation if model.is_a? Model::LazarRegression + bad_request_error "Unknown model class #{model.class}." unless klass + cv = klass.new( name: model.name, model_id: model.id, @@ -35,7 +37,7 @@ module OpenTox predictions = {} training_dataset = Dataset.find model.training_dataset_id training_dataset.folds(n).each_with_index do |fold,fold_nr| - #fork do # parallel execution of validations + #fork do # parallel execution of validations can lead to Rserve and memory problems $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" t = Time.now validation = Validation.create(model, fold[0], fold[1],cv) @@ -121,7 +123,6 @@ module OpenTox end def misclassifications n=nil - #n = predictions.size unless n n ||= 10 model = Model::Lazar.find(self.model_id) training_dataset = Dataset.find(model.training_dataset_id) @@ -132,8 +133,7 @@ module OpenTox neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters) neighbors.collect! do |n| neighbor = Compound.find(n[0]) - values = training_dataset.values(neighbor,prediction_feature) - { :smiles => neighbor.smiles, :similarity => n[1], :measurements => values} + { :smiles => neighbor.smiles, :similarity => n[1], :measurements => neighbor.toxicities[prediction_feature.id.to_s]} end { :smiles => compound.smiles, -- cgit v1.2.3 From 4ebd80fee52c04bd36781f846eae60019918345d Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 21 Apr 2016 14:29:23 +0200 Subject: initial classification probabilities --- lib/crossvalidation.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/crossvalidation.rb') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 15dfb21..6ffeb25 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -52,7 +52,7 @@ module OpenTox cv.update_attributes( nr_instances: nr_instances, nr_unpredicted: nr_unpredicted, - predictions: predictions#.sort{|a,b| b[3] <=> a[3]} # sort according to confidence + predictions: predictions ) $logger.debug "Nr unpredicted: #{nr_unpredicted}" cv.statistics -- cgit v1.2.3 From 32d767ee7cfcc19337892551906950621f348174 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 28 Apr 2016 08:11:12 +0200 Subject: nanoparticle crossvalidation technically working --- lib/crossvalidation.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/crossvalidation.rb') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 50afb6f..0ae36c4 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -44,7 +44,7 @@ module OpenTox $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" #end end - Process.waitall + #Process.waitall cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id) cv.validations.each do |validation| nr_instances += validation.nr_instances -- cgit v1.2.3 From 05386e748270c337c66f6f379317ea4b25905236 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 4 May 2016 19:24:42 +0200 Subject: first reasonable results for nanoparticle crossvalidation --- lib/crossvalidation.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/crossvalidation.rb') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 0ae36c4..e1f956b 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -141,7 +141,7 @@ module OpenTox :measured => p[1], :predicted => p[2], #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs, - :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs, + :error => (p[1]-p[2]).abs, :relative_error => (p[1]-p[2]).abs/p[1], :confidence => p[3], :neighbors => neighbors @@ -152,7 +152,7 @@ module OpenTox def confidence_plot tmpfile = "/tmp/#{id.to_s}_confidence.png" - sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact + sorted_predictions = predictions.collect{|p| [(p[1]-p[2]).abs,p[3]] if p[1] and p[2]}.compact R.assign "error", sorted_predictions.collect{|p| p[0]} R.assign "confidence", sorted_predictions.collect{|p| p[1]} # TODO fix axis names -- cgit v1.2.3 From 06fc914653face2c58fd4e6c47161cb03e217582 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Sun, 8 May 2016 12:22:58 +0200 Subject: default validations fixed --- lib/crossvalidation.rb | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'lib/crossvalidation.rb') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index e1f956b..8e0c5b9 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -133,14 +133,12 @@ module OpenTox neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters) neighbors.collect! do |n| neighbor = Compound.find(n[0]) - { :smiles => neighbor.smiles, :similarity => n[1], :measurements => neighbor.toxicities[prediction_feature.id.to_s]} + { :smiles => neighbor.smiles, :similarity => n[1], :measurements => neighbor.toxicities[prediction_feature.id.to_s][training_dataset.id.to_s]} end { :smiles => compound.smiles, - #:fingerprint => compound.fp4.collect{|id| Smarts.find(id).name}, :measured => p[1], :predicted => p[2], - #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs, :error => (p[1]-p[2]).abs, :relative_error => (p[1]-p[2]).abs/p[1], :confidence => p[3], -- cgit v1.2.3 From b8bb12c8a163c238d7d4387c1914e2100bb660df Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 12 May 2016 15:23:01 +0200 Subject: enm study import fixed --- lib/crossvalidation.rb | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'lib/crossvalidation.rb') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 8e0c5b9..da4b731 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -77,6 +77,7 @@ module OpenTox def statistics stat = ValidationStatistics.classification(predictions, Feature.find(model.prediction_feature_id).accept_values) update_attributes(stat) + stat end def confidence_plot @@ -120,6 +121,7 @@ module OpenTox def statistics stat = ValidationStatistics.regression predictions update_attributes(stat) + stat end def misclassifications n=nil @@ -164,24 +166,29 @@ module OpenTox end def correlation_plot - unless correlation_plot_id + #unless correlation_plot_id tmpfile = "/tmp/#{id.to_s}_correlation.png" - x = predictions.collect{|p| p[1]} - y = predictions.collect{|p| p[2]} + x = [] + y = [] + predictions.each do |sid,p| + x << p["value"] + y << p["measured"].median + end attributes = Model::Lazar.find(self.model_id).attributes attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key} attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n") R.assign "measurement", x R.assign "prediction", y - R.eval "all = c(-log(measurement),-log(prediction))" + R.eval "all = c(measurement,prediction)" R.eval "range = c(min(all), max(all))" - R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)" + R.eval "image = qplot(prediction,measurement,main='#{self.name}',asp=1,xlim=range, ylim=range)" R.eval "image = image + geom_abline(intercept=0, slope=1)" - R.eval "ggsave(file='#{tmpfile}', plot=image)" + #R.eval "ggsave(file='#{tmpfile}', plot=image)" + R.eval "ggsave(file='#{tmpfile}')" file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png") plot_id = $gridfs.insert_one(file) update(:correlation_plot_id => plot_id) - end + #end $gridfs.find_one(_id: correlation_plot_id).data end end -- cgit v1.2.3 From c90644211e214a50f6fdb3a936bf247f45f1f4be Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 13 May 2016 13:38:24 +0200 Subject: compound tests fixed --- lib/crossvalidation.rb | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) (limited to 'lib/crossvalidation.rb') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index da4b731..357f0fa 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -41,6 +41,7 @@ module OpenTox $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" t = Time.now validation = Validation.create(model, fold[0], fold[1],cv) + #p validation $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" #end end @@ -166,29 +167,10 @@ module OpenTox end def correlation_plot - #unless correlation_plot_id - tmpfile = "/tmp/#{id.to_s}_correlation.png" - x = [] - y = [] - predictions.each do |sid,p| - x << p["value"] - y << p["measured"].median - end - attributes = Model::Lazar.find(self.model_id).attributes - attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key} - attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n") - R.assign "measurement", x - R.assign "prediction", y - R.eval "all = c(measurement,prediction)" - R.eval "range = c(min(all), max(all))" - R.eval "image = qplot(prediction,measurement,main='#{self.name}',asp=1,xlim=range, ylim=range)" - R.eval "image = image + geom_abline(intercept=0, slope=1)" - #R.eval "ggsave(file='#{tmpfile}', plot=image)" - R.eval "ggsave(file='#{tmpfile}')" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png") - plot_id = $gridfs.insert_one(file) + unless correlation_plot_id + plot_id = ValidationStatistics.correlation_plot predictions update(:correlation_plot_id => plot_id) - #end + end $gridfs.find_one(_id: correlation_plot_id).data end end -- cgit v1.2.3 From f46ba3b7262f5b551c81fc9396c5b7f0cac7f030 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 27 May 2016 19:16:16 +0200 Subject: first correlation of nanoparticle predictions --- lib/crossvalidation.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/crossvalidation.rb') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 357f0fa..420dd8c 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -168,7 +168,7 @@ module OpenTox def correlation_plot unless correlation_plot_id - plot_id = ValidationStatistics.correlation_plot predictions + plot_id = ValidationStatistics.correlation_plot id, predictions update(:correlation_plot_id => plot_id) end $gridfs.find_one(_id: correlation_plot_id).data -- cgit v1.2.3 From b515a0cfedb887a2af753db6e4a08ae1af430cad Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 31 May 2016 18:08:08 +0200 Subject: cleanup of validation modules/classes --- lib/crossvalidation.rb | 251 +++++++++++++++---------------------------------- 1 file changed, 77 insertions(+), 174 deletions(-) (limited to 'lib/crossvalidation.rb') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 420dd8c..22071d8 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -1,193 +1,96 @@ module OpenTox - class CrossValidation - field :validation_ids, type: Array, default: [] - field :model_id, type: BSON::ObjectId - field :folds, type: Integer - field :nr_instances, type: Integer - field :nr_unpredicted, type: Integer - field :predictions, type: Hash, default: {} - field :finished_at, type: Time - - def time - finished_at - created_at - end - - def validations - validation_ids.collect{|vid| Validation.find vid} - end - - def model - Model::Lazar.find model_id - end - - def self.create model, n=10 - klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification - klass = RegressionCrossValidation if model.is_a? Model::LazarRegression - bad_request_error "Unknown model class #{model.class}." unless klass - - cv = klass.new( - name: model.name, - model_id: model.id, - folds: n - ) - cv.save # set created_at - nr_instances = 0 - nr_unpredicted = 0 - predictions = {} - training_dataset = Dataset.find model.training_dataset_id - training_dataset.folds(n).each_with_index do |fold,fold_nr| - #fork do # parallel execution of validations can lead to Rserve and memory problems - $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" - t = Time.now - validation = Validation.create(model, fold[0], fold[1],cv) - #p validation - $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" - #end - end - #Process.waitall - cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id) - cv.validations.each do |validation| - nr_instances += validation.nr_instances - nr_unpredicted += validation.nr_unpredicted - predictions.merge! validation.predictions + module Validation + class CrossValidation < Validation + field :validation_ids, type: Array, default: [] + field :model_id, type: BSON::ObjectId + field :folds, type: Integer, default: 10 + field :nr_instances, type: Integer, default: 0 + field :nr_unpredicted, type: Integer, default: 0 + field :predictions, type: Hash, default: {} + + def time + finished_at - created_at end - cv.update_attributes( - nr_instances: nr_instances, - nr_unpredicted: nr_unpredicted, - predictions: predictions - ) - $logger.debug "Nr unpredicted: #{nr_unpredicted}" - cv.statistics - cv - end - end - class ClassificationCrossValidation < CrossValidation - - field :accept_values, type: Array - field :confusion_matrix, type: Array - field :weighted_confusion_matrix, type: Array - field :accuracy, type: Float - field :weighted_accuracy, type: Float - field :true_rate, type: Hash - field :predictivity, type: Hash - field :confidence_plot_id, type: BSON::ObjectId - # TODO auc, f-measure (usability??) - - def statistics - stat = ValidationStatistics.classification(predictions, Feature.find(model.prediction_feature_id).accept_values) - update_attributes(stat) - stat - end + def validations + validation_ids.collect{|vid| TrainTest.find vid} + end - def confidence_plot - unless confidence_plot_id - tmpfile = "/tmp/#{id.to_s}_confidence.png" - accuracies = [] - confidences = [] - correct_predictions = 0 - incorrect_predictions = 0 - predictions.each do |p| - if p[1] and p[2] - p[1] == p[2] ? correct_predictions += 1 : incorrect_predictions += 1 - accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f - confidences << p[3] + def model + Model::Lazar.find model_id + end - end + def self.create model, n=10 + klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification + klass = RegressionCrossValidation if model.is_a? Model::LazarRegression + bad_request_error "Unknown model class #{model.class}." unless klass + + cv = klass.new( + name: model.name, + model_id: model.id, + folds: n + ) + cv.save # set created_at + nr_instances = 0 + nr_unpredicted = 0 + predictions = {} + training_dataset = Dataset.find model.training_dataset_id + training_dataset.folds(n).each_with_index do |fold,fold_nr| + #fork do # parallel execution of validations can lead to Rserve and memory problems + $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" + t = Time.now + validation = TrainTest.create(model, fold[0], fold[1]) + cv.validation_ids << validation.id + cv.nr_instances += validation.nr_instances + cv.nr_unpredicted += validation.nr_unpredicted + cv.predictions.merge! validation.predictions + $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" + #end end - R.assign "accuracy", accuracies - R.assign "confidence", confidences - R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()" - R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png") - plot_id = $gridfs.insert_one(file) - update(:confidence_plot_id => plot_id) + #Process.waitall + cv.save + $logger.debug "Nr unpredicted: #{nr_unpredicted}" + cv.statistics + cv.update_attributes(finished_at: Time.now) + cv end - $gridfs.find_one(_id: confidence_plot_id).data - end - - #Average area under roc 0.646 - #Area under roc 0.646 - #F measure carcinogen: 0.769, noncarcinogen: 0.348 - end - - class RegressionCrossValidation < CrossValidation - - field :rmse, type: Float - field :mae, type: Float - field :r_squared, type: Float - field :correlation_plot_id, type: BSON::ObjectId - - def statistics - stat = ValidationStatistics.regression predictions - update_attributes(stat) - stat end - def misclassifications n=nil - n ||= 10 - model = Model::Lazar.find(self.model_id) - training_dataset = Dataset.find(model.training_dataset_id) - prediction_feature = training_dataset.features.first - predictions.collect do |p| - unless p.include? nil - compound = Compound.find(p[0]) - neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters) - neighbors.collect! do |n| - neighbor = Compound.find(n[0]) - { :smiles => neighbor.smiles, :similarity => n[1], :measurements => neighbor.toxicities[prediction_feature.id.to_s][training_dataset.id.to_s]} - end - { - :smiles => compound.smiles, - :measured => p[1], - :predicted => p[2], - :error => (p[1]-p[2]).abs, - :relative_error => (p[1]-p[2]).abs/p[1], - :confidence => p[3], - :neighbors => neighbors - } - end - end.compact.sort{|a,b| b[:relative_error] <=> a[:relative_error]}[0..n-1] + class ClassificationCrossValidation < CrossValidation + include ClassificationStatistics + field :accept_values, type: Array + field :confusion_matrix, type: Array + field :weighted_confusion_matrix, type: Array + field :accuracy, type: Float + field :weighted_accuracy, type: Float + field :true_rate, type: Hash + field :predictivity, type: Hash + field :confidence_plot_id, type: BSON::ObjectId end - def confidence_plot - tmpfile = "/tmp/#{id.to_s}_confidence.png" - sorted_predictions = predictions.collect{|p| [(p[1]-p[2]).abs,p[3]] if p[1] and p[2]}.compact - R.assign "error", sorted_predictions.collect{|p| p[0]} - R.assign "confidence", sorted_predictions.collect{|p| p[1]} - # TODO fix axis names - R.eval "image = qplot(confidence,error)" - R.eval "image = image + stat_smooth(method='lm', se=FALSE)" - R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.png") - plot_id = $gridfs.insert_one(file) - update(:confidence_plot_id => plot_id) - $gridfs.find_one(_id: confidence_plot_id).data + class RegressionCrossValidation < CrossValidation + include RegressionStatistics + field :rmse, type: Float + field :mae, type: Float + field :r_squared, type: Float + field :correlation_plot_id, type: BSON::ObjectId end - def correlation_plot - unless correlation_plot_id - plot_id = ValidationStatistics.correlation_plot id, predictions - update(:correlation_plot_id => plot_id) + class RepeatedCrossValidation < Validation + field :crossvalidation_ids, type: Array, default: [] + def self.create model, folds=10, repeats=3 + repeated_cross_validation = self.new + repeats.times do |n| + $logger.debug "Crossvalidation #{n+1} for #{model.name}" + repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id + end + repeated_cross_validation.save + repeated_cross_validation end - $gridfs.find_one(_id: correlation_plot_id).data - end - end - - class RepeatedCrossValidation - field :crossvalidation_ids, type: Array, default: [] - def self.create model, folds=10, repeats=3 - repeated_cross_validation = self.new - repeats.times do |n| - $logger.debug "Crossvalidation #{n+1} for #{model.name}" - repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id + def crossvalidations + crossvalidation_ids.collect{|id| CrossValidation.find(id)} end - repeated_cross_validation.save - repeated_cross_validation - end - def crossvalidations - crossvalidation_ids.collect{|id| CrossValidation.find(id)} end end -- cgit v1.2.3 From 65b69d4c35890a7a2d2992108f0cf4eb5202dd1b Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 1 Jun 2016 10:37:00 +0200 Subject: validation tests fixed --- lib/crossvalidation.rb | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'lib/crossvalidation.rb') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 22071d8..15e25a5 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -3,23 +3,7 @@ module OpenTox module Validation class CrossValidation < Validation field :validation_ids, type: Array, default: [] - field :model_id, type: BSON::ObjectId field :folds, type: Integer, default: 10 - field :nr_instances, type: Integer, default: 0 - field :nr_unpredicted, type: Integer, default: 0 - field :predictions, type: Hash, default: {} - - def time - finished_at - created_at - end - - def validations - validation_ids.collect{|vid| TrainTest.find vid} - end - - def model - Model::Lazar.find model_id - end def self.create model, n=10 klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification @@ -55,6 +39,14 @@ module OpenTox cv.update_attributes(finished_at: Time.now) cv end + + def time + finished_at - created_at + end + + def validations + validation_ids.collect{|vid| TrainTest.find vid} + end end class ClassificationCrossValidation < CrossValidation -- cgit v1.2.3 From f7e87b45f15083e5fcdea64821f06ed93ece4c4e Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 7 Jun 2016 18:07:28 +0200 Subject: (repeated)crossvalidation plots --- lib/crossvalidation.rb | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'lib/crossvalidation.rb') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 15e25a5..7aae3d2 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -71,6 +71,8 @@ module OpenTox class RepeatedCrossValidation < Validation field :crossvalidation_ids, type: Array, default: [] + field :correlation_plot_id, type: BSON::ObjectId + def self.create model, folds=10, repeats=3 repeated_cross_validation = self.new repeats.times do |n| @@ -80,9 +82,42 @@ module OpenTox repeated_cross_validation.save repeated_cross_validation end + def crossvalidations crossvalidation_ids.collect{|id| CrossValidation.find(id)} end + + def correlation_plot format: "png" + #unless correlation_plot_id + feature = Feature.find(crossvalidations.first.model.prediction_feature) + title = feature.name + title += "[#{feature.unit}]" if feature.unit and !feature.unit.blank? + tmpfile = "/tmp/#{id.to_s}_correlation.#{format}" + images = [] + crossvalidations.each_with_index do |cv,i| + x = [] + y = [] + cv.predictions.each do |sid,p| + x << p["value"] + y << p["measurements"].median + end + R.assign "measurement", x + R.assign "prediction", y + R.eval "all = c(measurement,prediction)" + R.eval "range = c(min(all), max(all))" + R.eval "image#{i} = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" + R.eval "image#{i} = image#{i} + geom_abline(intercept=0, slope=1)" + images << "image#{i}" + end + R.eval "pdf('#{tmpfile}')" + R.eval "grid.arrange(#{images.join ","},ncol=#{images.size})" + R.eval "dev.off()" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}") + correlation_plot_id = $gridfs.insert_one(file) + update(:correlation_plot_id => correlation_plot_id) + #end + $gridfs.find_one(_id: correlation_plot_id).data + end end end -- cgit v1.2.3 From 46c628f1757ce8274a0b277b3ec3306609b38c14 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 25 Jul 2016 15:53:22 +0200 Subject: local_weighted_average fallback fixed, cv predictions pulled from validations to avoid mongo document size errors --- lib/crossvalidation.rb | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'lib/crossvalidation.rb') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 7aae3d2..d7a1f08 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -18,7 +18,7 @@ module OpenTox cv.save # set created_at nr_instances = 0 nr_unpredicted = 0 - predictions = {} + #predictions = {} training_dataset = Dataset.find model.training_dataset_id training_dataset.folds(n).each_with_index do |fold,fold_nr| #fork do # parallel execution of validations can lead to Rserve and memory problems @@ -28,7 +28,7 @@ module OpenTox cv.validation_ids << validation.id cv.nr_instances += validation.nr_instances cv.nr_unpredicted += validation.nr_unpredicted - cv.predictions.merge! validation.predictions + #cv.predictions.merge! validation.predictions $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" #end end @@ -47,6 +47,12 @@ module OpenTox def validations validation_ids.collect{|vid| TrainTest.find vid} end + + def predictions + predictions = {} + validations.each{|v| predictions.merge!(v.predictions)} + predictions + end end class ClassificationCrossValidation < CrossValidation -- cgit v1.2.3 From 91787edb3682900bc5a2feeca66e5142f387fcc6 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 7 Oct 2016 10:25:58 +0200 Subject: unified interface for prediction algorithms --- lib/crossvalidation.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/crossvalidation.rb') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index d7a1f08..15d1031 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -16,10 +16,10 @@ module OpenTox folds: n ) cv.save # set created_at + nr_instances = 0 nr_unpredicted = 0 - #predictions = {} - training_dataset = Dataset.find model.training_dataset_id + training_dataset = model.training_dataset training_dataset.folds(n).each_with_index do |fold,fold_nr| #fork do # parallel execution of validations can lead to Rserve and memory problems $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" -- cgit v1.2.3 From 8519274487166d75b3b9ae28e61f7a7be9f7e83c Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 27 Oct 2016 11:58:07 +0200 Subject: probability plot for classification validations --- lib/crossvalidation.rb | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'lib/crossvalidation.rb') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 15d1031..4f779a2 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -64,14 +64,16 @@ module OpenTox field :weighted_accuracy, type: Float field :true_rate, type: Hash field :predictivity, type: Hash - field :confidence_plot_id, type: BSON::ObjectId + field :probability_plot_id, type: BSON::ObjectId end class RegressionCrossValidation < CrossValidation include RegressionStatistics - field :rmse, type: Float - field :mae, type: Float + field :rmse, type: Float, default:0 + field :mae, type: Float, default:0 field :r_squared, type: Float + field :within_prediction_interval, type: Integer, default:0 + field :out_of_prediction_interval, type: Integer, default:0 field :correlation_plot_id, type: BSON::ObjectId end @@ -93,6 +95,7 @@ module OpenTox crossvalidation_ids.collect{|id| CrossValidation.find(id)} end +=begin def correlation_plot format: "png" #unless correlation_plot_id feature = Feature.find(crossvalidations.first.model.prediction_feature) @@ -104,16 +107,18 @@ module OpenTox x = [] y = [] cv.predictions.each do |sid,p| - x << p["value"] - y << p["measurements"].median + x << p["measurements"].median + y << p["value"] end R.assign "measurement", x R.assign "prediction", y R.eval "all = c(measurement,prediction)" R.eval "range = c(min(all), max(all))" - R.eval "image#{i} = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" + R.eval "image#{i} = qplot(prediction,measurement,main='#{title} #{i}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" R.eval "image#{i} = image#{i} + geom_abline(intercept=0, slope=1)" images << "image#{i}" + + R.eval "ggsave(file='/home/ist/lazar/test/tmp#{i}.pdf', plot=image#{i})" end R.eval "pdf('#{tmpfile}')" R.eval "grid.arrange(#{images.join ","},ncol=#{images.size})" @@ -124,6 +129,7 @@ module OpenTox #end $gridfs.find_one(_id: correlation_plot_id).data end +=end end end -- cgit v1.2.3 From 280f81dcffb3b8b929ff9cbe92ba17403f5a9dd3 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 28 Oct 2016 12:31:53 +0200 Subject: adjusted r^2 removed (does not apply well to local models) --- lib/crossvalidation.rb | 2 -- 1 file changed, 2 deletions(-) (limited to 'lib/crossvalidation.rb') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 4f779a2..be680ae 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -95,7 +95,6 @@ module OpenTox crossvalidation_ids.collect{|id| CrossValidation.find(id)} end -=begin def correlation_plot format: "png" #unless correlation_plot_id feature = Feature.find(crossvalidations.first.model.prediction_feature) @@ -129,7 +128,6 @@ module OpenTox #end $gridfs.find_one(_id: correlation_plot_id).data end -=end end end -- cgit v1.2.3 From 99c42f76b02f9084d0757eb0c52b4a55fa295a95 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 11 Nov 2016 17:19:13 +0100 Subject: p-chem regression and enm import fixed --- lib/crossvalidation.rb | 1 + 1 file changed, 1 insertion(+) (limited to 'lib/crossvalidation.rb') diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index be680ae..5a05955 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -6,6 +6,7 @@ module OpenTox field :folds, type: Integer, default: 10 def self.create model, n=10 + $logger.debug model.algorithms klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification klass = RegressionCrossValidation if model.is_a? Model::LazarRegression bad_request_error "Unknown model class #{model.class}." unless klass -- cgit v1.2.3