From 815cf6ba1543fc323eb7cbd1202fadbf03bcfbca Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 13 Apr 2016 15:35:01 +0200 Subject: new files added --- lib/validation-statistics.rb | 100 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 lib/validation-statistics.rb (limited to 'lib/validation-statistics.rb') diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb new file mode 100644 index 0000000..570b2d4 --- /dev/null +++ b/lib/validation-statistics.rb @@ -0,0 +1,100 @@ +module OpenTox + class ValidationStatistics + include OpenTox + def self.classification predictions, accept_values + confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} + weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} + true_rate = {} + predictivity = {} + nr_instances = 0 + predictions.each do |cid,pred| + # TODO use measured majority class + if pred[:measured].uniq.size == 1 + m = pred[:measured].first + #pred[:measured].each do |m| + if pred[:value] == m + if pred[:value] == accept_values[0] + confusion_matrix[0][0] += 1 + weighted_confusion_matrix[0][0] += pred[:confidence] + nr_instances += 1 + elsif pred[:value] == accept_values[1] + confusion_matrix[1][1] += 1 + weighted_confusion_matrix[1][1] += pred[:confidence] + nr_instances += 1 + end + elsif pred[:value] != m + if pred[:value] == accept_values[0] + confusion_matrix[0][1] += 1 + weighted_confusion_matrix[0][1] += pred[:confidence] + nr_instances += 1 + elsif pred[:value] == accept_values[1] + confusion_matrix[1][0] += 1 + weighted_confusion_matrix[1][0] += pred[:confidence] + nr_instances += 1 + end + end + end + end + true_rate = {} + predictivity = {} + accept_values.each_with_index do |v,i| + true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f + predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f + end + confidence_sum = 0 + weighted_confusion_matrix.each do |r| + r.each do |c| + confidence_sum += c + end + end + accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f + $logger.debug "Accuracy #{accuracy}" + { + :accept_values => accept_values, + :confusion_matrix => confusion_matrix, + :weighted_confusion_matrix => weighted_confusion_matrix, + :accuracy => accuracy, + :weighted_accuracy => (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f, + :true_rate => true_rate, + :predictivity => predictivity, + :finished_at => Time.now + } + end + + def self.regression predictions + # TODO: prediction intervals + rmse = 0 + mae = 0 + x = [] + y = [] + predictions.each do |cid,pred| + if pred[:value] and pred[:measured] #and pred[:measured] != [nil] + x << -Math.log10(pred[:measured].median) + y << -Math.log10(pred[:value]) + error = Math.log10(pred[:value])-Math.log10(pred[:measured].median) + rmse += error**2 + mae += error.abs + else + warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + end + end + R.assign "measurement", x + R.assign "prediction", y + R.eval "r <- cor(measurement,prediction,use='complete')" + r = R.eval("r").to_ruby + + mae = mae/predictions.size + rmse = Math.sqrt(rmse/predictions.size) + $logger.debug "R^2 #{r**2}" + $logger.debug "RMSE #{rmse}" + $logger.debug "MAE #{mae}" + { + :mae => mae, + :rmse => rmse, + :r_squared => r**2, + :finished_at => Time.now + } + end + end +end -- cgit v1.2.3 From 05386e748270c337c66f6f379317ea4b25905236 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 4 May 2016 19:24:42 +0200 Subject: first reasonable results for nanoparticle crossvalidation --- lib/validation-statistics.rb | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'lib/validation-statistics.rb') diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index c6b2a07..b7c95f6 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -63,16 +63,15 @@ module OpenTox end def self.regression predictions - # TODO: prediction intervals rmse = 0 mae = 0 x = [] y = [] predictions.each do |cid,pred| if pred[:value] and pred[:measured] #and pred[:measured] != [nil] - x << -Math.log10(pred[:measured].median) - y << -Math.log10(pred[:value]) - error = Math.log10(pred[:value])-Math.log10(pred[:measured].median) + x << pred[:measured].median + y << pred[:value] + error = pred[:value]-pred[:measured].median rmse += error**2 mae += error.abs else -- cgit v1.2.3 From 7794086d367fb256c3673d7578b23ec2fb83e6ed Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 9 May 2016 14:05:29 +0200 Subject: physchem crossvalidation fixed --- lib/validation-statistics.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/validation-statistics.rb') diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index b7c95f6..0079bae 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -68,7 +68,7 @@ module OpenTox x = [] y = [] predictions.each do |cid,pred| - if pred[:value] and pred[:measured] #and pred[:measured] != [nil] + if pred[:value] and pred[:measured] x << pred[:measured].median y << pred[:value] error = pred[:value]-pred[:measured].median -- cgit v1.2.3 From c90644211e214a50f6fdb3a936bf247f45f1f4be Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 13 May 2016 13:38:24 +0200 Subject: compound tests fixed --- lib/validation-statistics.rb | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'lib/validation-statistics.rb') diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 0079bae..2d6b56e 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -96,5 +96,29 @@ module OpenTox :finished_at => Time.now } end + + end + + module Plot + + def plot_id + tmpfile = "/tmp/#{id.to_s}_correlation.png" + x = [] + y = [] + predictions.each do |sid,p| + x << p["value"] + y << p["measured"].median + end + R.assign "measurement", x + R.assign "prediction", y + R.eval "all = c(measurement,prediction)" + R.eval "range = c(min(all), max(all))" + R.eval "image = qplot(prediction,measurement,main='',asp=1,xlim=range, ylim=range)" + R.eval "image = image + geom_abline(intercept=0, slope=1)" + R.eval "ggsave(file='#{tmpfile}', plot=image)" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png") + plot_id = $gridfs.insert_one(file) + plot_id + end end end -- cgit v1.2.3 From b2d80ad2e470fcb41af4b747142e5693f2fa4615 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 24 May 2016 13:05:53 +0200 Subject: dataset tests fixed --- lib/validation-statistics.rb | 1 + 1 file changed, 1 insertion(+) (limited to 'lib/validation-statistics.rb') diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 2d6b56e..3c52b15 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -68,6 +68,7 @@ module OpenTox x = [] y = [] predictions.each do |cid,pred| + p pred if pred[:value] and pred[:measured] x << pred[:measured].median y << pred[:value] -- cgit v1.2.3 From cc08e6beda7f7d70ebf6c6929a22d1a0cd7c1a20 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 24 May 2016 15:41:24 +0200 Subject: tests fixed. DescriptorTest#test_compound_all may fail within all.rb --- lib/validation-statistics.rb | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'lib/validation-statistics.rb') diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 3c52b15..156353a 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -8,10 +8,11 @@ module OpenTox predictivity = {} nr_instances = 0 predictions.each do |cid,pred| - # TODO use measured majority class - if pred[:measured].uniq.size == 1 + # TODO + # use predictions without probabilities (single neighbor)?? + # use measured majority class?? + if pred[:measured].uniq.size == 1 and pred[:probabilities] m = pred[:measured].first - #pred[:measured].each do |m| if pred[:value] == m if pred[:value] == accept_values[0] confusion_matrix[0][0] += 1 @@ -63,12 +64,12 @@ module OpenTox end def self.regression predictions + # TODO: predictions within prediction_interval rmse = 0 mae = 0 x = [] y = [] predictions.each do |cid,pred| - p pred if pred[:value] and pred[:measured] x << pred[:measured].median y << pred[:value] -- cgit v1.2.3 From f46ba3b7262f5b551c81fc9396c5b7f0cac7f030 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 27 May 2016 19:16:16 +0200 Subject: first correlation of nanoparticle predictions --- lib/validation-statistics.rb | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'lib/validation-statistics.rb') diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 156353a..e61543b 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -83,7 +83,7 @@ module OpenTox end R.assign "measurement", x R.assign "prediction", y - R.eval "r <- cor(measurement,prediction,use='complete')" + R.eval "r <- cor(measurement,prediction,use='pairwise')" r = R.eval("r").to_ruby mae = mae/predictions.size @@ -99,11 +99,7 @@ module OpenTox } end - end - - module Plot - - def plot_id + def self.correlation_plot id, predictions tmpfile = "/tmp/#{id.to_s}_correlation.png" x = [] y = [] @@ -115,10 +111,11 @@ module OpenTox R.assign "prediction", y R.eval "all = c(measurement,prediction)" R.eval "range = c(min(all), max(all))" - R.eval "image = qplot(prediction,measurement,main='',asp=1,xlim=range, ylim=range)" + # TODO units + R.eval "image = qplot(prediction,measurement,main='',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" R.eval "image = image + geom_abline(intercept=0, slope=1)" R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png") + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.png") plot_id = $gridfs.insert_one(file) plot_id end -- cgit v1.2.3 From b515a0cfedb887a2af753db6e4a08ae1af430cad Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 31 May 2016 18:08:08 +0200 Subject: cleanup of validation modules/classes --- lib/validation-statistics.rb | 292 +++++++++++++++++++++++++++---------------- 1 file changed, 186 insertions(+), 106 deletions(-) (limited to 'lib/validation-statistics.rb') diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index e61543b..816824b 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -1,123 +1,203 @@ module OpenTox - class ValidationStatistics - include OpenTox - def self.classification predictions, accept_values - confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} - weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} - true_rate = {} - predictivity = {} - nr_instances = 0 - predictions.each do |cid,pred| - # TODO - # use predictions without probabilities (single neighbor)?? - # use measured majority class?? - if pred[:measured].uniq.size == 1 and pred[:probabilities] - m = pred[:measured].first - if pred[:value] == m - if pred[:value] == accept_values[0] - confusion_matrix[0][0] += 1 - weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]] - nr_instances += 1 - elsif pred[:value] == accept_values[1] - confusion_matrix[1][1] += 1 - weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]] - nr_instances += 1 - end - elsif pred[:value] != m - if pred[:value] == accept_values[0] - confusion_matrix[0][1] += 1 - weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]] - nr_instances += 1 - elsif pred[:value] == accept_values[1] - confusion_matrix[1][0] += 1 - weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]] - nr_instances += 1 + module Validation + module ClassificationStatistics + + def statistics + self.accept_values = model.prediction_feature.accept_values + self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} + self.weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} + true_rate = {} + predictivity = {} + nr_instances = 0 + predictions.each do |cid,pred| + # TODO + # use predictions without probabilities (single neighbor)?? + # use measured majority class?? + if pred[:measurements].uniq.size == 1 and pred[:probabilities] + m = pred[:measurements].first + if pred[:value] == m + if pred[:value] == accept_values[0] + confusion_matrix[0][0] += 1 + weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]] + nr_instances += 1 + elsif pred[:value] == accept_values[1] + confusion_matrix[1][1] += 1 + weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]] + nr_instances += 1 + end + elsif pred[:value] != m + if pred[:value] == accept_values[0] + confusion_matrix[0][1] += 1 + weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]] + nr_instances += 1 + elsif pred[:value] == accept_values[1] + confusion_matrix[1][0] += 1 + weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]] + nr_instances += 1 + end end end end + true_rate = {} + predictivity = {} + accept_values.each_with_index do |v,i| + true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f + predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f + end + confidence_sum = 0 + weighted_confusion_matrix.each do |r| + r.each do |c| + confidence_sum += c + end + end + self.accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f + self.weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f + $logger.debug "Accuracy #{accuracy}" + save + { + :accept_values => accept_values, + :confusion_matrix => confusion_matrix, + :weighted_confusion_matrix => weighted_confusion_matrix, + :accuracy => accuracy, + :weighted_accuracy => weighted_accuracy, + :true_rate => true_rate, + :predictivity => predictivity, + } end - true_rate = {} - predictivity = {} - accept_values.each_with_index do |v,i| - true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f - predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f - end - confidence_sum = 0 - weighted_confusion_matrix.each do |r| - r.each do |c| - confidence_sum += c + + def confidence_plot + unless confidence_plot_id + tmpfile = "/tmp/#{id.to_s}_confidence.svg" + accuracies = [] + confidences = [] + correct_predictions = 0 + incorrect_predictions = 0 + predictions.each do |p| + p[:measurements].each do |db_act| + if p[:value] + p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1 + accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f + confidences << p[:confidence] + + end + end + end + R.assign "accuracy", accuracies + R.assign "confidence", confidences + R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()" + R.eval "ggsave(file='#{tmpfile}', plot=image)" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") + plot_id = $gridfs.insert_one(file) + update(:confidence_plot_id => plot_id) end + $gridfs.find_one(_id: confidence_plot_id).data end - accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f - weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f - $logger.debug "Accuracy #{accuracy}" - { - :accept_values => accept_values, - :confusion_matrix => confusion_matrix, - :weighted_confusion_matrix => weighted_confusion_matrix, - :accuracy => accuracy, - :weighted_accuracy => weighted_accuracy, - :true_rate => true_rate, - :predictivity => predictivity, - :finished_at => Time.now - } end - def self.regression predictions - # TODO: predictions within prediction_interval - rmse = 0 - mae = 0 - x = [] - y = [] - predictions.each do |cid,pred| - if pred[:value] and pred[:measured] - x << pred[:measured].median - y << pred[:value] - error = pred[:value]-pred[:measured].median - rmse += error**2 - mae += error.abs - else - warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." - $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + module RegressionStatistics + + def statistics + # TODO: predictions within prediction_interval + rmse = 0 + mae = 0 + x = [] + y = [] + predictions.each do |cid,pred| + if pred[:value] and pred[:measurements] + x << pred[:measurements].median + y << pred[:value] + error = pred[:value]-pred[:measurements].median + rmse += error**2 + mae += error.abs + else + warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + end end + R.assign "measurement", x + R.assign "prediction", y + R.eval "r <- cor(measurement,prediction,use='pairwise')" + r = R.eval("r").to_ruby + + mae = mae/predictions.size + rmse = Math.sqrt(rmse/predictions.size) + $logger.debug "R^2 #{r**2}" + $logger.debug "RMSE #{rmse}" + $logger.debug "MAE #{mae}" + { + :mae => mae, + :rmse => rmse, + :r_squared => r**2, + } end - R.assign "measurement", x - R.assign "prediction", y - R.eval "r <- cor(measurement,prediction,use='pairwise')" - r = R.eval("r").to_ruby - mae = mae/predictions.size - rmse = Math.sqrt(rmse/predictions.size) - $logger.debug "R^2 #{r**2}" - $logger.debug "RMSE #{rmse}" - $logger.debug "MAE #{mae}" - { - :mae => mae, - :rmse => rmse, - :r_squared => r**2, - :finished_at => Time.now - } - end + def correlation_plot + unless correlation_plot_id + tmpfile = "/tmp/#{id.to_s}_correlation.pdf" + x = [] + y = [] + feature = Feature.find(predictions.first.last["prediction_feature_id"]) + predictions.each do |sid,p| + x << p["value"] + y << p["measurements"].median + end + R.assign "measurement", x + R.assign "prediction", y + R.eval "all = c(measurement,prediction)" + R.eval "range = c(min(all), max(all))" + title = feature.name + title += "[#{feature.unit}]" if feature.unit and !feature.unit.blank? + R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" + R.eval "image = image + geom_abline(intercept=0, slope=1)" + R.eval "ggsave(file='#{tmpfile}', plot=image)" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.png") + plot_id = $gridfs.insert_one(file) + update(:correlation_plot_id => plot_id) + end + $gridfs.find_one(_id: correlation_plot_id).data + end - def self.correlation_plot id, predictions - tmpfile = "/tmp/#{id.to_s}_correlation.png" - x = [] - y = [] - predictions.each do |sid,p| - x << p["value"] - y << p["measured"].median + def worst_predictions n: 5, show_neigbors: true, show_common_descriptors: false + worst_predictions = predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}[0,n] + worst_predictions.collect do |p| + substance = Substance.find(p.first) + prediction = p[1] + if show_neigbors + neighbors = prediction["neighbors"].collect do |n| + common_descriptors = [] + if show_common_descriptors + common_descriptors = n["common_descriptors"].collect do |d| + f=Feature.find(d) + { + :id => f.id.to_s, + :name => "#{f.name} (#{f.conditions})", + :p_value => d[:p_value], + :r_squared => d[:r_squared], + } + end + else + common_descriptors = n["common_descriptors"].size + end + { + :name => Substance.find(n["_id"]).name, + :id => n["_id"].to_s, + :common_descriptors => common_descriptors + } + end + else + neighbors = prediction["neighbors"].size + end + { + :id => substance.id.to_s, + :name => substance.name, + :feature => Feature.find(prediction["prediction_feature_id"]).name, + :error => (prediction["value"] - prediction["measurements"].median).abs, + :prediction => prediction["value"], + :measurements => prediction["measurements"], + :neighbors => neighbors + } + end end - R.assign "measurement", x - R.assign "prediction", y - R.eval "all = c(measurement,prediction)" - R.eval "range = c(min(all), max(all))" - # TODO units - R.eval "image = qplot(prediction,measurement,main='',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" - R.eval "image = image + geom_abline(intercept=0, slope=1)" - R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.png") - plot_id = $gridfs.insert_one(file) - plot_id end end end -- cgit v1.2.3 From 65b69d4c35890a7a2d2992108f0cf4eb5202dd1b Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 1 Jun 2016 10:37:00 +0200 Subject: validation tests fixed --- lib/validation-statistics.rb | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'lib/validation-statistics.rb') diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 816824b..e42d298 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -98,8 +98,8 @@ module OpenTox def statistics # TODO: predictions within prediction_interval - rmse = 0 - mae = 0 + self.rmse = 0 + self.mae = 0 x = [] y = [] predictions.each do |cid,pred| @@ -107,8 +107,8 @@ module OpenTox x << pred[:measurements].median y << pred[:value] error = pred[:value]-pred[:measurements].median - rmse += error**2 - mae += error.abs + self.rmse += error**2 + self.mae += error.abs else warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." @@ -117,17 +117,18 @@ module OpenTox R.assign "measurement", x R.assign "prediction", y R.eval "r <- cor(measurement,prediction,use='pairwise')" - r = R.eval("r").to_ruby + self.r_squared = R.eval("r").to_ruby**2 - mae = mae/predictions.size - rmse = Math.sqrt(rmse/predictions.size) - $logger.debug "R^2 #{r**2}" + self.mae = self.mae/predictions.size + self.rmse = Math.sqrt(self.rmse/predictions.size) + $logger.debug "R^2 #{r_squared}" $logger.debug "RMSE #{rmse}" $logger.debug "MAE #{mae}" + save { :mae => mae, :rmse => rmse, - :r_squared => r**2, + :r_squared => r_squared, } end -- cgit v1.2.3 From 85f2308c101b4778508c2d767e08af4cfd671b7b Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 2 Jun 2016 12:22:39 +0200 Subject: local pls regression for nanoparticles --- lib/validation-statistics.rb | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'lib/validation-statistics.rb') diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index e42d298..6b252b1 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -100,6 +100,8 @@ module OpenTox # TODO: predictions within prediction_interval self.rmse = 0 self.mae = 0 + #self.within_prediction_interval = 0 + #self.outside_prediction_interval = 0 x = [] y = [] predictions.each do |cid,pred| @@ -109,6 +111,9 @@ module OpenTox error = pred[:value]-pred[:measurements].median self.rmse += error**2 self.mae += error.abs + #if pred[:prediction_interval] + #if pred[:measurements] + #end else warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." @@ -118,7 +123,6 @@ module OpenTox R.assign "prediction", y R.eval "r <- cor(measurement,prediction,use='pairwise')" self.r_squared = R.eval("r").to_ruby**2 - self.mae = self.mae/predictions.size self.rmse = Math.sqrt(self.rmse/predictions.size) $logger.debug "R^2 #{r_squared}" -- cgit v1.2.3 From f7e87b45f15083e5fcdea64821f06ed93ece4c4e Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 7 Jun 2016 18:07:28 +0200 Subject: (repeated)crossvalidation plots --- lib/validation-statistics.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'lib/validation-statistics.rb') diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 6b252b1..9aa9cff 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -136,9 +136,9 @@ module OpenTox } end - def correlation_plot + def correlation_plot format: "png" unless correlation_plot_id - tmpfile = "/tmp/#{id.to_s}_correlation.pdf" + tmpfile = "/tmp/#{id.to_s}_correlation.#{format}" x = [] y = [] feature = Feature.find(predictions.first.last["prediction_feature_id"]) @@ -155,7 +155,7 @@ module OpenTox R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" R.eval "image = image + geom_abline(intercept=0, slope=1)" R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.png") + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}") plot_id = $gridfs.insert_one(file) update(:correlation_plot_id => plot_id) end -- cgit v1.2.3 From f93aad7227c7bb3702fd28aab2d289f1ca9ce7e9 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 21 Jul 2016 17:35:20 +0200 Subject: correlation plot fixed --- lib/validation-statistics.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib/validation-statistics.rb') diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 9aa9cff..3582c71 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -143,8 +143,8 @@ module OpenTox y = [] feature = Feature.find(predictions.first.last["prediction_feature_id"]) predictions.each do |sid,p| - x << p["value"] - y << p["measurements"].median + x << p["measurements"].median + y << p["value"] end R.assign "measurement", x R.assign "prediction", y -- cgit v1.2.3 From 8519274487166d75b3b9ae28e61f7a7be9f7e83c Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 27 Oct 2016 11:58:07 +0200 Subject: probability plot for classification validations --- lib/validation-statistics.rb | 64 ++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 26 deletions(-) (limited to 'lib/validation-statistics.rb') diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 3582c71..4ab4b13 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -65,43 +65,44 @@ module OpenTox } end - def confidence_plot - unless confidence_plot_id - tmpfile = "/tmp/#{id.to_s}_confidence.svg" + def probability_plot format: "pdf" + #unless probability_plot_id + tmpfile = "/tmp/#{id.to_s}_probability.#{format}" accuracies = [] - confidences = [] + probabilities = [] correct_predictions = 0 incorrect_predictions = 0 - predictions.each do |p| - p[:measurements].each do |db_act| - if p[:value] - p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1 - accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f - confidences << p[:confidence] - - end + pp = [] + predictions.values.select{|p| p["probabilities"]}.compact.each do |p| + p["measurements"].each do |m| + pp << [ p["probabilities"][p["value"]], p["value"] == m ] end end + pp.sort_by!{|p| 1-p.first} + pp.each do |p| + p[1] ? correct_predictions += 1 : incorrect_predictions += 1 + accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f + probabilities << p[0] + end R.assign "accuracy", accuracies - R.assign "confidence", confidences - R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()" + R.assign "probability", probabilities + R.eval "image = qplot(probability,accuracy)+ylab('Accumulated accuracy')+xlab('Prediction probability')+ylim(c(0,1))+scale_x_reverse()+geom_line()" R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.svg") plot_id = $gridfs.insert_one(file) - update(:confidence_plot_id => plot_id) - end - $gridfs.find_one(_id: confidence_plot_id).data + update(:probability_plot_id => plot_id) + #end + $gridfs.find_one(_id: probability_plot_id).data end end module RegressionStatistics def statistics - # TODO: predictions within prediction_interval self.rmse = 0 self.mae = 0 - #self.within_prediction_interval = 0 - #self.outside_prediction_interval = 0 + self.within_prediction_interval = 0 + self.out_of_prediction_interval = 0 x = [] y = [] predictions.each do |cid,pred| @@ -111,9 +112,13 @@ module OpenTox error = pred[:value]-pred[:measurements].median self.rmse += error**2 self.mae += error.abs - #if pred[:prediction_interval] - #if pred[:measurements] - #end + if pred[:prediction_interval] + if pred[:measurements].median >= pred[:prediction_interval][0] and pred[:measurements].median <= pred[:prediction_interval][1] + self.within_prediction_interval += 1 + else + self.out_of_prediction_interval += 1 + end + end else warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." @@ -128,16 +133,23 @@ module OpenTox $logger.debug "R^2 #{r_squared}" $logger.debug "RMSE #{rmse}" $logger.debug "MAE #{mae}" + $logger.debug "#{percent_within_prediction_interval.round(2)}% measurements within prediction interval" save { :mae => mae, :rmse => rmse, :r_squared => r_squared, + :within_prediction_interval => within_prediction_interval, + :out_of_prediction_interval => out_of_prediction_interval, } end + def percent_within_prediction_interval + 100*within_prediction_interval.to_f/(within_prediction_interval+out_of_prediction_interval) + end + def correlation_plot format: "png" - unless correlation_plot_id + #unless correlation_plot_id tmpfile = "/tmp/#{id.to_s}_correlation.#{format}" x = [] y = [] @@ -158,7 +170,7 @@ module OpenTox file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}") plot_id = $gridfs.insert_one(file) update(:correlation_plot_id => plot_id) - end + #end $gridfs.find_one(_id: correlation_plot_id).data end -- cgit v1.2.3 From 5418c2477a1a48b06f97d693f6c117336aec5b4c Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 27 Oct 2016 12:09:06 +0200 Subject: GridFS storage for plots. --- lib/validation-statistics.rb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'lib/validation-statistics.rb') diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 4ab4b13..b251bdb 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -66,7 +66,7 @@ module OpenTox end def probability_plot format: "pdf" - #unless probability_plot_id + unless probability_plot_id tmpfile = "/tmp/#{id.to_s}_probability.#{format}" accuracies = [] probabilities = [] @@ -91,7 +91,7 @@ module OpenTox file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.svg") plot_id = $gridfs.insert_one(file) update(:probability_plot_id => plot_id) - #end + end $gridfs.find_one(_id: probability_plot_id).data end end @@ -133,7 +133,7 @@ module OpenTox $logger.debug "R^2 #{r_squared}" $logger.debug "RMSE #{rmse}" $logger.debug "MAE #{mae}" - $logger.debug "#{percent_within_prediction_interval.round(2)}% measurements within prediction interval" + $logger.debug "#{percent_within_prediction_interval.round(2)}% of measurements within prediction interval" save { :mae => mae, @@ -149,7 +149,7 @@ module OpenTox end def correlation_plot format: "png" - #unless correlation_plot_id + unless correlation_plot_id tmpfile = "/tmp/#{id.to_s}_correlation.#{format}" x = [] y = [] @@ -170,7 +170,7 @@ module OpenTox file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}") plot_id = $gridfs.insert_one(file) update(:correlation_plot_id => plot_id) - #end + end $gridfs.find_one(_id: correlation_plot_id).data end -- cgit v1.2.3 From 280f81dcffb3b8b929ff9cbe92ba17403f5a9dd3 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 28 Oct 2016 12:31:53 +0200 Subject: adjusted r^2 removed (does not apply well to local models) --- lib/validation-statistics.rb | 1 + 1 file changed, 1 insertion(+) (limited to 'lib/validation-statistics.rb') diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index b251bdb..799bb34 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -139,6 +139,7 @@ module OpenTox :mae => mae, :rmse => rmse, :r_squared => r_squared, + :r_squared_adjusted => r_squared_adjusted, :within_prediction_interval => within_prediction_interval, :out_of_prediction_interval => out_of_prediction_interval, } -- cgit v1.2.3 From c6e86fc1bfee7cb91782dd7067408d78a8e48ed9 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 8 Nov 2016 16:04:49 +0100 Subject: probability plot for classification --- lib/validation-statistics.rb | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'lib/validation-statistics.rb') diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 799bb34..b6f8a60 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -66,8 +66,13 @@ module OpenTox end def probability_plot format: "pdf" - unless probability_plot_id - tmpfile = "/tmp/#{id.to_s}_probability.#{format}" + #unless probability_plot_id + + #tmpdir = File.join(ENV["HOME"], "tmp") + tmpdir = "/tmp" + #p tmpdir + FileUtils.mkdir_p tmpdir + tmpfile = File.join(tmpdir,"#{id.to_s}_probability.#{format}") accuracies = [] probabilities = [] correct_predictions = 0 @@ -91,7 +96,7 @@ module OpenTox file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.svg") plot_id = $gridfs.insert_one(file) update(:probability_plot_id => plot_id) - end + #end $gridfs.find_one(_id: probability_plot_id).data end end @@ -139,7 +144,6 @@ module OpenTox :mae => mae, :rmse => rmse, :r_squared => r_squared, - :r_squared_adjusted => r_squared_adjusted, :within_prediction_interval => within_prediction_interval, :out_of_prediction_interval => out_of_prediction_interval, } -- cgit v1.2.3