summary refs log tree commit diff
path: root/lib/validation-statistics.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/validation-statistics.rb')
-rw-r--r-- lib/validation-statistics.rb | 176
1 file changed, 103 insertions, 73 deletions
diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 69e7992..5fd9985 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -7,62 +7,66 @@ module OpenTox
# @return [Hash]
def statistics
self.accept_values = model.prediction_feature.accept_values
- self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
- self.weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
- nr_instances = 0
+ self.confusion_matrix = {:all => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :confidence_high => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :confidence_low => Array.new(accept_values.size){Array.new(accept_values.size,0)}}
+ self.nr_predictions = {:all => 0,:confidence_high => 0,:confidence_low => 0}
predictions.each do |cid,pred|
- # TODO
- # use predictions without probabilities (single neighbor)??
- # use measured majority class??
+ # TODO: use measured majority class or all measurements??
if pred[:measurements].uniq.size == 1 and pred[:probabilities]
m = pred[:measurements].first
if pred[:value] == m
- if pred[:value] == accept_values[0]
- confusion_matrix[0][0] += 1
- weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]]
- nr_instances += 1
- elsif pred[:value] == accept_values[1]
- confusion_matrix[1][1] += 1
- weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]]
- nr_instances += 1
+ accept_values.each_with_index do |v,i|
+ if pred[:value] == v
+ confusion_matrix[:all][i][i] += 1
+ self.nr_predictions[:all] += 1
+ if pred[:confidence].match(/Similar/i)
+ confusion_matrix[:confidence_high][i][i] += 1
+ self.nr_predictions[:confidence_high] += 1
+ elsif pred[:confidence].match(/Low/i)
+ confusion_matrix[:confidence_low][i][i] += 1
+ self.nr_predictions[:confidence_low] += 1
+ end
+ end
end
elsif pred[:value] != m
- if pred[:value] == accept_values[0]
- confusion_matrix[0][1] += 1
- weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]]
- nr_instances += 1
- elsif pred[:value] == accept_values[1]
- confusion_matrix[1][0] += 1
- weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]]
- nr_instances += 1
+ accept_values.each_with_index do |v,i|
+ if pred[:value] == v
+ confusion_matrix[:all][i][(i+1)%2] += 1
+ self.nr_predictions[:all] += 1
+ if pred[:confidence].match(/Similar/i)
+ confusion_matrix[:confidence_high][i][(i+1)%2] += 1
+ self.nr_predictions[:confidence_high] += 1
+ elsif pred[:confidence].match(/Low/i)
+ confusion_matrix[:confidence_low][i][(i+1)%2] += 1
+ self.nr_predictions[:confidence_low] += 1
+ end
+ end
end
end
end
end
- self.true_rate = {}
- self.predictivity = {}
+
+ self.true_rate = {:all => {}, :confidence_high => {}, :confidence_low => {}}
+ self.predictivity = {:all => {}, :confidence_high => {}, :confidence_low => {}}
accept_values.each_with_index do |v,i|
- self.true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
- self.predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
- end
- confidence_sum = 0
- weighted_confusion_matrix.each do |r|
- r.each do |c|
- confidence_sum += c
+ [:all,:confidence_high,:confidence_low].each do |a|
+ self.true_rate[a][v] = confusion_matrix[a][i][i]/confusion_matrix[a][i].reduce(:+).to_f
+ self.predictivity[a][v] = confusion_matrix[a][i][i]/confusion_matrix[a].collect{|n| n[i]}.reduce(:+).to_f
end
end
- self.accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f
- self.weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f
+ self.accuracy = {}
+ [:all,:confidence_high,:confidence_low].each do |a|
+ self.accuracy[a] = (confusion_matrix[a][0][0]+confusion_matrix[a][1][1])/nr_predictions[a].to_f
+ end
$logger.debug "Accuracy #{accuracy}"
+ $logger.debug "Nr Predictions #{nr_predictions}"
save
{
:accept_values => accept_values,
:confusion_matrix => confusion_matrix,
- :weighted_confusion_matrix => weighted_confusion_matrix,
:accuracy => accuracy,
- :weighted_accuracy => weighted_accuracy,
:true_rate => self.true_rate,
:predictivity => self.predictivity,
+ :nr_predictions => nr_predictions,
}
end
@@ -97,7 +101,7 @@ module OpenTox
R.assign "probability", probabilities
R.eval "image = qplot(probability,accuracy)+ylab('Accumulated accuracy')+xlab('Prediction probability')+ylim(c(0,1))+scale_x_reverse()+geom_line()"
R.eval "ggsave(file='#{tmpfile}', plot=image)"
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.svg")
+ file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.#{format}")
plot_id = $gridfs.insert_one(file)
update(:probability_plot_id => plot_id)
#end
@@ -108,29 +112,27 @@ module OpenTox
# Statistical evaluation of regression validations
module RegressionStatistics
+ attr_accessor :x, :y
+
# Get statistics
# @return [Hash]
def statistics
self.warnings = []
- self.rmse = 0
- self.mae = 0
- self.within_prediction_interval = 0
- self.out_of_prediction_interval = 0
- x = []
- y = []
+ self.rmse = {:all =>0,:confidence_high => 0,:confidence_low => 0}
+ self.r_squared = {:all =>0,:confidence_high => 0,:confidence_low => 0}
+ self.mae = {:all =>0,:confidence_high => 0,:confidence_low => 0}
+ self.within_prediction_interval = {:all =>0,:confidence_high => 0,:confidence_low => 0}
+ self.out_of_prediction_interval = {:all =>0,:confidence_high => 0,:confidence_low => 0}
+ @x = {:all => [],:confidence_high => [],:confidence_low => []}
+ @y = {:all => [],:confidence_high => [],:confidence_low => []}
+ self.nr_predictions = {:all =>0,:confidence_high => 0,:confidence_low => 0}
predictions.each do |cid,pred|
- if pred[:value] and pred[:measurements]
- x << pred[:measurements].median
- y << pred[:value]
- error = pred[:value]-pred[:measurements].median
- self.rmse += error**2
- self.mae += error.abs
- if pred[:prediction_interval]
- if pred[:measurements].median >= pred[:prediction_interval][0] and pred[:measurements].median <= pred[:prediction_interval][1]
- self.within_prediction_interval += 1
- else
- self.out_of_prediction_interval += 1
- end
+ if pred[:value] and pred[:measurements] and !pred[:measurements].empty?
+ insert_prediction pred, :all
+ if pred[:confidence].match(/Similar/i)
+ insert_prediction pred, :confidence_high
+ elsif pred[:confidence].match(/Low/i)
+ insert_prediction pred, :confidence_low
end
else
trd_id = model.training_dataset_id
@@ -139,39 +141,49 @@ module OpenTox
$logger.debug "No training activities for #{smiles} in training dataset #{trd_id}."
end
end
- R.assign "measurement", x
- R.assign "prediction", y
- R.eval "r <- cor(measurement,prediction,use='pairwise')"
- self.r_squared = R.eval("r").to_ruby**2
- self.mae = self.mae/predictions.size
- self.rmse = Math.sqrt(self.rmse/predictions.size)
+ [:all,:confidence_high,:confidence_low].each do |a|
+ if @x[a].size > 2
+ R.assign "measurement", @x[a]
+ R.assign "prediction", @y[a]
+ R.eval "r <- cor(measurement,prediction,use='pairwise')"
+ self.r_squared[a] = R.eval("r").to_ruby**2
+ else
+ self.r_squared[a] = 0
+ end
+ if self.nr_predictions[a] > 0
+ self.mae[a] = self.mae[a]/self.nr_predictions[a]
+ self.rmse[a] = Math.sqrt(self.rmse[a]/self.nr_predictions[a])
+ else
+ self.mae[a] = nil
+ self.rmse[a] = nil
+ end
+ end
$logger.debug "R^2 #{r_squared}"
$logger.debug "RMSE #{rmse}"
$logger.debug "MAE #{mae}"
- $logger.debug "#{percent_within_prediction_interval.round(2)}% of measurements within prediction interval"
- $logger.debug "#{warnings}"
+ $logger.debug "Nr predictions #{nr_predictions}"
+ $logger.debug "#{within_prediction_interval} measurements within prediction interval"
save
{
:mae => mae,
:rmse => rmse,
:r_squared => r_squared,
- :within_prediction_interval => within_prediction_interval,
+ :within_prediction_interval => self.within_prediction_interval,
:out_of_prediction_interval => out_of_prediction_interval,
+ :nr_predictions => nr_predictions,
}
end
- # Get percentage of measurements within the prediction interval
- # @return [Float]
- def percent_within_prediction_interval
- 100*within_prediction_interval.to_f/(within_prediction_interval+out_of_prediction_interval)
- end
-
# Plot predicted vs measured values
# @param [String,nil] format
# @return [Blob]
def correlation_plot format: "png"
- unless correlation_plot_id
- tmpfile = "/tmp/#{id.to_s}_correlation.#{format}"
+ #unless correlation_plot_id
+ #tmpfile = "/tmp/#{id.to_s}_correlation.#{format}"
+ tmpdir = "/tmp"
+ #p tmpdir
+ FileUtils.mkdir_p tmpdir
+ tmpfile = File.join(tmpdir,"#{id.to_s}_correlation.#{format}")
x = []
y = []
feature = Feature.find(predictions.first.last["prediction_feature_id"])
@@ -187,7 +199,7 @@ module OpenTox
title = "log2(Net cell association [mL/ug(Mg)])"
else
title = feature.name
- title += " [#{feature.unit}]" if feature.unit and !feature.unit.blank?
+ title += "-log10(#{feature.unit})" if feature.unit and !feature.unit.blank?
end
R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)"
R.eval "image = image + geom_abline(intercept=0, slope=1)"
@@ -195,7 +207,7 @@ module OpenTox
file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}")
plot_id = $gridfs.insert_one(file)
update(:correlation_plot_id => plot_id)
- end
+ #end
$gridfs.find_one(_id: correlation_plot_id).data
end
@@ -215,6 +227,24 @@ module OpenTox
end
worst_predictions.sort_by{|sid,p| p["distance_prediction_interval"] }.to_h
end
+
+ private
+
+ def insert_prediction prediction, type
+ self.nr_predictions[type] +=1
+ @x[type] << prediction[:measurements].median
+ @y[type] << prediction[:value]
+ error = prediction[:value]-prediction[:measurements].median
+ self.rmse[type] += error**2
+ self.mae[type] += error.abs
+ if prediction[:prediction_interval]
+ if prediction[:measurements].median >= prediction[:prediction_interval][0] and prediction[:measurements].median <= prediction[:prediction_interval][1]
+ self.within_prediction_interval[type] += 1
+ else
+ self.out_of_prediction_interval[type] += 1
+ end
+ end
+ end
end
end
end