diff options
Diffstat (limited to 'lib/validation-statistics.rb')
-rw-r--r-- | lib/validation-statistics.rb | 176 |
1 files changed, 103 insertions, 73 deletions
diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 69e7992..5fd9985 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -7,62 +7,66 @@ module OpenTox # @return [Hash] def statistics self.accept_values = model.prediction_feature.accept_values - self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} - self.weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} - nr_instances = 0 + self.confusion_matrix = {:all => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :confidence_high => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :confidence_low => Array.new(accept_values.size){Array.new(accept_values.size,0)}} + self.nr_predictions = {:all => 0,:confidence_high => 0,:confidence_low => 0} predictions.each do |cid,pred| - # TODO - # use predictions without probabilities (single neighbor)?? - # use measured majority class?? + # TODO: use measured majority class or all measurements?? if pred[:measurements].uniq.size == 1 and pred[:probabilities] m = pred[:measurements].first if pred[:value] == m - if pred[:value] == accept_values[0] - confusion_matrix[0][0] += 1 - weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]] - nr_instances += 1 - elsif pred[:value] == accept_values[1] - confusion_matrix[1][1] += 1 - weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]] - nr_instances += 1 + accept_values.each_with_index do |v,i| + if pred[:value] == v + confusion_matrix[:all][i][i] += 1 + self.nr_predictions[:all] += 1 + if pred[:confidence].match(/Similar/i) + confusion_matrix[:confidence_high][i][i] += 1 + self.nr_predictions[:confidence_high] += 1 + elsif pred[:confidence].match(/Low/i) + confusion_matrix[:confidence_low][i][i] += 1 + self.nr_predictions[:confidence_low] += 1 + end + end end elsif pred[:value] != m - if pred[:value] == accept_values[0] - confusion_matrix[0][1] += 1 - weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]] - nr_instances += 1 - elsif pred[:value] == accept_values[1] - confusion_matrix[1][0] += 1 - weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]] - nr_instances += 1 + accept_values.each_with_index do |v,i| + if pred[:value] == v + confusion_matrix[:all][i][(i+1)%2] += 1 + self.nr_predictions[:all] += 1 + if pred[:confidence].match(/Similar/i) + confusion_matrix[:confidence_high][i][(i+1)%2] += 1 + self.nr_predictions[:confidence_high] += 1 + elsif pred[:confidence].match(/Low/i) + confusion_matrix[:confidence_low][i][(i+1)%2] += 1 + self.nr_predictions[:confidence_low] += 1 + end + end end end end end - self.true_rate = {} - self.predictivity = {} + + self.true_rate = {:all => {}, :confidence_high => {}, :confidence_low => {}} + self.predictivity = {:all => {}, :confidence_high => {}, :confidence_low => {}} accept_values.each_with_index do |v,i| - self.true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f - self.predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f - end - confidence_sum = 0 - weighted_confusion_matrix.each do |r| - r.each do |c| - confidence_sum += c + [:all,:confidence_high,:confidence_low].each do |a| + self.true_rate[a][v] = confusion_matrix[a][i][i]/confusion_matrix[a][i].reduce(:+).to_f + self.predictivity[a][v] = confusion_matrix[a][i][i]/confusion_matrix[a].collect{|n| n[i]}.reduce(:+).to_f end end - self.accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f - self.weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f + self.accuracy = {} + [:all,:confidence_high,:confidence_low].each do |a| + self.accuracy[a] = (confusion_matrix[a][0][0]+confusion_matrix[a][1][1])/nr_predictions[a].to_f + end $logger.debug "Accuracy #{accuracy}" + $logger.debug "Nr Predictions #{nr_predictions}" save { :accept_values => accept_values, :confusion_matrix => confusion_matrix, - :weighted_confusion_matrix => weighted_confusion_matrix, :accuracy => accuracy, - :weighted_accuracy => weighted_accuracy, :true_rate => self.true_rate, :predictivity => self.predictivity, + :nr_predictions => nr_predictions, } end @@ -97,7 +101,7 @@ module OpenTox R.assign "probability", probabilities R.eval "image = qplot(probability,accuracy)+ylab('Accumulated accuracy')+xlab('Prediction probability')+ylim(c(0,1))+scale_x_reverse()+geom_line()" R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.svg") + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.#{format}") plot_id = $gridfs.insert_one(file) update(:probability_plot_id => plot_id) #end @@ -108,29 +112,27 @@ module OpenTox # Statistical evaluation of regression validations module RegressionStatistics + attr_accessor :x, :y + # Get statistics # @return [Hash] def statistics self.warnings = [] - self.rmse = 0 - self.mae = 0 - self.within_prediction_interval = 0 - self.out_of_prediction_interval = 0 - x = [] - y = [] + self.rmse = {:all =>0,:confidence_high => 0,:confidence_low => 0} + self.r_squared = {:all =>0,:confidence_high => 0,:confidence_low => 0} + self.mae = {:all =>0,:confidence_high => 0,:confidence_low => 0} + self.within_prediction_interval = {:all =>0,:confidence_high => 0,:confidence_low => 0} + self.out_of_prediction_interval = {:all =>0,:confidence_high => 0,:confidence_low => 0} + @x = {:all => [],:confidence_high => [],:confidence_low => []} + @y = {:all => [],:confidence_high => [],:confidence_low => []} + self.nr_predictions = {:all =>0,:confidence_high => 0,:confidence_low => 0} predictions.each do |cid,pred| - if pred[:value] and pred[:measurements] - x << pred[:measurements].median - y << pred[:value] - error = pred[:value]-pred[:measurements].median - self.rmse += error**2 - self.mae += error.abs - if pred[:prediction_interval] - if pred[:measurements].median >= pred[:prediction_interval][0] and pred[:measurements].median <= pred[:prediction_interval][1] - self.within_prediction_interval += 1 - else - self.out_of_prediction_interval += 1 - end + !if pred[:value] and pred[:measurements] and !pred[:measurements].empty? + insert_prediction pred, :all + if pred[:confidence].match(/Similar/i) + insert_prediction pred, :confidence_high + elsif pred[:confidence].match(/Low/i) + insert_prediction pred, :confidence_low end else trd_id = model.training_dataset_id @@ -139,39 +141,49 @@ module OpenTox $logger.debug "No training activities for #{smiles} in training dataset #{trd_id}." end end - R.assign "measurement", x - R.assign "prediction", y - R.eval "r <- cor(measurement,prediction,use='pairwise')" - self.r_squared = R.eval("r").to_ruby**2 - self.mae = self.mae/predictions.size - self.rmse = Math.sqrt(self.rmse/predictions.size) + [:all,:confidence_high,:confidence_low].each do |a| + if @x[a].size > 2 + R.assign "measurement", @x[a] + R.assign "prediction", @y[a] + R.eval "r <- cor(measurement,prediction,use='pairwise')" + self.r_squared[a] = R.eval("r").to_ruby**2 + else + self.r_squared[a] = 0 + end + if self.nr_predictions[a] > 0 + self.mae[a] = self.mae[a]/self.nr_predictions[a] + self.rmse[a] = Math.sqrt(self.rmse[a]/self.nr_predictions[a]) + else + self.mae[a] = nil + self.rmse[a] = nil + end + end $logger.debug "R^2 #{r_squared}" $logger.debug "RMSE #{rmse}" $logger.debug "MAE #{mae}" - $logger.debug "#{percent_within_prediction_interval.round(2)}% of measurements within prediction interval" - $logger.debug "#{warnings}" + $logger.debug "Nr predictions #{nr_predictions}" + $logger.debug "#{within_prediction_interval} measurements within prediction interval" save { :mae => mae, :rmse => rmse, :r_squared => r_squared, - :within_prediction_interval => within_prediction_interval, + :within_prediction_interval => self.within_prediction_interval, :out_of_prediction_interval => out_of_prediction_interval, + :nr_predictions => nr_predictions, } end - # Get percentage of measurements within the prediction interval - # @return [Float] - def percent_within_prediction_interval - 100*within_prediction_interval.to_f/(within_prediction_interval+out_of_prediction_interval) - end - # Plot predicted vs measured values # @param [String,nil] format # @return [Blob] def correlation_plot format: "png" - unless correlation_plot_id - tmpfile = "/tmp/#{id.to_s}_correlation.#{format}" + #unless correlation_plot_id + #tmpfile = "/tmp/#{id.to_s}_correlation.#{format}" + tmpdir = "/tmp" + #p tmpdir + FileUtils.mkdir_p tmpdir + tmpfile = File.join(tmpdir,"#{id.to_s}_correlation.#{format}") x = [] y = [] feature = Feature.find(predictions.first.last["prediction_feature_id"]) @@ -187,7 +199,7 @@ module OpenTox title = "log2(Net cell association [mL/ug(Mg)])" else title = feature.name - title += " [#{feature.unit}]" if feature.unit and !feature.unit.blank? + title += "-log10(#{feature.unit})" if feature.unit and !feature.unit.blank? end R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" R.eval "image = image + geom_abline(intercept=0, slope=1)" @@ -195,7 +207,7 @@ module OpenTox file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}") plot_id = $gridfs.insert_one(file) update(:correlation_plot_id => plot_id) - end + #end $gridfs.find_one(_id: correlation_plot_id).data end @@ -215,6 +227,24 @@ module OpenTox end worst_predictions.sort_by{|sid,p| p["distance_prediction_interval"] }.to_h end + + private + + def insert_prediction prediction, type + self.nr_predictions[type] +=1 + @x[type] << prediction[:measurements].median + @y[type] << prediction[:value] + error = prediction[:value]-prediction[:measurements].median + self.rmse[type] += error**2 + self.mae[type] += error.abs + if prediction[:prediction_interval] + if prediction[:measurements].median >= prediction[:prediction_interval][0] and prediction[:measurements].median <= prediction[:prediction_interval][1] + self.within_prediction_interval[type] += 1 + else + self.out_of_prediction_interval[type] += 1 + end + end + end end end end |