cleanup of validation modules/classes

author: Christoph Helma <helma@in-silico.ch> 2016-05-31 18:08:08 +0200
committer: Christoph Helma <helma@in-silico.ch> 2016-05-31 18:08:08 +0200
commit: b515a0cfedb887a2af753db6e4a08ae1af430cad (patch)
tree: 5d69d89d0031d581e932272aeb741ee38a0106d6 /lib/validation-statistics.rb
parent: f46ba3b7262f5b551c81fc9396c5b7f0cac7f030 (diff)
1 files changed, 186 insertions, 106 deletions
diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index e61543b..816824b 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -1,123 +1,203 @@
 module OpenTox
-  class ValidationStatistics
-    include OpenTox
-    def self.classification predictions, accept_values
-      confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
-      weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
-      true_rate = {}
-      predictivity = {}
-      nr_instances = 0
-      predictions.each do |cid,pred|
-        # TODO
-        # use predictions without probabilities (single neighbor)??
-        # use measured majority class??
-        if pred[:measured].uniq.size == 1 and pred[:probabilities]
-          m = pred[:measured].first
-          if pred[:value] == m
-            if pred[:value] == accept_values[0]
-              confusion_matrix[0][0] += 1
-              weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]]
-              nr_instances += 1
-            elsif pred[:value] == accept_values[1]
-              confusion_matrix[1][1] += 1
-              weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]]
-              nr_instances += 1
-            end
-          elsif pred[:value] != m
-            if pred[:value] == accept_values[0]
-              confusion_matrix[0][1] += 1
-              weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]]
-              nr_instances += 1
-            elsif pred[:value] == accept_values[1]
-              confusion_matrix[1][0] += 1
-              weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]]
-              nr_instances += 1
+  module Validation
+    module ClassificationStatistics
+
+      def statistics 
+        self.accept_values = model.prediction_feature.accept_values
+        self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
+        self.weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
+        true_rate = {}
+        predictivity = {}
+        nr_instances = 0
+        predictions.each do |cid,pred|
+          # TODO
+          # use predictions without probabilities (single neighbor)??
+          # use measured majority class??
+          if pred[:measurements].uniq.size == 1 and pred[:probabilities]
+            m = pred[:measurements].first
+            if pred[:value] == m
+              if pred[:value] == accept_values[0]
+                confusion_matrix[0][0] += 1
+                weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]]
+                nr_instances += 1
+              elsif pred[:value] == accept_values[1]
+                confusion_matrix[1][1] += 1
+                weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]]
+                nr_instances += 1
+              end
+            elsif pred[:value] != m
+              if pred[:value] == accept_values[0]
+                confusion_matrix[0][1] += 1
+                weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]]
+                nr_instances += 1
+              elsif pred[:value] == accept_values[1]
+                confusion_matrix[1][0] += 1
+                weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]]
+                nr_instances += 1
+              end
             end
           end
         end
+        true_rate = {}
+        predictivity = {}
+        accept_values.each_with_index do |v,i|
+          true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
+          predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
+        end
+        confidence_sum = 0
+        weighted_confusion_matrix.each do |r|
+          r.each do |c|
+            confidence_sum += c
+          end
+        end
+        self.accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f
+        self.weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f
+        $logger.debug "Accuracy #{accuracy}"
+        save
+        {
+          :accept_values => accept_values,
+          :confusion_matrix => confusion_matrix,
+          :weighted_confusion_matrix => weighted_confusion_matrix,
+          :accuracy => accuracy,
+          :weighted_accuracy => weighted_accuracy,
+          :true_rate => true_rate,
+          :predictivity => predictivity,
+        }
       end
-      true_rate = {}
-      predictivity = {}
-      accept_values.each_with_index do |v,i|
-        true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
-        predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
-      end
-      confidence_sum = 0
-      weighted_confusion_matrix.each do |r|
-        r.each do |c|
-          confidence_sum += c
+
+      def confidence_plot
+        unless confidence_plot_id
+          tmpfile = "/tmp/#{id.to_s}_confidence.svg"
+          accuracies = []
+          confidences = []
+          correct_predictions = 0
+          incorrect_predictions = 0
+          predictions.each do |p|
+            p[:measurements].each do |db_act|
+              if p[:value] 
+                p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1
+                accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
+                confidences << p[:confidence]
+
+              end
+            end
+          end
+          R.assign "accuracy", accuracies
+          R.assign "confidence", confidences
+          R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
+          R.eval "ggsave(file='#{tmpfile}', plot=image)"
+          file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
+          plot_id = $gridfs.insert_one(file)
+          update(:confidence_plot_id => plot_id)
         end
+        $gridfs.find_one(_id: confidence_plot_id).data
       end
-      accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f
-      weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f
-      $logger.debug "Accuracy #{accuracy}"
-      {
-        :accept_values => accept_values,
-        :confusion_matrix => confusion_matrix,
-        :weighted_confusion_matrix => weighted_confusion_matrix,
-        :accuracy => accuracy,
-        :weighted_accuracy => weighted_accuracy,
-        :true_rate => true_rate,
-        :predictivity => predictivity,
-        :finished_at => Time.now
-      }
     end
 
-    def self.regression predictions
-      # TODO: predictions within prediction_interval
-      rmse = 0
-      mae = 0
-      x = []
-      y = []
-      predictions.each do |cid,pred|
-        if pred[:value] and pred[:measured] 
-          x << pred[:measured].median
-          y << pred[:value]
-          error = pred[:value]-pred[:measured].median
-          rmse += error**2
-          mae += error.abs
-        else
-          warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
-          $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+    module RegressionStatistics
+
+      def statistics
+        # TODO: predictions within prediction_interval
+        rmse = 0
+        mae = 0
+        x = []
+        y = []
+        predictions.each do |cid,pred|
+          if pred[:value] and pred[:measurements] 
+            x << pred[:measurements].median
+            y << pred[:value]
+            error = pred[:value]-pred[:measurements].median
+            rmse += error**2
+            mae += error.abs
+          else
+            warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+            $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+          end
         end
+        R.assign "measurement", x
+        R.assign "prediction", y
+        R.eval "r <- cor(measurement,prediction,use='pairwise')"
+        r = R.eval("r").to_ruby
+
+        mae = mae/predictions.size
+        rmse = Math.sqrt(rmse/predictions.size)
+        $logger.debug "R^2 #{r**2}"
+        $logger.debug "RMSE #{rmse}"
+        $logger.debug "MAE #{mae}"
+        {
+          :mae => mae,
+          :rmse => rmse,
+          :r_squared => r**2,
+        }
       end
-      R.assign "measurement", x
-      R.assign "prediction", y
-      R.eval "r <- cor(measurement,prediction,use='pairwise')"
-      r = R.eval("r").to_ruby
 
-      mae = mae/predictions.size
-      rmse = Math.sqrt(rmse/predictions.size)
-      $logger.debug "R^2 #{r**2}"
-      $logger.debug "RMSE #{rmse}"
-      $logger.debug "MAE #{mae}"
-      {
-        :mae => mae,
-        :rmse => rmse,
-        :r_squared => r**2,
-        :finished_at => Time.now
-      }
-    end
+      def correlation_plot 
+        unless correlation_plot_id
+          tmpfile = "/tmp/#{id.to_s}_correlation.pdf"
+          x = []
+          y = []
+          feature = Feature.find(predictions.first.last["prediction_feature_id"])
+          predictions.each do |sid,p|
+            x << p["value"]
+            y << p["measurements"].median
+          end
+          R.assign "measurement", x
+          R.assign "prediction", y
+          R.eval "all = c(measurement,prediction)"
+          R.eval "range = c(min(all), max(all))"
+          title = feature.name
+          title += "[#{feature.unit}]" if feature.unit and !feature.unit.blank?
+          R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)"
+          R.eval "image = image + geom_abline(intercept=0, slope=1)"
+          R.eval "ggsave(file='#{tmpfile}', plot=image)"
+          file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.png")
+          plot_id = $gridfs.insert_one(file)
+          update(:correlation_plot_id => plot_id)
+        end
+        $gridfs.find_one(_id: correlation_plot_id).data
+      end
 
-    def self.correlation_plot id, predictions
-      tmpfile = "/tmp/#{id.to_s}_correlation.png"
-      x = []
-      y = []
-      predictions.each do |sid,p|
-        x << p["value"]
-        y << p["measured"].median
+      def worst_predictions n: 5, show_neigbors: true, show_common_descriptors: false
+        worst_predictions = predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}[0,n]
+        worst_predictions.collect do |p|
+          substance = Substance.find(p.first)
+          prediction = p[1]
+          if show_neigbors
+            neighbors = prediction["neighbors"].collect do |n|
+              common_descriptors = []
+              if show_common_descriptors
+                common_descriptors = n["common_descriptors"].collect do |d|
+                  f=Feature.find(d)
+                  {
+                    :id => f.id.to_s,
+                    :name => "#{f.name} (#{f.conditions})",
+                    :p_value => d[:p_value],
+                    :r_squared => d[:r_squared],
+                  }
+                end
+              else
+                common_descriptors = n["common_descriptors"].size
+              end
+              {
+                :name => Substance.find(n["_id"]).name,
+                :id => n["_id"].to_s,
+                :common_descriptors => common_descriptors
+              }
+            end
+          else
+            neighbors = prediction["neighbors"].size
+          end
+          {
+            :id => substance.id.to_s,
+            :name => substance.name,
+            :feature => Feature.find(prediction["prediction_feature_id"]).name,
+            :error => (prediction["value"] - prediction["measurements"].median).abs,
+            :prediction => prediction["value"],
+            :measurements => prediction["measurements"],
+            :neighbors => neighbors
+          }
+        end
       end
-      R.assign "measurement", x
-      R.assign "prediction", y
-      R.eval "all = c(measurement,prediction)"
-      R.eval "range = c(min(all), max(all))"
-      # TODO units
-      R.eval "image = qplot(prediction,measurement,main='',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)"
-      R.eval "image = image + geom_abline(intercept=0, slope=1)"
-      R.eval "ggsave(file='#{tmpfile}', plot=image)"
-      file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.png")
-      plot_id = $gridfs.insert_one(file)
-      plot_id
     end
   end
 end
author	Christoph Helma <helma@in-silico.ch>	2016-05-31 18:08:08 +0200
committer	Christoph Helma <helma@in-silico.ch>	2016-05-31 18:08:08 +0200
commit	b515a0cfedb887a2af753db6e4a08ae1af430cad (patch)
tree	5d69d89d0031d581e932272aeb741ee38a0106d6 /lib/validation-statistics.rb
parent	f46ba3b7262f5b551c81fc9396c5b7f0cac7f030 (diff)