From 815cf6ba1543fc323eb7cbd1202fadbf03bcfbca Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Wed, 13 Apr 2016 15:35:01 +0200
Subject: new files added

---
 lib/validation-statistics.rb | 100 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 lib/validation-statistics.rb

(limited to 'lib/validation-statistics.rb')

diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
new file mode 100644
index 0000000..570b2d4
--- /dev/null
+++ b/lib/validation-statistics.rb
@@ -0,0 +1,100 @@
+module OpenTox
+  class ValidationStatistics
+    include OpenTox
+    def self.classification predictions, accept_values
+      confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
+      weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
+      true_rate = {}
+      predictivity = {}
+      nr_instances = 0
+      predictions.each do |cid,pred|
+        # TODO use measured majority class
+        if pred[:measured].uniq.size == 1
+          m = pred[:measured].first
+        #pred[:measured].each do |m|
+          if pred[:value] == m
+            if pred[:value] == accept_values[0]
+              confusion_matrix[0][0] += 1
+              weighted_confusion_matrix[0][0] += pred[:confidence]
+              nr_instances += 1
+            elsif pred[:value] == accept_values[1]
+              confusion_matrix[1][1] += 1
+              weighted_confusion_matrix[1][1] += pred[:confidence]
+              nr_instances += 1
+            end
+          elsif pred[:value] != m
+            if pred[:value] == accept_values[0]
+              confusion_matrix[0][1] += 1
+              weighted_confusion_matrix[0][1] += pred[:confidence]
+              nr_instances += 1
+            elsif pred[:value] == accept_values[1]
+              confusion_matrix[1][0] += 1
+              weighted_confusion_matrix[1][0] += pred[:confidence]
+              nr_instances += 1
+            end
+          end
+        end
+      end
+      true_rate = {}
+      predictivity = {}
+      accept_values.each_with_index do |v,i|
+        true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
+        predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
+      end
+      confidence_sum = 0
+      weighted_confusion_matrix.each do |r|
+        r.each do |c|
+          confidence_sum += c
+        end
+      end
+      accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f
+      $logger.debug "Accuracy #{accuracy}"
+      {
+        :accept_values => accept_values,
+        :confusion_matrix => confusion_matrix,
+        :weighted_confusion_matrix => weighted_confusion_matrix,
+        :accuracy => accuracy,
+        :weighted_accuracy => (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
+        :true_rate => true_rate,
+        :predictivity => predictivity,
+        :finished_at => Time.now
+      }
+    end
+
+    def self.regression predictions
+      # TODO: prediction intervals
+      rmse = 0
+      mae = 0
+      x = []
+      y = []
+      predictions.each do |cid,pred|
+        if pred[:value] and pred[:measured] #and pred[:measured] != [nil]
+          x << -Math.log10(pred[:measured].median)
+          y << -Math.log10(pred[:value])
+          error = Math.log10(pred[:value])-Math.log10(pred[:measured].median)
+          rmse += error**2
+          mae += error.abs
+        else
+          warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+          $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+        end
+      end
+      R.assign "measurement", x
+      R.assign "prediction", y
+      R.eval "r <- cor(measurement,prediction,use='complete')"
+      r = R.eval("r").to_ruby
+
+      mae = mae/predictions.size
+      rmse = Math.sqrt(rmse/predictions.size)
+      $logger.debug "R^2 #{r**2}"
+      $logger.debug "RMSE #{rmse}"
+      $logger.debug "MAE #{mae}"
+      {
+        :mae => mae,
+        :rmse => rmse,
+        :r_squared => r**2,
+        :finished_at => Time.now
+      }
+    end
+  end
+end
-- 
cgit v1.2.3


From 05386e748270c337c66f6f379317ea4b25905236 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Wed, 4 May 2016 19:24:42 +0200
Subject: first reasonable results for nanoparticle crossvalidation

---
 lib/validation-statistics.rb | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'lib/validation-statistics.rb')

diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index c6b2a07..b7c95f6 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -63,16 +63,15 @@ module OpenTox
     end
 
     def self.regression predictions
-      # TODO: prediction intervals
       rmse = 0
       mae = 0
       x = []
       y = []
       predictions.each do |cid,pred|
         if pred[:value] and pred[:measured] #and pred[:measured] != [nil]
-          x << -Math.log10(pred[:measured].median)
-          y << -Math.log10(pred[:value])
-          error = Math.log10(pred[:value])-Math.log10(pred[:measured].median)
+          x << pred[:measured].median
+          y << pred[:value]
+          error = pred[:value]-pred[:measured].median
           rmse += error**2
           mae += error.abs
         else
-- 
cgit v1.2.3


From 7794086d367fb256c3673d7578b23ec2fb83e6ed Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Mon, 9 May 2016 14:05:29 +0200
Subject: physchem crossvalidation fixed

---
 lib/validation-statistics.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/validation-statistics.rb')

diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index b7c95f6..0079bae 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -68,7 +68,7 @@ module OpenTox
       x = []
       y = []
       predictions.each do |cid,pred|
-        if pred[:value] and pred[:measured] #and pred[:measured] != [nil]
+        if pred[:value] and pred[:measured] 
           x << pred[:measured].median
           y << pred[:value]
           error = pred[:value]-pred[:measured].median
-- 
cgit v1.2.3


From c90644211e214a50f6fdb3a936bf247f45f1f4be Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Fri, 13 May 2016 13:38:24 +0200
Subject: compound tests fixed

---
 lib/validation-statistics.rb | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'lib/validation-statistics.rb')

diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 0079bae..2d6b56e 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -96,5 +96,29 @@ module OpenTox
         :finished_at => Time.now
       }
     end
+
+  end
+  
+  module Plot
+
+    def plot_id 
+      tmpfile = "/tmp/#{id.to_s}_correlation.png"
+      x = []
+      y = []
+      predictions.each do |sid,p|
+        x << p["value"]
+        y << p["measured"].median
+      end
+      R.assign "measurement", x
+      R.assign "prediction", y
+      R.eval "all = c(measurement,prediction)"
+      R.eval "range = c(min(all), max(all))"
+      R.eval "image = qplot(prediction,measurement,main='',asp=1,xlim=range, ylim=range)"
+      R.eval "image = image + geom_abline(intercept=0, slope=1)"
+      R.eval "ggsave(file='#{tmpfile}', plot=image)"
+      file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png")
+      plot_id = $gridfs.insert_one(file)
+      plot_id
+    end
   end
 end
-- 
cgit v1.2.3


From b2d80ad2e470fcb41af4b747142e5693f2fa4615 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Tue, 24 May 2016 13:05:53 +0200
Subject: dataset tests fixed

---
 lib/validation-statistics.rb | 1 +
 1 file changed, 1 insertion(+)

(limited to 'lib/validation-statistics.rb')

diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 2d6b56e..3c52b15 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -68,6 +68,7 @@ module OpenTox
       x = []
       y = []
       predictions.each do |cid,pred|
+        p pred
         if pred[:value] and pred[:measured] 
           x << pred[:measured].median
           y << pred[:value]
-- 
cgit v1.2.3


From cc08e6beda7f7d70ebf6c6929a22d1a0cd7c1a20 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Tue, 24 May 2016 15:41:24 +0200
Subject: tests fixed. DescriptorTest#test_compound_all may fail within all.rb

---
 lib/validation-statistics.rb | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'lib/validation-statistics.rb')

diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 3c52b15..156353a 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -8,10 +8,11 @@ module OpenTox
       predictivity = {}
       nr_instances = 0
       predictions.each do |cid,pred|
-        # TODO use measured majority class
-        if pred[:measured].uniq.size == 1
+        # TODO
+        # use predictions without probabilities (single neighbor)??
+        # use measured majority class??
+        if pred[:measured].uniq.size == 1 and pred[:probabilities]
           m = pred[:measured].first
-        #pred[:measured].each do |m|
           if pred[:value] == m
             if pred[:value] == accept_values[0]
               confusion_matrix[0][0] += 1
@@ -63,12 +64,12 @@ module OpenTox
     end
 
     def self.regression predictions
+      # TODO: predictions within prediction_interval
       rmse = 0
       mae = 0
       x = []
       y = []
       predictions.each do |cid,pred|
-        p pred
         if pred[:value] and pred[:measured] 
           x << pred[:measured].median
           y << pred[:value]
-- 
cgit v1.2.3


From f46ba3b7262f5b551c81fc9396c5b7f0cac7f030 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Fri, 27 May 2016 19:16:16 +0200
Subject: first correlation of nanoparticle predictions

---
 lib/validation-statistics.rb | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'lib/validation-statistics.rb')

diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 156353a..e61543b 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -83,7 +83,7 @@ module OpenTox
       end
       R.assign "measurement", x
       R.assign "prediction", y
-      R.eval "r <- cor(measurement,prediction,use='complete')"
+      R.eval "r <- cor(measurement,prediction,use='pairwise')"
       r = R.eval("r").to_ruby
 
       mae = mae/predictions.size
@@ -99,11 +99,7 @@ module OpenTox
       }
     end
 
-  end
-  
-  module Plot
-
-    def plot_id 
+    def self.correlation_plot id, predictions
       tmpfile = "/tmp/#{id.to_s}_correlation.png"
       x = []
       y = []
@@ -115,10 +111,11 @@ module OpenTox
       R.assign "prediction", y
       R.eval "all = c(measurement,prediction)"
       R.eval "range = c(min(all), max(all))"
-      R.eval "image = qplot(prediction,measurement,main='',asp=1,xlim=range, ylim=range)"
+      # TODO units
+      R.eval "image = qplot(prediction,measurement,main='',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)"
       R.eval "image = image + geom_abline(intercept=0, slope=1)"
       R.eval "ggsave(file='#{tmpfile}', plot=image)"
-      file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.png")
+      file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.png")
       plot_id = $gridfs.insert_one(file)
       plot_id
     end
-- 
cgit v1.2.3


From b515a0cfedb887a2af753db6e4a08ae1af430cad Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Tue, 31 May 2016 18:08:08 +0200
Subject: cleanup of validation modules/classes

---
 lib/validation-statistics.rb | 292 +++++++++++++++++++++++++++----------------
 1 file changed, 186 insertions(+), 106 deletions(-)

(limited to 'lib/validation-statistics.rb')

diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index e61543b..816824b 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -1,123 +1,203 @@
 module OpenTox
-  class ValidationStatistics
-    include OpenTox
-    def self.classification predictions, accept_values
-      confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
-      weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
-      true_rate = {}
-      predictivity = {}
-      nr_instances = 0
-      predictions.each do |cid,pred|
-        # TODO
-        # use predictions without probabilities (single neighbor)??
-        # use measured majority class??
-        if pred[:measured].uniq.size == 1 and pred[:probabilities]
-          m = pred[:measured].first
-          if pred[:value] == m
-            if pred[:value] == accept_values[0]
-              confusion_matrix[0][0] += 1
-              weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]]
-              nr_instances += 1
-            elsif pred[:value] == accept_values[1]
-              confusion_matrix[1][1] += 1
-              weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]]
-              nr_instances += 1
-            end
-          elsif pred[:value] != m
-            if pred[:value] == accept_values[0]
-              confusion_matrix[0][1] += 1
-              weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]]
-              nr_instances += 1
-            elsif pred[:value] == accept_values[1]
-              confusion_matrix[1][0] += 1
-              weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]]
-              nr_instances += 1
+  module Validation
+    module ClassificationStatistics
+
+      def statistics 
+        self.accept_values = model.prediction_feature.accept_values
+        self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
+        self.weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
+        true_rate = {}
+        predictivity = {}
+        nr_instances = 0
+        predictions.each do |cid,pred|
+          # TODO
+          # use predictions without probabilities (single neighbor)??
+          # use measured majority class??
+          if pred[:measurements].uniq.size == 1 and pred[:probabilities]
+            m = pred[:measurements].first
+            if pred[:value] == m
+              if pred[:value] == accept_values[0]
+                confusion_matrix[0][0] += 1
+                weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]]
+                nr_instances += 1
+              elsif pred[:value] == accept_values[1]
+                confusion_matrix[1][1] += 1
+                weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]]
+                nr_instances += 1
+              end
+            elsif pred[:value] != m
+              if pred[:value] == accept_values[0]
+                confusion_matrix[0][1] += 1
+                weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]]
+                nr_instances += 1
+              elsif pred[:value] == accept_values[1]
+                confusion_matrix[1][0] += 1
+                weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]]
+                nr_instances += 1
+              end
             end
           end
         end
+        true_rate = {}
+        predictivity = {}
+        accept_values.each_with_index do |v,i|
+          true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
+          predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
+        end
+        confidence_sum = 0
+        weighted_confusion_matrix.each do |r|
+          r.each do |c|
+            confidence_sum += c
+          end
+        end
+        self.accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f
+        self.weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f
+        $logger.debug "Accuracy #{accuracy}"
+        save
+        {
+          :accept_values => accept_values,
+          :confusion_matrix => confusion_matrix,
+          :weighted_confusion_matrix => weighted_confusion_matrix,
+          :accuracy => accuracy,
+          :weighted_accuracy => weighted_accuracy,
+          :true_rate => true_rate,
+          :predictivity => predictivity,
+        }
       end
-      true_rate = {}
-      predictivity = {}
-      accept_values.each_with_index do |v,i|
-        true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
-        predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
-      end
-      confidence_sum = 0
-      weighted_confusion_matrix.each do |r|
-        r.each do |c|
-          confidence_sum += c
+
+      def confidence_plot
+        unless confidence_plot_id
+          tmpfile = "/tmp/#{id.to_s}_confidence.svg"
+          accuracies = []
+          confidences = []
+          correct_predictions = 0
+          incorrect_predictions = 0
+          predictions.each do |p|
+            p[:measurements].each do |db_act|
+              if p[:value] 
+                p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1
+                accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
+                confidences << p[:confidence]
+
+              end
+            end
+          end
+          R.assign "accuracy", accuracies
+          R.assign "confidence", confidences
+          R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
+          R.eval "ggsave(file='#{tmpfile}', plot=image)"
+          file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
+          plot_id = $gridfs.insert_one(file)
+          update(:confidence_plot_id => plot_id)
         end
+        $gridfs.find_one(_id: confidence_plot_id).data
       end
-      accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f
-      weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f
-      $logger.debug "Accuracy #{accuracy}"
-      {
-        :accept_values => accept_values,
-        :confusion_matrix => confusion_matrix,
-        :weighted_confusion_matrix => weighted_confusion_matrix,
-        :accuracy => accuracy,
-        :weighted_accuracy => weighted_accuracy,
-        :true_rate => true_rate,
-        :predictivity => predictivity,
-        :finished_at => Time.now
-      }
     end
 
-    def self.regression predictions
-      # TODO: predictions within prediction_interval
-      rmse = 0
-      mae = 0
-      x = []
-      y = []
-      predictions.each do |cid,pred|
-        if pred[:value] and pred[:measured] 
-          x << pred[:measured].median
-          y << pred[:value]
-          error = pred[:value]-pred[:measured].median
-          rmse += error**2
-          mae += error.abs
-        else
-          warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
-          $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+    module RegressionStatistics
+
+      def statistics
+        # TODO: predictions within prediction_interval
+        rmse = 0
+        mae = 0
+        x = []
+        y = []
+        predictions.each do |cid,pred|
+          if pred[:value] and pred[:measurements] 
+            x << pred[:measurements].median
+            y << pred[:value]
+            error = pred[:value]-pred[:measurements].median
+            rmse += error**2
+            mae += error.abs
+          else
+            warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+            $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+          end
         end
+        R.assign "measurement", x
+        R.assign "prediction", y
+        R.eval "r <- cor(measurement,prediction,use='pairwise')"
+        r = R.eval("r").to_ruby
+
+        mae = mae/predictions.size
+        rmse = Math.sqrt(rmse/predictions.size)
+        $logger.debug "R^2 #{r**2}"
+        $logger.debug "RMSE #{rmse}"
+        $logger.debug "MAE #{mae}"
+        {
+          :mae => mae,
+          :rmse => rmse,
+          :r_squared => r**2,
+        }
       end
-      R.assign "measurement", x
-      R.assign "prediction", y
-      R.eval "r <- cor(measurement,prediction,use='pairwise')"
-      r = R.eval("r").to_ruby
 
-      mae = mae/predictions.size
-      rmse = Math.sqrt(rmse/predictions.size)
-      $logger.debug "R^2 #{r**2}"
-      $logger.debug "RMSE #{rmse}"
-      $logger.debug "MAE #{mae}"
-      {
-        :mae => mae,
-        :rmse => rmse,
-        :r_squared => r**2,
-        :finished_at => Time.now
-      }
-    end
+      def correlation_plot 
+        unless correlation_plot_id
+          tmpfile = "/tmp/#{id.to_s}_correlation.pdf"
+          x = []
+          y = []
+          feature = Feature.find(predictions.first.last["prediction_feature_id"])
+          predictions.each do |sid,p|
+            x << p["value"]
+            y << p["measurements"].median
+          end
+          R.assign "measurement", x
+          R.assign "prediction", y
+          R.eval "all = c(measurement,prediction)"
+          R.eval "range = c(min(all), max(all))"
+          title = feature.name
+          title += "[#{feature.unit}]" if feature.unit and !feature.unit.blank?
+          R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)"
+          R.eval "image = image + geom_abline(intercept=0, slope=1)"
+          R.eval "ggsave(file='#{tmpfile}', plot=image)"
+          file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.png")
+          plot_id = $gridfs.insert_one(file)
+          update(:correlation_plot_id => plot_id)
+        end
+        $gridfs.find_one(_id: correlation_plot_id).data
+      end
 
-    def self.correlation_plot id, predictions
-      tmpfile = "/tmp/#{id.to_s}_correlation.png"
-      x = []
-      y = []
-      predictions.each do |sid,p|
-        x << p["value"]
-        y << p["measured"].median
+      def worst_predictions n: 5, show_neigbors: true, show_common_descriptors: false
+        worst_predictions = predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}[0,n]
+        worst_predictions.collect do |p|
+          substance = Substance.find(p.first)
+          prediction = p[1]
+          if show_neigbors
+            neighbors = prediction["neighbors"].collect do |n|
+              common_descriptors = []
+              if show_common_descriptors
+                common_descriptors = n["common_descriptors"].collect do |d|
+                  f=Feature.find(d)
+                  {
+                    :id => f.id.to_s,
+                    :name => "#{f.name} (#{f.conditions})",
+                    :p_value => d[:p_value],
+                    :r_squared => d[:r_squared],
+                  }
+                end
+              else
+                common_descriptors = n["common_descriptors"].size
+              end
+              {
+                :name => Substance.find(n["_id"]).name,
+                :id => n["_id"].to_s,
+                :common_descriptors => common_descriptors
+              }
+            end
+          else
+            neighbors = prediction["neighbors"].size
+          end
+          {
+            :id => substance.id.to_s,
+            :name => substance.name,
+            :feature => Feature.find(prediction["prediction_feature_id"]).name,
+            :error => (prediction["value"] - prediction["measurements"].median).abs,
+            :prediction => prediction["value"],
+            :measurements => prediction["measurements"],
+            :neighbors => neighbors
+          }
+        end
       end
-      R.assign "measurement", x
-      R.assign "prediction", y
-      R.eval "all = c(measurement,prediction)"
-      R.eval "range = c(min(all), max(all))"
-      # TODO units
-      R.eval "image = qplot(prediction,measurement,main='',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)"
-      R.eval "image = image + geom_abline(intercept=0, slope=1)"
-      R.eval "ggsave(file='#{tmpfile}', plot=image)"
-      file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.png")
-      plot_id = $gridfs.insert_one(file)
-      plot_id
     end
   end
 end
-- 
cgit v1.2.3


From 65b69d4c35890a7a2d2992108f0cf4eb5202dd1b Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Wed, 1 Jun 2016 10:37:00 +0200
Subject: validation tests fixed

---
 lib/validation-statistics.rb | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'lib/validation-statistics.rb')

diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 816824b..e42d298 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -98,8 +98,8 @@ module OpenTox
 
       def statistics
         # TODO: predictions within prediction_interval
-        rmse = 0
-        mae = 0
+        self.rmse = 0
+        self.mae = 0
         x = []
         y = []
         predictions.each do |cid,pred|
@@ -107,8 +107,8 @@ module OpenTox
             x << pred[:measurements].median
             y << pred[:value]
             error = pred[:value]-pred[:measurements].median
-            rmse += error**2
-            mae += error.abs
+            self.rmse += error**2
+            self.mae += error.abs
           else
             warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
             $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
@@ -117,17 +117,18 @@ module OpenTox
         R.assign "measurement", x
         R.assign "prediction", y
         R.eval "r <- cor(measurement,prediction,use='pairwise')"
-        r = R.eval("r").to_ruby
+        self.r_squared = R.eval("r").to_ruby**2
 
-        mae = mae/predictions.size
-        rmse = Math.sqrt(rmse/predictions.size)
-        $logger.debug "R^2 #{r**2}"
+        self.mae = self.mae/predictions.size
+        self.rmse = Math.sqrt(self.rmse/predictions.size)
+        $logger.debug "R^2 #{r_squared}"
         $logger.debug "RMSE #{rmse}"
         $logger.debug "MAE #{mae}"
+        save
         {
           :mae => mae,
           :rmse => rmse,
-          :r_squared => r**2,
+          :r_squared => r_squared,
         }
       end
 
-- 
cgit v1.2.3


From 85f2308c101b4778508c2d767e08af4cfd671b7b Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 2 Jun 2016 12:22:39 +0200
Subject: local pls regression for nanoparticles

---
 lib/validation-statistics.rb | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'lib/validation-statistics.rb')

diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index e42d298..6b252b1 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -100,6 +100,8 @@ module OpenTox
         # TODO: predictions within prediction_interval
         self.rmse = 0
         self.mae = 0
+        #self.within_prediction_interval = 0
+        #self.outside_prediction_interval = 0
         x = []
         y = []
         predictions.each do |cid,pred|
@@ -109,6 +111,9 @@ module OpenTox
             error = pred[:value]-pred[:measurements].median
             self.rmse += error**2
             self.mae += error.abs
+            #if pred[:prediction_interval]
+              #if pred[:measurements]
+            #end
           else
             warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
             $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
@@ -118,7 +123,6 @@ module OpenTox
         R.assign "prediction", y
         R.eval "r <- cor(measurement,prediction,use='pairwise')"
         self.r_squared = R.eval("r").to_ruby**2
-
         self.mae = self.mae/predictions.size
         self.rmse = Math.sqrt(self.rmse/predictions.size)
         $logger.debug "R^2 #{r_squared}"
-- 
cgit v1.2.3


From f7e87b45f15083e5fcdea64821f06ed93ece4c4e Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Tue, 7 Jun 2016 18:07:28 +0200
Subject: (repeated)crossvalidation plots

---
 lib/validation-statistics.rb | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'lib/validation-statistics.rb')

diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 6b252b1..9aa9cff 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -136,9 +136,9 @@ module OpenTox
         }
       end
 
-      def correlation_plot 
+      def correlation_plot format: "png"
         unless correlation_plot_id
-          tmpfile = "/tmp/#{id.to_s}_correlation.pdf"
+          tmpfile = "/tmp/#{id.to_s}_correlation.#{format}"
           x = []
           y = []
           feature = Feature.find(predictions.first.last["prediction_feature_id"])
@@ -155,7 +155,7 @@ module OpenTox
           R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)"
           R.eval "image = image + geom_abline(intercept=0, slope=1)"
           R.eval "ggsave(file='#{tmpfile}', plot=image)"
-          file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.png")
+          file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}")
           plot_id = $gridfs.insert_one(file)
           update(:correlation_plot_id => plot_id)
         end
-- 
cgit v1.2.3


From f93aad7227c7bb3702fd28aab2d289f1ca9ce7e9 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 21 Jul 2016 17:35:20 +0200
Subject: correlation plot fixed

---
 lib/validation-statistics.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'lib/validation-statistics.rb')

diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 9aa9cff..3582c71 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -143,8 +143,8 @@ module OpenTox
           y = []
           feature = Feature.find(predictions.first.last["prediction_feature_id"])
           predictions.each do |sid,p|
-            x << p["value"]
-            y << p["measurements"].median
+            x << p["measurements"].median
+            y << p["value"]
           end
           R.assign "measurement", x
           R.assign "prediction", y
-- 
cgit v1.2.3


From 8519274487166d75b3b9ae28e61f7a7be9f7e83c Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 27 Oct 2016 11:58:07 +0200
Subject: probability plot for classification validations

---
 lib/validation-statistics.rb | 64 ++++++++++++++++++++++++++------------------
 1 file changed, 38 insertions(+), 26 deletions(-)

(limited to 'lib/validation-statistics.rb')

diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 3582c71..4ab4b13 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -65,43 +65,44 @@ module OpenTox
         }
       end
 
-      def confidence_plot
-        unless confidence_plot_id
-          tmpfile = "/tmp/#{id.to_s}_confidence.svg"
+      def probability_plot format: "pdf"
+        #unless probability_plot_id
+          tmpfile = "/tmp/#{id.to_s}_probability.#{format}"
           accuracies = []
-          confidences = []
+          probabilities = []
           correct_predictions = 0
           incorrect_predictions = 0
-          predictions.each do |p|
-            p[:measurements].each do |db_act|
-              if p[:value] 
-                p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1
-                accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
-                confidences << p[:confidence]
-
-              end
+          pp = []
+          predictions.values.select{|p| p["probabilities"]}.compact.each do |p|
+            p["measurements"].each do |m|
+              pp << [ p["probabilities"][p["value"]], p["value"] == m ]
             end
           end
+          pp.sort_by!{|p| 1-p.first}
+          pp.each do |p|
+            p[1] ? correct_predictions += 1 : incorrect_predictions += 1
+            accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
+            probabilities << p[0]
+          end
           R.assign "accuracy", accuracies
-          R.assign "confidence", confidences
-          R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
+          R.assign "probability", probabilities
+          R.eval "image = qplot(probability,accuracy)+ylab('Accumulated accuracy')+xlab('Prediction probability')+ylim(c(0,1))+scale_x_reverse()+geom_line()"
           R.eval "ggsave(file='#{tmpfile}', plot=image)"
-          file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
+          file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.svg")
           plot_id = $gridfs.insert_one(file)
-          update(:confidence_plot_id => plot_id)
-        end
-        $gridfs.find_one(_id: confidence_plot_id).data
+          update(:probability_plot_id => plot_id)
+        #end
+        $gridfs.find_one(_id: probability_plot_id).data
       end
     end
 
     module RegressionStatistics
 
       def statistics
-        # TODO: predictions within prediction_interval
         self.rmse = 0
         self.mae = 0
-        #self.within_prediction_interval = 0
-        #self.outside_prediction_interval = 0
+        self.within_prediction_interval = 0
+        self.out_of_prediction_interval = 0
         x = []
         y = []
         predictions.each do |cid,pred|
@@ -111,9 +112,13 @@ module OpenTox
             error = pred[:value]-pred[:measurements].median
             self.rmse += error**2
             self.mae += error.abs
-            #if pred[:prediction_interval]
-              #if pred[:measurements]
-            #end
+            if pred[:prediction_interval]
+              if pred[:measurements].median >= pred[:prediction_interval][0] and pred[:measurements].median <= pred[:prediction_interval][1]
+                self.within_prediction_interval += 1
+              else
+                self.out_of_prediction_interval += 1
+              end
+            end
           else
             warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
             $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
@@ -128,16 +133,23 @@ module OpenTox
         $logger.debug "R^2 #{r_squared}"
         $logger.debug "RMSE #{rmse}"
         $logger.debug "MAE #{mae}"
+        $logger.debug "#{percent_within_prediction_interval.round(2)}% measurements within prediction interval"
         save
         {
           :mae => mae,
           :rmse => rmse,
           :r_squared => r_squared,
+          :within_prediction_interval => within_prediction_interval,
+          :out_of_prediction_interval => out_of_prediction_interval,
         }
       end
 
+      def percent_within_prediction_interval
+        100*within_prediction_interval.to_f/(within_prediction_interval+out_of_prediction_interval)
+      end
+
       def correlation_plot format: "png"
-        unless correlation_plot_id
+        #unless correlation_plot_id
           tmpfile = "/tmp/#{id.to_s}_correlation.#{format}"
           x = []
           y = []
@@ -158,7 +170,7 @@ module OpenTox
           file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}")
           plot_id = $gridfs.insert_one(file)
           update(:correlation_plot_id => plot_id)
-        end
+        #end
         $gridfs.find_one(_id: correlation_plot_id).data
       end
 
-- 
cgit v1.2.3


From 5418c2477a1a48b06f97d693f6c117336aec5b4c Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 27 Oct 2016 12:09:06 +0200
Subject: GridFS storage for plots.

---
 lib/validation-statistics.rb | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'lib/validation-statistics.rb')

diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 4ab4b13..b251bdb 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -66,7 +66,7 @@ module OpenTox
       end
 
       def probability_plot format: "pdf"
-        #unless probability_plot_id
+        unless probability_plot_id
           tmpfile = "/tmp/#{id.to_s}_probability.#{format}"
           accuracies = []
           probabilities = []
@@ -91,7 +91,7 @@ module OpenTox
           file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.svg")
           plot_id = $gridfs.insert_one(file)
           update(:probability_plot_id => plot_id)
-        #end
+        end
         $gridfs.find_one(_id: probability_plot_id).data
       end
     end
@@ -133,7 +133,7 @@ module OpenTox
         $logger.debug "R^2 #{r_squared}"
         $logger.debug "RMSE #{rmse}"
         $logger.debug "MAE #{mae}"
-        $logger.debug "#{percent_within_prediction_interval.round(2)}% measurements within prediction interval"
+        $logger.debug "#{percent_within_prediction_interval.round(2)}% of measurements within prediction interval"
         save
         {
           :mae => mae,
@@ -149,7 +149,7 @@ module OpenTox
       end
 
       def correlation_plot format: "png"
-        #unless correlation_plot_id
+        unless correlation_plot_id
           tmpfile = "/tmp/#{id.to_s}_correlation.#{format}"
           x = []
           y = []
@@ -170,7 +170,7 @@ module OpenTox
           file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}")
           plot_id = $gridfs.insert_one(file)
           update(:correlation_plot_id => plot_id)
-        #end
+        end
         $gridfs.find_one(_id: correlation_plot_id).data
       end
 
-- 
cgit v1.2.3


From 280f81dcffb3b8b929ff9cbe92ba17403f5a9dd3 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Fri, 28 Oct 2016 12:31:53 +0200
Subject: adjusted r^2 removed (does not apply well to local models)

---
 lib/validation-statistics.rb | 1 +
 1 file changed, 1 insertion(+)

(limited to 'lib/validation-statistics.rb')

diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index b251bdb..799bb34 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -139,6 +139,7 @@ module OpenTox
           :mae => mae,
           :rmse => rmse,
           :r_squared => r_squared,
+          :r_squared_adjusted => r_squared_adjusted,
           :within_prediction_interval => within_prediction_interval,
           :out_of_prediction_interval => out_of_prediction_interval,
         }
-- 
cgit v1.2.3


From c6e86fc1bfee7cb91782dd7067408d78a8e48ed9 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Tue, 8 Nov 2016 16:04:49 +0100
Subject: probability plot for classification

---
 lib/validation-statistics.rb | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'lib/validation-statistics.rb')

diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 799bb34..b6f8a60 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -66,8 +66,13 @@ module OpenTox
       end
 
       def probability_plot format: "pdf"
-        unless probability_plot_id
-          tmpfile = "/tmp/#{id.to_s}_probability.#{format}"
+        #unless probability_plot_id
+
+          #tmpdir = File.join(ENV["HOME"], "tmp")
+          tmpdir = "/tmp"
+          #p tmpdir
+          FileUtils.mkdir_p tmpdir
+          tmpfile = File.join(tmpdir,"#{id.to_s}_probability.#{format}")
           accuracies = []
           probabilities = []
           correct_predictions = 0
@@ -91,7 +96,7 @@ module OpenTox
           file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.svg")
           plot_id = $gridfs.insert_one(file)
           update(:probability_plot_id => plot_id)
-        end
+        #end
         $gridfs.find_one(_id: probability_plot_id).data
       end
     end
@@ -139,7 +144,6 @@ module OpenTox
           :mae => mae,
           :rmse => rmse,
           :r_squared => r_squared,
-          :r_squared_adjusted => r_squared_adjusted,
           :within_prediction_interval => within_prediction_interval,
           :out_of_prediction_interval => out_of_prediction_interval,
         }
-- 
cgit v1.2.3