From e63e97086ac05e7a86f1a53bdcbc72eec0cabf16 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Mon, 9 Nov 2015 14:58:34 +0100
Subject: leave one out validation implemented

---
 lib/compound.rb                 |  18 ++--
 lib/lazar.rb                    |   3 +-
 lib/leave-one-out-validation.rb | 205 ++++++++++++++++++++++++++++++++++++++++
 test/validation.rb              |  25 +++++
 4 files changed, 243 insertions(+), 8 deletions(-)
 create mode 100644 lib/leave-one-out-validation.rb

diff --git a/lib/compound.rb b/lib/compound.rb
index ad0eaba..d5a4cbb 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -344,16 +344,20 @@ module OpenTox
       return mg
     end
 
-   # Get mg from mmol
-   # @return [Float] value in mg
-   def mmol_to_mg(value, mw)
+    # Get mg from mmol
+    # @return [Float] value in mg
+    def mmol_to_mg(value, mw)
       mg = (value.to_f)*(mw.to_f)
       return mg
     end
 
-   # Get mg from logmg
-   # @return [Float] value in mg
-   def logmg_to_mg(value)
+    def mg_to_mmol mg
+      mg.to_f/molecular_weight
+    end
+
+    # Get mg from logmg
+    # @return [Float] value in mg
+    def logmg_to_mg(value)
       mg = 10**value.to_f
       return mg
     end
@@ -364,7 +368,7 @@ module OpenTox
       if self["molecular_weight"]==0.0 || self["molecular_weight"].nil?
         update(:molecular_weight => OpenTox::Algorithm::Descriptor.physchem(self, ["Openbabel.MW"]).first)
       end
-      self["molecular_weight"]
+      self["molecular_weight"].to_f
     end
 
 
diff --git a/lib/lazar.rb b/lib/lazar.rb
index cc66841..5d9bc19 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -60,7 +60,7 @@ ENV['FMINER_SILENT'] = 'true'
 ENV['FMINER_NR_HITS'] = 'true'
 
 # OpenTox classes and includes
-CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
+CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
 
 [ # be aware of the require sequence as it affects class/method overwrites
   "overwrite.rb",
@@ -80,6 +80,7 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","Repeat
   "regression.rb",
   "validation.rb",
   "crossvalidation.rb",
+  "leave-one-out-validation.rb",
   "experiment.rb",
 ].each{ |f| require_relative f }
 
diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb
new file mode 100644
index 0000000..9db10c6
--- /dev/null
+++ b/lib/leave-one-out-validation.rb
@@ -0,0 +1,205 @@
+module OpenTox
+
+  class LeaveOneOutValidation
+
+    field :model_id, type: BSON::ObjectId
+    field :dataset_id, type: BSON::ObjectId
+    field :nr_instances, type: Integer
+    field :nr_unpredicted, type: Integer
+    field :predictions, type: Array
+    field :finished_at, type: Time 
+
+    def self.create model
+      model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOutValidation : klass = RegressionLeaveOneOutValidation
+      loo = klass.new :model_id => model.id, :dataset_id => model.training_dataset_id
+      compound_ids = model.training_dataset.compound_ids
+      predictions = model.predict model.training_dataset.compounds
+      predictions = predictions.each_with_index {|p,i| p[:compound_id] = compound_ids[i]}
+      predictions.select!{|p| p[:database_activities] and !p[:database_activities].empty?}
+      loo.nr_instances = predictions.size
+      predictions.select!{|p| p[:value]} # remove unpredicted
+      loo.predictions = predictions.sort{|a,b| b[:confidence] <=> a[:confidence]}
+      loo.nr_unpredicted = loo.nr_instances - loo.predictions.size
+      loo.statistics
+      loo.save
+      loo
+    end
+
+    def model
+      Model::Lazar.find model_id
+    end
+  end
+
+  class ClassificationLeaveOneOutValidation < LeaveOneOutValidation
+
+    field :accept_values, type: Array
+    field :confusion_matrix, type: Array, default: []
+    field :weighted_confusion_matrix, type: Array, default: []
+    field :accuracy, type: Float
+    field :weighted_accuracy, type: Float
+    field :true_rate, type: Hash, default: {}
+    field :predictivity, type: Hash, default: {}
+    field :confidence_plot_id, type: BSON::ObjectId
+
+    def statistics
+      accept_values = Feature.find(model.prediction_feature_id).accept_values
+      confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
+      weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
+      predictions.each do |pred|
+        pred[:database_activities].each do |db_act|
+          if pred[:value]
+            if pred[:value] == db_act
+              if pred[:value] == accept_values[0]
+                confusion_matrix[0][0] += 1
+                weighted_confusion_matrix[0][0] += pred[:confidence]
+              elsif pred[:value] == accept_values[1]
+                confusion_matrix[1][1] += 1
+                weighted_confusion_matrix[1][1] += pred[:confidence]
+              end
+            else
+              if pred[:value] == accept_values[0]
+                confusion_matrix[0][1] += 1
+                weighted_confusion_matrix[0][1] += pred[:confidence]
+              elsif pred[:value] == accept_values[1]
+                confusion_matrix[1][0] += 1
+                weighted_confusion_matrix[1][0] += pred[:confidence]
+              end
+            end
+          end
+        end
+      end
+      accept_values.each_with_index do |v,i|
+        true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
+        predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
+      end
+      confidence_sum = 0
+      weighted_confusion_matrix.each do |r|
+        r.each do |c|
+          confidence_sum += c
+        end
+      end
+      update_attributes(
+        accept_values: accept_values,
+        confusion_matrix: confusion_matrix,
+        weighted_confusion_matrix: weighted_confusion_matrix,
+        accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
+        weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
+        true_rate: true_rate,
+        predictivity: predictivity,
+        finished_at: Time.now
+      )
+      $logger.debug "Accuracy #{accuracy}"
+    end
+
+    def confidence_plot
+      unless confidence_plot_id
+        tmpfile = "/tmp/#{id.to_s}_confidence.svg"
+        accuracies = []
+        confidences = []
+        correct_predictions = 0
+        incorrect_predictions = 0
+        predictions.each do |p|
+          p[:database_activities].each do |db_act|
+            if p[:value] 
+              p[:value] == db_act ? correct_predictions += 1 : incorrect_predictions += 1
+              accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
+              confidences << p[:confidence]
+
+            end
+          end
+        end
+        R.assign "accuracy", accuracies
+        R.assign "confidence", confidences
+        R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
+        R.eval "ggsave(file='#{tmpfile}', plot=image)"
+        file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
+        plot_id = $gridfs.insert_one(file)
+        update(:confidence_plot_id => plot_id)
+      end
+      $gridfs.find_one(_id: confidence_plot_id).data
+    end
+  end
+  
+
+  class RegressionLeaveOneOutValidation < LeaveOneOutValidation
+
+
+    field :rmse, type: Float, default: 0.0
+    field :mae, type: Float, default: 0
+    field :weighted_rmse, type: Float, default: 0
+    field :weighted_mae, type: Float, default: 0
+    field :r_squared, type: Float
+    field :correlation_plot_id, type: BSON::ObjectId
+    field :confidence_plot_id, type: BSON::ObjectId
+
+    def statistics
+      confidence_sum = 0
+      predicted_values = []
+      measured_values = []
+      predictions.each do |pred|
+        pred[:database_activities].each do |activity|
+          if pred[:value]
+            predicted_values << pred[:value]
+            measured_values << activity
+            error = Math.log10(pred[:value])-Math.log10(activity)
+            self.rmse += error**2
+            self.weighted_rmse += pred[:confidence]*error**2
+            self.mae += error.abs
+            self.weighted_mae += pred[:confidence]*error.abs
+            confidence_sum += pred[:confidence]
+          end
+        end
+        if pred[:database_activities].empty?
+          warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+          $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+        end
+      end
+      R.assign "measurement", measured_values
+      R.assign "prediction", predicted_values
+      R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
+      r = R.eval("r").to_ruby
+
+      self.mae = self.mae/predictions.size
+      self.weighted_mae = self.weighted_mae/confidence_sum
+      self.rmse = Math.sqrt(self.rmse/predictions.size)
+      self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum)
+      self.r_squared = r**2
+      self.finished_at = Time.now
+      save
+      $logger.debug "R^2 #{r**2}"
+      $logger.debug "RMSE #{rmse}"
+      $logger.debug "MAE #{mae}"
+    end
+
+    def correlation_plot
+      unless correlation_plot_id
+        tmpfile = "/tmp/#{id.to_s}_correlation.svg"
+        predicted_values = []
+        measured_values = []
+        predictions.each do |pred|
+          pred[:database_activities].each do |activity|
+            if pred[:value]
+              predicted_values << pred[:value]
+              measured_values << activity
+            end
+          end
+        end
+        attributes = Model::Lazar.find(self.model_id).attributes
+        attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
+        attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
+        R.assign "measurement", measured_values
+        R.assign "prediction", predicted_values
+        R.eval "all = c(-log(measurement),-log(prediction))"
+        R.eval "range = c(min(all), max(all))"
+        R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
+        R.eval "image = image + geom_abline(intercept=0, slope=1)"
+        R.eval "ggsave(file='#{tmpfile}', plot=image)"
+        file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg")
+        plot_id = $gridfs.insert_one(file)
+        update(:correlation_plot_id => plot_id)
+      end
+      $gridfs.find_one(_id: correlation_plot_id).data
+    end
+  end
+
+end
diff --git a/test/validation.rb b/test/validation.rb
index 7de944c..95f9bc0 100644
--- a/test/validation.rb
+++ b/test/validation.rb
@@ -128,4 +128,29 @@ class ValidationTest < MiniTest::Test
     p cv
   end
 
+  def test_classification_loo_validation
+    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
+    model = Model::LazarClassification.create dataset
+    loo = ClassificationLeaveOneOutValidation.create model
+    assert_equal 14, loo.nr_unpredicted
+    refute_empty loo.confusion_matrix
+    assert loo.accuracy > 0.77
+    assert loo.weighted_accuracy > 0.85
+    assert loo.accuracy < loo.weighted_accuracy
+  end
+
+  def test_regression_loo_validation
+    dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
+    model = Model::LazarRegression.create dataset
+    loo = RegressionLeaveOneOutValidation.create model
+    assert_equal 11, loo.nr_unpredicted
+    assert loo.weighted_mae < loo.mae
+    assert loo.r_squared > 0.34
+    #assert_equal 14, loo.nr_unpredicted
+    #p loo.confusion_matrix
+    #p loo.accuracy
+    #File.open("tmp.svg","w+"){|f| f.puts loo.correlation_plot}
+    #`inkview tmp.svg`
+  end
+
 end
-- 
cgit v1.2.3