From 6bde559981fa11ffd265af708956f9d4ee6c9a89 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 8 Oct 2015 10:32:31 +0200
Subject: crossvalidation plots, original classification confidence

---
 lib/classification.rb  |   4 +-
 lib/crossvalidation.rb | 111 ++++++++++++++++++++++++++-----------------------
 lib/lazar.rb           |   3 ++
 lib/model.rb           |   4 +-
 lib/overwrite.rb       |  10 +++++
 lib/regression.rb      |   8 ++++
 lib/validation.rb      |   4 +-
 test/compound.rb       |  13 ++++++
 test/setup.rb          |   4 +-
 test/validation.rb     |  36 ++++++++++++----
 10 files changed, 129 insertions(+), 68 deletions(-)

diff --git a/lib/classification.rb b/lib/classification.rb
index 0a32126..b4b2e59 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -11,7 +11,7 @@ module OpenTox
         confidence = 0.0
         neighbors.each do |row|
           n,sim,acts = row
-          confidence = sim if sim > confidence # distance to nearest neighbor
+          #confidence = sim if sim > confidence # distance to nearest neighbor
           acts.each do |act|
             weighted_sum[act] ||= 0
             weighted_sum[act] += sim
@@ -24,7 +24,7 @@ module OpenTox
           sim_sum = weighted_sum[weighted_sum.keys[0]]
           sim_sum -= weighted_sum[weighted_sum.keys[1]]
           sim_sum > 0 ? prediction = weighted_sum.keys[0] : prediction = weighted_sum.keys[1] 
-          #confidence = (sim_sum/neighbors.size).abs 
+          confidence = (sim_sum/neighbors.size).abs 
           return {:value => prediction,:confidence => confidence}
         else
           bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted.sum.keys}'"
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 6dc8d7f..cbffb7c 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -52,7 +52,7 @@ module OpenTox
       cv.update_attributes(
         nr_instances: nr_instances,
         nr_unpredicted: nr_unpredicted,
-        predictions: predictions
+        predictions: predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
       )
       $logger.debug "Nr unpredicted: #{nr_unpredicted}"
       cv.statistics
@@ -69,6 +69,7 @@ module OpenTox
     field :weighted_accuracy, type: Float
     field :true_rate, type: Hash
     field :predictivity, type: Hash
+    field :confidence_plot_id, type: BSON::ObjectId
     # TODO auc, f-measure (usability??)
 
     def statistics
@@ -126,6 +127,30 @@ module OpenTox
       $logger.debug "Accuracy #{accuracy}"
     end
 
+    def confidence_plot
+      tmpfile = "/tmp/#{id.to_s}_confidence.svg"
+      accuracies = []
+      confidences = []
+      correct_predictions = 0
+      incorrect_predictions = 0
+      predictions.each do |p|
+        if p[1] and p[2]
+          p[1] == p [2] ? correct_predictions += 1 : incorrect_predictions += 1
+          accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
+          confidences << p[3]
+
+        end
+      end
+      R.assign "accuracy", accuracies
+      R.assign "confidence", confidences
+      R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
+      R.eval "ggsave(file='#{tmpfile}', plot=image)"
+      file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
+      plot_id = $gridfs.insert_one(file)
+      update(:confidence_plot_id => plot_id)
+      $gridfs.find_one(_id: confidence_plot_id).data
+    end
+
     #Average area under roc  0.646
     #Area under roc  0.646
     #F measure carcinogen: 0.769, noncarcinogen: 0.348
@@ -176,16 +201,6 @@ module OpenTox
       weighted_mae = weighted_mae/confidence_sum
       rmse = Math.sqrt(rmse/predictions.size)
       weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
-      # TODO check!!
-=begin
-      predictions.sort! do |a,b|
-        relative_error_a = (a[1]-a[2]).abs/a[1].to_f
-        relative_error_a = 1/relative_error_a if relative_error_a < 1
-        relative_error_b = (b[1]-b[2]).abs/b[1].to_f
-        relative_error_b = 1/relative_error_b if relative_error_b < 1
-        [relative_error_b,b[3]] <=> [relative_error_a,a[3]]
-      end
-=end
       update_attributes(
         mae: mae,
         rmse: rmse,
@@ -201,44 +216,46 @@ module OpenTox
 
     def misclassifications n=nil
       #n = predictions.size unless n
-      n = 20 unless n
+      n ||= 10 
       model = Model::Lazar.find(self.model_id)
       training_dataset = Dataset.find(model.training_dataset_id)
       prediction_feature = training_dataset.features.first
-      predictions[0..n-1].collect do |p|
-        compound = Compound.find(p[0])
-        neighbors = compound.neighbors.collect do |n|
-          neighbor = Compound.find(n[0])
-          values = training_dataset.values(neighbor,prediction_feature)
-          { :smiles => neighbor.smiles, :fingerprint => neighbor.fp4.collect{|id| Smarts.find(id).name},:similarity => n[1], :measurements => values}
+      predictions.collect do |p|
+        unless p.include? nil
+          compound = Compound.find(p[0])
+          neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters)
+          neighbors.collect! do |n|
+            neighbor = Compound.find(n[0])
+            values = training_dataset.values(neighbor,prediction_feature)
+            { :smiles => neighbor.smiles, :similarity => n[1], :measurements => values}
+          end
+          {
+            :smiles => compound.smiles, 
+            #:fingerprint => compound.fp4.collect{|id|  Smarts.find(id).name},
+            :measured => p[1],
+            :predicted => p[2],
+            #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs,
+            :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs,
+            :relative_error => (p[1]-p[2]).abs/p[1],
+            :confidence => p[3],
+            :neighbors => neighbors
+          }
         end
-        {
-          :smiles => compound.smiles, 
-          :fingerprint => compound.fp4.collect{|id|  Smarts.find(id).name},
-          :measured => p[1],
-          :predicted => p[2],
-          :relative_error => (p[1]-p[2]).abs/p[1].to_f,
-          :confidence => p[3],
-          :neighbors => neighbors
-        }
-      end
+      end.compact.sort{|a,b| p a; b[:relative_error] <=> a[:relative_error]}[0..n-1]
     end
 
     def confidence_plot
       tmpfile = "/tmp/#{id.to_s}_confidence.svg"
-      sorted_predictions = predictions.sort{|a,b| b[3]<=>a[3]}.collect{|p| [(Math.log10(p[1])-Math.log10(p[2]))**2,p[3]]}
+      sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact
       R.assign "error", sorted_predictions.collect{|p| p[0]}
-      #R.assign "p", predictions.collect{|p| p[2]}
-      R.assign "confidence", predictions.collect{|p| p[2]}
-      #R.eval "diff = log(m)-log(p)"
-      R.eval "library(ggplot2)"
-      R.eval "svg(filename='#{tmpfile}')"
-      R.eval "image = qplot(confidence,error)"#,main='#{self.name}',asp=1,xlim=range, ylim=range)"
+      R.assign "confidence", sorted_predictions.collect{|p| p[1]}
+      # TODO fix axis names
+      R.eval "image = qplot(confidence,error)"
+      R.eval "image = image + stat_smooth(method='lm', se=FALSE)"
       R.eval "ggsave(file='#{tmpfile}', plot=image)"
-        R.eval "dev.off()"
-        file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
-        plot_id = $gridfs.insert_one(file)
-        update(:confidence_plot_id => plot_id)
+      file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
+      plot_id = $gridfs.insert_one(file)
+      update(:confidence_plot_id => plot_id)
       $gridfs.find_one(_id: confidence_plot_id).data
     end
 
@@ -250,29 +267,17 @@ module OpenTox
         attributes = Model::Lazar.find(self.model_id).attributes
         attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
         attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
-        p "'"+attributes
-        R.eval "library(ggplot2)"
-        R.eval "library(grid)"
-        R.eval "library(gridExtra)"
         R.assign "measurement", x
         R.assign "prediction", y
-        #R.eval "error <- log(Measurement)-log(Prediction)"
-        #R.eval "rmse <- sqrt(mean(error^2, na.rm=T))"
-        #R.eval "mae <- mean(abs(error), na.rm=T)"
-        #R.eval "r <- cor(-log(prediction),-log(measurement))"
-        R.eval "svg(filename='#{tmpfile}')"
         R.eval "all = c(-log(measurement),-log(prediction))"
         R.eval "range = c(min(all), max(all))"
         R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
-        R.eval "image = image + geom_abline(intercept=0, slope=1) + stat_smooth(method='lm', se=FALSE)"
-        R.eval "text = textGrob(paste('RMSE: ', '#{rmse.round(2)},','MAE:','#{mae.round(2)},','r^2: ','#{r_squared.round(2)}','\n\n','#{attributes}'),just=c('left','top'),check.overlap = T)"
-        R.eval "grid.arrange(image, text, ncol=2)"
-        R.eval "dev.off()"
+        R.eval "image = image + geom_abline(intercept=0, slope=1)"
+        R.eval "ggsave(file='#{tmpfile}', plot=image)"
         file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg")
         plot_id = $gridfs.insert_one(file)
         update(:correlation_plot_id => plot_id)
       end
-      p correlation_plot_id
       $gridfs.find_one(_id: correlation_plot_id).data
     end
   end
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 89b50f7..f801062 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -21,6 +21,9 @@ $gridfs = $mongo.database.fs
 
 # R setup
 R = Rserve::Connection.new
+R.eval "library(ggplot2)"
+R.eval "library(grid)"
+R.eval "library(gridExtra)"
 
 # Logger setup
 STDOUT.sync = true # for redirection, etc see http://stackoverflow.com/questions/8549443/why-doesnt-logger-output-to-stdout-get-redirected-to-files
diff --git a/lib/model.rb b/lib/model.rb
index cd88e0c..98433d0 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -48,7 +48,7 @@ module OpenTox
         self
       end
 
-      def predict object
+      def predict object, use_database_values=true
 
         t = Time.now
         at = Time.now
@@ -75,7 +75,7 @@ module OpenTox
         compounds.each_with_index do |compound,c|
           t = Time.new
           database_activities = training_dataset.values(compound,prediction_feature)
-          if database_activities and !database_activities.empty?
+          if use_database_values and database_activities and !database_activities.empty?
             database_activities = database_activities.first if database_activities.size == 1
             predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."}
             next
diff --git a/lib/overwrite.rb b/lib/overwrite.rb
index be90c56..c92ad2b 100644
--- a/lib/overwrite.rb
+++ b/lib/overwrite.rb
@@ -96,6 +96,16 @@ class Array
     self.inject{ |sum, el| sum + el }.to_f / self.size
   end
 
+  def sample_variance
+    m = self.mean
+    sum = self.inject(0){|accum, i| accum +(i-m)**2 }
+    sum/(self.length - 1).to_f
+  end
+
+  def standard_deviation
+    Math.sqrt(self.sample_variance)
+  end
+
 end
 
 module URI
diff --git a/lib/regression.rb b/lib/regression.rb
index 9062a9e..868c25f 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -24,16 +24,24 @@ module OpenTox
         sim_sum = 0.0
         confidence = 0.0
         neighbors = params[:neighbors]
+        activities = []
         neighbors.each do |row|
           n,sim,acts = row
           confidence = sim if sim > confidence # distance to nearest neighbor
           # TODO add LOO errors
           acts.each do |act|
             weighted_sum += sim*Math.log10(act)
+            activities << act
             sim_sum += sim
           end
         end
+        #R.assign "activities", activities
+        #R.eval "cv = cv(activities)"
+        #confidence /= activities.standard_deviation#/activities.mean
         #confidence = sim_sum*neighbors.size.to_f/params[:training_dataset_size]
+        #confidence = sim_sum/neighbors.size.to_f
+        #confidence = neighbors.size.to_f
+        confidence = 0 if confidence.nan?
         sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum)
         {:value => prediction,:confidence => confidence}
       end
diff --git a/lib/validation.rb b/lib/validation.rb
index 9eebef8..c52ffc0 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -39,7 +39,7 @@ module OpenTox
           activity = activities[i]
           prediction = de.first
           confidence = de[1]
-          predictions << [prediction_dataset.compound_ids[i], activity, prediction,confidence]
+          predictions << [prediction_dataset.compound_ids[i], activity, prediction, de[1]]
         else
           nr_unpredicted += 1
         end
@@ -50,7 +50,7 @@ module OpenTox
         :test_dataset_id => test_set.id,
         :nr_instances => test_set.compound_ids.size,
         :nr_unpredicted => nr_unpredicted,
-        :predictions => predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
+        :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence
       )
       validation.crossvalidation_id = crossvalidation.id if crossvalidation
       validation.save
diff --git a/test/compound.rb b/test/compound.rb
index 036f384..24356d3 100644
--- a/test/compound.rb
+++ b/test/compound.rb
@@ -160,4 +160,17 @@ print c.sdf
       end
     end
   end
+
+  def test_fingerprint_db_neighbors
+    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.csv")
+    [
+      "CC(=O)CC(C)C#N",
+      "CC(=O)CC(C)C",
+      "C(=O)CC(C)C#N",
+    ].each do |smi|
+      c = OpenTox::Compound.from_smiles smi
+      neighbors = c.db_neighbors(:training_dataset_id => training_dataset.id, :min_sim => 0.2)
+      p neighbors
+    end
+  end
 end
diff --git a/test/setup.rb b/test/setup.rb
index 3dad683..ba1b7af 100644
--- a/test/setup.rb
+++ b/test/setup.rb
@@ -3,5 +3,7 @@ require_relative '../lib/lazar.rb'
 include OpenTox
 TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
 DATA_DIR ||= File.join(TEST_DIR,"data")
+Mongoid.configure.connect_to("test")
+$mongo = Mongo::Client.new('mongodb://127.0.0.1:27017/test')
 #$mongo.database.drop
-#$gridfs = $mongo.database.fs # recreate GridFS indexes
+$gridfs = $mongo.database.fs
diff --git a/test/validation.rb b/test/validation.rb
index af5ea60..6764a32 100644
--- a/test/validation.rb
+++ b/test/validation.rb
@@ -16,11 +16,35 @@ class ValidationTest < MiniTest::Test
     model = Model::LazarClassification.create dataset#, features
     cv = ClassificationCrossValidation.create model
     assert cv.accuracy > 0.7
+    File.open("tmp.svg","w+"){|f| f.puts cv.confidence_plot}
+    `inkview tmp.svg`
     p cv.nr_unpredicted
     p cv.accuracy
     #assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy should be larger than unweighted accuracy."
   end
 
+  def test_default_regression_crossvalidation
+    dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
+    model = Model::LazarRegression.create dataset
+    cv = RegressionCrossValidation.create model
+    #cv = RegressionCrossValidation.find '561503262b72ed54fd000001'
+    p cv.id
+    File.open("tmp.svg","w+"){|f| f.puts cv.correlation_plot}
+    `inkview tmp.svg`
+    File.open("tmp.svg","w+"){|f| f.puts cv.confidence_plot}
+    `inkview tmp.svg`
+    
+    #puts cv.misclassifications.to_yaml
+    p cv.rmse
+    p cv.weighted_rmse 
+    assert cv.rmse < 1.5, "RMSE > 1.5"
+    #assert cv.weighted_rmse < cv.rmse, "Weighted RMSE (#{cv.weighted_rmse}) larger than unweighted RMSE(#{cv.rmse}) "
+    p cv.mae 
+    p cv.weighted_mae 
+    assert cv.mae < 1
+    #assert cv.weighted_mae < cv.mae
+  end
+
   def test_regression_crossvalidation
     dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
     #dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.csv"
@@ -41,13 +65,8 @@ class ValidationTest < MiniTest::Test
       refute_equal params[:neighbor_algorithm_parameters][:training_dataset_id], model[:neighbor_algorithm_parameters][:training_dataset_id]
     end
 
-    #`inkview #{cv.plot}`
-    #puts JSON.pretty_generate(cv.misclassifications)#.collect{|l| l.join ", "}.join "\n"
-    #`inkview #{cv.plot}`
-    assert cv.rmse < 30, "RMSE > 30"
-    #assert cv.weighted_rmse < cv.rmse, "Weighted RMSE (#{cv.weighted_rmse}) larger than unweighted RMSE(#{cv.rmse}) "
-    assert cv.mae < 12
-    #assert cv.weighted_mae < cv.mae
+    assert cv.rmse < 1.5, "RMSE > 30"
+    assert cv.mae < 1
   end
 
   def test_repeated_crossvalidation
@@ -55,7 +74,8 @@ class ValidationTest < MiniTest::Test
     model = Model::LazarClassification.create dataset
     repeated_cv = RepeatedCrossValidation.create model
     repeated_cv.crossvalidations.each do |cv|
-      assert cv.accuracy > 0.7
+      assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
+      assert_operator cv.weighted_accuracy, :>, cv.accuracy
     end
   end
 
-- 
cgit v1.2.3