From 0b686f924a42105f2516aea44c27b6d3f75e1672 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Tue, 20 Oct 2020 20:42:54 +0200
Subject: Summary table of PA predictions

---
 Makefile                             |   5 +-
 mutagenicity.md                      |  38 +++++--
 pyrrolizidine-alkaloids/summary.yaml |  24 +++--
 scripts/pa-summary-table.rb          |  18 ++++
 scripts/pa-summary.rb                | 190 +++++++++++++++++++++++++++++++++++
 tables/pa-summary.csv                |  12 +++
 6 files changed, 265 insertions(+), 22 deletions(-)
 create mode 100755 scripts/pa-summary-table.rb
 create mode 100755 scripts/pa-summary.rb
 create mode 100644 tables/pa-summary.csv

diff --git a/Makefile b/Makefile
index fb5eb30..c63cdf5 100644
--- a/Makefile
+++ b/Makefile
@@ -36,7 +36,7 @@ PA_PREDICTIONS = $(PA_LAZAR_DIR)/pa-mp2d-predictions.csv $(PA_LAZAR_DIR)/pa-pade
 
 # manuscript
 
-TABLES = tables/lazar-summary.csv tables/r-summary.csv tables/tensorflow-summary.csv tables/pa-tab.tex
+TABLES = tables/lazar-summary.csv tables/r-summary.csv tables/tensorflow-summary.csv tables/pa-tab.tex tables/pa-summary.csv
 FIGURES = figures/roc.png figures/tsne-mp2d.png figures/tsne-padel.png
 
 all: $(TABLES) $(FIGURES) $(CV_SUMMARY) mutagenicity.pdf 
@@ -67,6 +67,9 @@ figures/roc.csv: $(CV_SUMMARY)
 
 # tables
 
+tables/pa-summary.csv: $(PA_SUMMARY) scripts/pa-summary-table.rb # summary stays first (it is the script input); the script is listed so that editing it rebuilds the table
+	scripts/pa-summary-table.rb $< > $@
+
 tables/pa-tab.tex: tables/pa-table.csv
 	scripts/pa-tex-table.rb $< > $@
 
diff --git a/mutagenicity.md b/mutagenicity.md
index c278142..d05cbc7 100644
--- a/mutagenicity.md
+++ b/mutagenicity.md
@@ -478,7 +478,9 @@ Results
 10-fold crossvalidations
 ------------------------
 
-Crossvalidation results are summarized in the following tables: @tbl:lazar shows `lazar` results with MolPrint2D and PaDEL descriptors, @tbl:R R results and @tbl:tensorflow Tensorflow results.
+Crossvalidation results are summarized in the following tables: @tbl:lazar
+shows `lazar` results with MolPrint2D and PaDEL descriptors, @tbl:R R results
+and @tbl:tensorflow Tensorflow results.
 
 
 ```{#tbl:lazar .table file="tables/lazar-summary.csv" caption="Summary of lazar crossvalidation results (all/high confidence predictions)"}
@@ -494,25 +496,41 @@ Crossvalidation results are summarized in the following tables: @tbl:lazar shows
 
 ![ROC plot of crossvalidation results.](figures/roc.png){#fig:roc}
 
-Confusion matrices for all models are available from the git repository http://git.in-silico.ch/mutagenicity-paper/10-fold-crossvalidations/confusion-matrices/, individual predictions can be found in 
-http://git.in-silico.ch/mutagenicity-paper/10-fold-crossvalidations/predictions/.
+Confusion matrices for all models are available from the git repository
+https://git.in-silico.ch/mutagenicity-paper/10-fold-crossvalidations/confusion-matrices/,
+individual predictions can be found in
+https://git.in-silico.ch/mutagenicity-paper/10-fold-crossvalidations/predictions/.
 
-The most accurate crossvalidation predictions have been obtained with standard `lazar` models using MolPrint2D descriptors ({{cv.lazar-high-confidence.acc}} for predictions with high confidence, {{cv.lazar-all.acc}} for all predictions). Models utilizing PaDEL descriptors have generally lower accuracies ranging from {{cv.R-DL.acc}} (R deep learning) to {{cv.R-RF.acc}} (R/Tensorflow random forests). Sensitivity and specificity is generally well balanced with the exception of `lazar`-PaDEL (low sensitivity) and R deep learning (low specificity) models.
+The most accurate crossvalidation predictions have been obtained with standard
+`lazar` models using MolPrint2D descriptors ({{cv.lazar-high-confidence.acc}}
+for predictions with high confidence, {{cv.lazar-all.acc}} for all
+predictions). Models utilizing PaDEL descriptors have generally lower
+accuracies ranging from {{cv.R-DL.acc}} (R deep learning) to {{cv.R-RF.acc}}
+(R/Tensorflow random forests). Sensitivity and specificity are generally well
+balanced with the exception of `lazar`-PaDEL (low sensitivity) and R deep
+learning (low specificity) models.
 
 Pyrrolizidine alkaloid mutagenicity predictions 
 -----------------------------------------------
 
-Mutagenicity predictions from all investigated models for 602 pyrrolizidine alkaloids are summarized in Table 4. 
+Mutagenicity predictions from all investigated models for 602 pyrrolizidine
+alkaloids (PAs) are summarized in Table 4. A CSV table with all predictions can be
+downloaded from https://git.in-silico.ch/mutagenicity-paper/tables/pa-table.csv.
 
 **TODO** **Verena und Philipp** Koennt Ihr bitte stichprobenweise die Tabelle ueberpruefen, mir verrutscht bei der Auswertung immer gerne etwas.
 
 \input{tables/pa-tab.tex}
 
-Training data and 
-pyrrolizidine alkaloids were visualised with t-distributed stochastic neighbor embedding (t-SNE, @Maaten2008)
-for MolPrint2D and PaDEL descriptors.  t-SNA maps each high-dimensional object
-(chemical) to a two-dimensional point. Similar objects are represented by
-nearby points and dissimilar objects are represented by distant points.
+```{#tbl:pa-summary .table file="tables/pa-summary.csv" caption="Summary of pyrrolizidine alkaloid mutagenicity predictions"}
+```
+
+For the visualisation of the position of pyrrolizidine alkaloids with respect to
+the training data set we have applied t-distributed stochastic neighbor
+embedding (t-SNE, @Maaten2008) for MolPrint2D and PaDEL descriptors.  t-SNE
+maps each high-dimensional object (chemical) to a two-dimensional point,
+maintaining the high-dimensional distances of the objects. Similar objects are
+represented by nearby points and dissimilar objects are represented by distant
+points.
 
 @fig:tsne-mp2d shows the t-SNE of pyrrolizidine alkaloids (PA) and the mutagenicity training data in MP2D space (Tanimoto/Jaccard similarity).
 
diff --git a/pyrrolizidine-alkaloids/summary.yaml b/pyrrolizidine-alkaloids/summary.yaml
index 66c5030..9c3a39f 100644
--- a/pyrrolizidine-alkaloids/summary.yaml
+++ b/pyrrolizidine-alkaloids/summary.yaml
@@ -8,7 +8,7 @@
         :mut: 111
         :non_mut: 449
         :n_perc: 93
-        :mut_perc: 19
+        :mut_perc: 20
         :non_mut_perc: 80
       :high_confidence:
         :n: 301
@@ -16,41 +16,43 @@
         :non_mut: 225
         :n_perc: 50
         :mut_perc: 25
-        :non_mut_perc: 74
+        :non_mut_perc: 75
     :padel:
       :all:
         :n: 600
         :mut: 83
         :non_mut: 517
-        :n_perc: 99
-        :mut_perc: 13
+        :n_perc: 100
+        :mut_perc: 14
         :non_mut_perc: 86
       :high_confidence:
         :n: 0
         :mut: 0
         :non_mut: 0
         :n_perc: 0
+        :mut_perc: 0
+        :non_mut_perc: 0
   :r:
     :rf:
       :n: 602
       :mut: 18
       :non_mut: 584
       :n_perc: 100
-      :mut_perc: 2
+      :mut_perc: 3
       :non_mut_perc: 97
     :svm:
       :n: 602
       :mut: 11
       :non_mut: 591
       :n_perc: 100
-      :mut_perc: 1
+      :mut_perc: 2
       :non_mut_perc: 98
     :dl:
       :n: 602
       :mut: 521
       :non_mut: 81
       :n_perc: 100
-      :mut_perc: 86
+      :mut_perc: 87
       :non_mut_perc: 13
   :tf:
     :rf:
@@ -58,21 +60,21 @@
       :mut: 186
       :non_mut: 416
       :n_perc: 100
-      :mut_perc: 30
+      :mut_perc: 31
       :non_mut_perc: 69
     :lr_sgd:
       :n: 602
       :mut: 286
       :non_mut: 316
       :n_perc: 100
-      :mut_perc: 47
+      :mut_perc: 48
       :non_mut_perc: 52
     :lr_scikit:
       :n: 602
       :mut: 395
       :non_mut: 207
       :n_perc: 100
-      :mut_perc: 65
+      :mut_perc: 66
       :non_mut_perc: 34
     :nn:
       :n: 602
@@ -80,4 +82,4 @@
       :non_mut: 307
       :n_perc: 100
       :mut_perc: 49
-      :non_mut_perc: 50
+      :non_mut_perc: 51
diff --git a/scripts/pa-summary-table.rb b/scripts/pa-summary-table.rb
new file mode 100755
index 0000000..48546bd
--- /dev/null
+++ b/scripts/pa-summary-table.rb
@@ -0,0 +1,18 @@
+#!/usr/bin/env ruby
+require 'yaml'
+data = YAML.load_file(ARGV[0])
+puts "Model,Nr.predictions,mutagenic,non-mutagenic"
+puts "lazar-MP2D (all),#{data[:pa][:lazar][:mp2d][:all][:n]} (#{data[:pa][:lazar][:mp2d][:all][:n_perc]} %),#{data[:pa][:lazar][:mp2d][:all][:mut]} (#{data[:pa][:lazar][:mp2d][:all][:mut_perc]} %),#{data[:pa][:lazar][:mp2d][:all][:non_mut]} (#{data[:pa][:lazar][:mp2d][:all][:non_mut_perc]} %)"
+puts "lazar-MP2D (high-confidence),#{data[:pa][:lazar][:mp2d][:high_confidence][:n]} (#{data[:pa][:lazar][:mp2d][:high_confidence][:n_perc]} %),#{data[:pa][:lazar][:mp2d][:high_confidence][:mut]} (#{data[:pa][:lazar][:mp2d][:high_confidence][:mut_perc]} %),#{data[:pa][:lazar][:mp2d][:high_confidence][:non_mut]} (#{data[:pa][:lazar][:mp2d][:high_confidence][:non_mut_perc]} %)"
+
+puts "lazar-PaDEL (all),#{data[:pa][:lazar][:padel][:all][:n]} (#{data[:pa][:lazar][:padel][:all][:n_perc]} %),#{data[:pa][:lazar][:padel][:all][:mut]} (#{data[:pa][:lazar][:padel][:all][:mut_perc]} %),#{data[:pa][:lazar][:padel][:all][:non_mut]} (#{data[:pa][:lazar][:padel][:all][:non_mut_perc]} %)"
+puts "lazar-PaDEL (high-confidence),#{data[:pa][:lazar][:padel][:high_confidence][:n]} (#{data[:pa][:lazar][:padel][:high_confidence][:n_perc]} %),#{data[:pa][:lazar][:padel][:high_confidence][:mut]} (#{data[:pa][:lazar][:padel][:high_confidence][:mut_perc]} %),#{data[:pa][:lazar][:padel][:high_confidence][:non_mut]} (#{data[:pa][:lazar][:padel][:high_confidence][:non_mut_perc]} %)"
+
+puts "R-RF,#{data[:pa][:r][:rf][:n]} (#{data[:pa][:r][:rf][:n_perc]} %),#{data[:pa][:r][:rf][:mut]} (#{data[:pa][:r][:rf][:mut_perc]} %),#{data[:pa][:r][:rf][:non_mut]} (#{data[:pa][:r][:rf][:non_mut_perc]} %)"
+puts "R-SVM,#{data[:pa][:r][:svm][:n]} (#{data[:pa][:r][:svm][:n_perc]} %),#{data[:pa][:r][:svm][:mut]} (#{data[:pa][:r][:svm][:mut_perc]} %),#{data[:pa][:r][:svm][:non_mut]} (#{data[:pa][:r][:svm][:non_mut_perc]} %)"
+puts "R-DL,#{data[:pa][:r][:dl][:n]} (#{data[:pa][:r][:dl][:n_perc]} %),#{data[:pa][:r][:dl][:mut]} (#{data[:pa][:r][:dl][:mut_perc]} %),#{data[:pa][:r][:dl][:non_mut]} (#{data[:pa][:r][:dl][:non_mut_perc]} %)"
+
+puts "Tensorflow-RF,#{data[:pa][:tf][:rf][:n]} (#{data[:pa][:tf][:rf][:n_perc]} %),#{data[:pa][:tf][:rf][:mut]} (#{data[:pa][:tf][:rf][:mut_perc]} %),#{data[:pa][:tf][:rf][:non_mut]} (#{data[:pa][:tf][:rf][:non_mut_perc]} %)"
+puts "Tensorflow-LR-sgd,#{data[:pa][:tf][:lr_sgd][:n]} (#{data[:pa][:tf][:lr_sgd][:n_perc]} %),#{data[:pa][:tf][:lr_sgd][:mut]} (#{data[:pa][:tf][:lr_sgd][:mut_perc]} %),#{data[:pa][:tf][:lr_sgd][:non_mut]} (#{data[:pa][:tf][:lr_sgd][:non_mut_perc]} %)"
+puts "Tensorflow-LR-scikit,#{data[:pa][:tf][:lr_scikit][:n]} (#{data[:pa][:tf][:lr_scikit][:n_perc]} %),#{data[:pa][:tf][:lr_scikit][:mut]} (#{data[:pa][:tf][:lr_scikit][:mut_perc]} %),#{data[:pa][:tf][:lr_scikit][:non_mut]} (#{data[:pa][:tf][:lr_scikit][:non_mut_perc]} %)"
+puts "Tensorflow-NN,#{data[:pa][:tf][:nn][:n]} (#{data[:pa][:tf][:nn][:n_perc]} %),#{data[:pa][:tf][:nn][:mut]} (#{data[:pa][:tf][:nn][:mut_perc]} %),#{data[:pa][:tf][:nn][:non_mut]} (#{data[:pa][:tf][:nn][:non_mut_perc]} %)"
diff --git a/scripts/pa-summary.rb b/scripts/pa-summary.rb
new file mode 100755
index 0000000..0715a6c
--- /dev/null
+++ b/scripts/pa-summary.rb
@@ -0,0 +1,190 @@
+#!/usr/bin/env ruby
+require 'yaml'
+
+summary = { 
+  :n => 0,
+  :lazar => {
+    :mp2d => {
+      :all => {
+        :n => 0,
+        :mut => 0,
+        :non_mut => 0
+      },
+      :high_confidence => {
+        :n => 0,
+        :mut => 0,
+        :non_mut => 0
+      }
+    },
+    :padel => {
+      :all => {
+        :n => 0,
+        :mut => 0,
+        :non_mut => 0
+      },
+      :high_confidence => {
+        :n => 0,
+        :mut => 0,
+        :non_mut => 0
+      }
+    },
+  },
+  :r => {
+    :rf => {
+      :n => 0,
+      :mut => 0,
+      :non_mut => 0
+    },
+    :svm => {
+      :n => 0,
+      :mut => 0,
+      :non_mut => 0
+    },
+    :dl => {
+      :n => 0,
+      :mut => 0,
+      :non_mut => 0
+    },
+  },
+  :tf => {
+    :rf => {
+      :n => 0,
+      :mut => 0,
+      :non_mut => 0
+    },
+    :lr_sgd => {
+      :n => 0,
+      :mut => 0,
+      :non_mut => 0
+    },
+    :lr_scikit => {
+      :n => 0,
+      :mut => 0,
+      :non_mut => 0
+    },
+    :nn => {
+      :n => 0,
+      :mut => 0,
+      :non_mut => 0
+    },
+  },
+}
+
+n = 0
+File.read(ARGV[0]).each_line do |l|
+  unless l.match("SMILES")
+    id,cid,smi,cansmi,exp,lazar_MP2D,lazar_MP2D_high_confidence,lazar_PaDEL,lazar_PaDEL_high_confidence,r_DL,r_RF,r_SVM,tf_lr_sgd,tf_lr_scikit,tf_NN,tf_RF = l.chomp.split(",")
+
+    if lazar_MP2D == "1"
+      summary[:lazar][:mp2d][:all][:n] += 1
+      summary[:lazar][:mp2d][:all][:mut] += 1
+      if lazar_MP2D_high_confidence == "T" 
+        summary[:lazar][:mp2d][:high_confidence][:n] += 1
+        summary[:lazar][:mp2d][:high_confidence][:mut] += 1
+      end
+    elsif lazar_MP2D == "0"
+      summary[:lazar][:mp2d][:all][:n] += 1
+      summary[:lazar][:mp2d][:all][:non_mut] += 1
+      if lazar_MP2D_high_confidence == "T" 
+        summary[:lazar][:mp2d][:high_confidence][:n] += 1
+        summary[:lazar][:mp2d][:high_confidence][:non_mut] += 1
+      end
+    end
+    if lazar_PaDEL == "1"
+      summary[:lazar][:padel][:all][:n] += 1
+      summary[:lazar][:padel][:all][:mut] += 1
+      if lazar_PaDEL_high_confidence == "T" 
+        summary[:lazar][:padel][:high_confidence][:n] += 1
+        summary[:lazar][:padel][:high_confidence][:mut] += 1
+      end
+    elsif lazar_PaDEL == "0"
+      summary[:lazar][:padel][:all][:n] += 1
+      summary[:lazar][:padel][:all][:non_mut] += 1
+      if lazar_PaDEL_high_confidence == "T" 
+        summary[:lazar][:padel][:high_confidence][:n] += 1
+        summary[:lazar][:padel][:high_confidence][:non_mut] += 1
+      end
+    end
+    if r_DL == "1"
+      summary[:r][:dl][:n] += 1
+      summary[:r][:dl][:mut] += 1
+    elsif r_DL == "0"
+      summary[:r][:dl][:n] += 1
+      summary[:r][:dl][:non_mut] += 1
+    end
+    if r_RF == "1"
+      summary[:r][:rf][:n] += 1
+      summary[:r][:rf][:mut] += 1
+    elsif r_RF == "0"
+      summary[:r][:rf][:n] += 1
+      summary[:r][:rf][:non_mut] += 1
+    end
+    if r_SVM == "1"
+      summary[:r][:svm][:n] += 1
+      summary[:r][:svm][:mut] += 1
+    elsif r_SVM == "0"
+      summary[:r][:svm][:n] += 1
+      summary[:r][:svm][:non_mut] += 1
+    end
+    if tf_lr_sgd == "1"
+      summary[:tf][:lr_sgd][:n] += 1
+      summary[:tf][:lr_sgd][:mut] += 1
+    elsif tf_lr_sgd == "0"
+      summary[:tf][:lr_sgd][:n] += 1
+      summary[:tf][:lr_sgd][:non_mut] += 1
+    end
+    if tf_lr_scikit == "1"
+      summary[:tf][:lr_scikit][:n] += 1
+      summary[:tf][:lr_scikit][:mut] += 1
+    elsif tf_lr_scikit == "0"
+      summary[:tf][:lr_scikit][:n] += 1
+      summary[:tf][:lr_scikit][:non_mut] += 1
+    end
+    if tf_RF == "1"
+      summary[:tf][:rf][:n] += 1
+      summary[:tf][:rf][:mut] += 1
+    elsif tf_RF == "0"
+      summary[:tf][:rf][:n] += 1
+      summary[:tf][:rf][:non_mut] += 1
+    end
+    if tf_NN == "1"
+      summary[:tf][:nn][:n] += 1
+      summary[:tf][:nn][:mut] += 1
+    elsif tf_NN == "0"
+      summary[:tf][:nn][:n] += 1
+      summary[:tf][:nn][:non_mut] += 1
+    end
+    summary[:n] += 1
+  end
+end
+summary[:lazar][:mp2d][:all][:n_perc] = (100.0*summary[:lazar][:mp2d][:all][:n]/summary[:n]).round
+summary[:lazar][:mp2d][:all][:mut_perc] = (100.0*summary[:lazar][:mp2d][:all][:mut]/summary[:lazar][:mp2d][:all][:n]).round
+summary[:lazar][:mp2d][:all][:non_mut_perc] = (100.0*summary[:lazar][:mp2d][:all][:non_mut]/summary[:lazar][:mp2d][:all][:n]).round
+summary[:lazar][:mp2d][:high_confidence][:n_perc] = (100.0*summary[:lazar][:mp2d][:high_confidence][:n]/summary[:n]).round
+summary[:lazar][:mp2d][:high_confidence][:mut_perc] = (100.0*summary[:lazar][:mp2d][:high_confidence][:mut]/summary[:lazar][:mp2d][:high_confidence][:n]).round
+summary[:lazar][:mp2d][:high_confidence][:non_mut_perc] = (100.0*summary[:lazar][:mp2d][:high_confidence][:non_mut]/summary[:lazar][:mp2d][:high_confidence][:n]).round
+summary[:lazar][:padel][:all][:n_perc] = (100.0*summary[:lazar][:padel][:all][:n]/summary[:n]).round
+summary[:lazar][:padel][:all][:mut_perc] = (100.0*summary[:lazar][:padel][:all][:mut]/summary[:lazar][:padel][:all][:n]).round
+summary[:lazar][:padel][:all][:non_mut_perc] = (100.0*summary[:lazar][:padel][:all][:non_mut]/summary[:lazar][:padel][:all][:n]).round
+summary[:lazar][:padel][:high_confidence][:n_perc] = (100.0*summary[:lazar][:padel][:high_confidence][:n]/summary[:n]).round
+if summary[:lazar][:padel][:high_confidence][:n] == 0
+  summary[:lazar][:padel][:high_confidence][:mut_perc] = 0
+  summary[:lazar][:padel][:high_confidence][:non_mut_perc] = 0
+else
+  summary[:lazar][:padel][:high_confidence][:mut_perc] = (100.0*summary[:lazar][:padel][:high_confidence][:mut]/summary[:lazar][:padel][:high_confidence][:n]).round
+  summary[:lazar][:padel][:high_confidence][:non_mut_perc] = (100.0*summary[:lazar][:padel][:high_confidence][:non_mut]/summary[:lazar][:padel][:high_confidence][:n]).round
+end
+
+[:rf,:svm,:dl].each do |a|
+  summary[:r][a][:n_perc] = (100.0*summary[:r][a][:n]/summary[:n]).round
+  summary[:r][a][:mut_perc] = (100.0*summary[:r][a][:mut]/summary[:r][a][:n]).round
+  summary[:r][a][:non_mut_perc] = (100.0*summary[:r][a][:non_mut]/summary[:r][a][:n]).round
+end
+
+[:rf,:lr_sgd,:lr_scikit,:nn].each do |a|
+  summary[:tf][a][:n_perc] = (100.0*summary[:tf][a][:n]/summary[:n]).round
+  summary[:tf][a][:mut_perc] = (100.0*summary[:tf][a][:mut]/summary[:tf][a][:n]).round
+  summary[:tf][a][:non_mut_perc] = (100.0*summary[:tf][a][:non_mut]/summary[:tf][a][:n]).round
+end
+summary = {:pa => summary}
+puts summary.to_yaml
diff --git a/tables/pa-summary.csv b/tables/pa-summary.csv
new file mode 100644
index 0000000..0bc0e97
--- /dev/null
+++ b/tables/pa-summary.csv
@@ -0,0 +1,12 @@
+Model,Nr.predictions,mutagenic,non-mutagenic
+lazar-MP2D (all),560 (93 %),111 (20 %),449 (80 %)
+lazar-MP2D (high-confidence),301 (50 %),76 (25 %),225 (75 %)
+lazar-PaDEL (all),600 (100 %),83 (14 %),517 (86 %)
+lazar-PaDEL (high-confidence),0 (0 %),0 (0 %),0 (0 %)
+R-RF,602 (100 %),18 (3 %),584 (97 %)
+R-SVM,602 (100 %),11 (2 %),591 (98 %)
+R-DL,602 (100 %),521 (87 %),81 (13 %)
+Tensorflow-RF,602 (100 %),186 (31 %),416 (69 %)
+Tensorflow-LR-sgd,602 (100 %),286 (48 %),316 (52 %)
+Tensorflow-LR-scikit,602 (100 %),395 (66 %),207 (34 %)
+Tensorflow-NN,602 (100 %),295 (49 %),307 (51 %)
-- 
cgit v1.2.3