From 2c3bc133700f7e1e1ea8d038d87da1f3095ed103 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Mon, 19 Oct 2020 23:54:19 +0200
Subject: PA prediction summary

---
 10-fold-crossvalidations/summary.yaml | 375 +++++++++++++++++-----------------
 Makefile                              |  23 ++-
 figures/roc.png                       | Bin 134226 -> 146544 bytes
 mutagenicity.md                       |  16 +-
 pyrrolizidine-alkaloids/summary.yaml  |  83 ++++++++
 scripts/confusion-matrix-summary.rb   |   2 +-
 scripts/summary2roc.rb                |   2 +-
 scripts/summary2table.rb              |   2 +-
 8 files changed, 301 insertions(+), 202 deletions(-)
 create mode 100644 pyrrolizidine-alkaloids/summary.yaml

diff --git a/10-fold-crossvalidations/summary.yaml b/10-fold-crossvalidations/summary.yaml
index c05db63..08c0b40 100644
--- a/10-fold-crossvalidations/summary.yaml
+++ b/10-fold-crossvalidations/summary.yaml
@@ -1,188 +1,189 @@
 ---
-lazar-all:
-  :tp: 3326
-  :fp: 833
-  :tn: 3039
-  :fn: 583
-  :n: 7781
-  :acc: 0.82
-  :tpr: 0.85
-  :fpr: 0.22
-  :tnr: 0.78
-  :ppv: 0.8
-  :npv: 0.84
-  :acc_perc: 82
-  :tpr_perc: 85
-  :tnr_perc: 78
-  :ppv_perc: 80
-  :npv_perc: 84
-lazar-high-confidence:
-  :tp: 2816
-  :fp: 571
-  :tn: 2138
-  :fn: 365
-  :n: 5890
-  :acc: 0.84
-  :tpr: 0.89
-  :fpr: 0.21
-  :tnr: 0.79
-  :ppv: 0.83
-  :npv: 0.85
-  :acc_perc: 84
-  :tpr_perc: 89
-  :tnr_perc: 79
-  :ppv_perc: 83
-  :npv_perc: 85
-lazar-padel-all:
-  :tp: 593
-  :fp: 466
-  :tn: 1777
-  :fn: 1253
-  :n: 4089
-  :acc: 0.58
-  :tpr: 0.32
-  :fpr: 0.21
-  :tnr: 0.79
-  :ppv: 0.56
-  :npv: 0.59
-  :acc_perc: 58
-  :tpr_perc: 32
-  :tnr_perc: 79
-  :ppv_perc: 56
-  :npv_perc: 59
-lazar-padel-high-confidence:
-  :tp: 593
-  :fp: 466
-  :tn: 1771
-  :fn: 1251
-  :n: 4081
-  :acc: 0.58
-  :tpr: 0.32
-  :fpr: 0.21
-  :tnr: 0.79
-  :ppv: 0.56
-  :npv: 0.59
-  :acc_perc: 58
-  :tpr_perc: 32
-  :tnr_perc: 79
-  :ppv_perc: 56
-  :npv_perc: 59
-R-RF:
-  :tp: 2259
-  :fp: 1173
-  :tn: 2897
-  :fn: 1741
-  :n: 8070
-  :acc: 0.64
-  :tpr: 0.56
-  :fpr: 0.29
-  :tnr: 0.71
-  :ppv: 0.66
-  :npv: 0.62
-  :acc_perc: 64
-  :tpr_perc: 56
-  :tnr_perc: 71
-  :ppv_perc: 66
-  :npv_perc: 62
-R-SVM:
-  :tp: 2243
-  :fp: 1353
-  :tn: 2717
-  :fn: 1757
-  :n: 8070
-  :acc: 0.61
-  :tpr: 0.56
-  :fpr: 0.33
-  :tnr: 0.67
-  :ppv: 0.62
-  :npv: 0.61
-  :acc_perc: 61
-  :tpr_perc: 56
-  :tnr_perc: 67
-  :ppv_perc: 62
-  :npv_perc: 61
-R-DL:
-  :tp: 3517
-  :fp: 3099
-  :tn: 971
-  :fn: 483
-  :n: 8070
-  :acc: 0.56
-  :tpr: 0.88
-  :fpr: 0.76
-  :tnr: 0.24
-  :ppv: 0.53
-  :npv: 0.67
-  :acc_perc: 56
-  :tpr_perc: 88
-  :tnr_perc: 24
-  :ppv_perc: 53
-  :npv_perc: 67
-tensorflow-rf.v3:
-  :tp: 2362
-  :fp: 1243
-  :tn: 2835
-  :fn: 1640
-  :n: 8080
-  :acc: 0.64
-  :tpr: 0.59
-  :fpr: 0.3
-  :tnr: 0.7
-  :ppv: 0.66
-  :npv: 0.63
-  :acc_perc: 64
-  :tpr_perc: 59
-  :tnr_perc: 70
-  :ppv_perc: 66
-  :npv_perc: 63
-tensorflow-lr.v3:
-  :tp: 2395
-  :fp: 1427
-  :tn: 2651
-  :fn: 1607
-  :n: 8080
-  :acc: 0.62
-  :tpr: 0.6
-  :fpr: 0.35
-  :tnr: 0.65
-  :ppv: 0.63
-  :npv: 0.62
-  :acc_perc: 62
-  :tpr_perc: 60
-  :tnr_perc: 65
-  :ppv_perc: 63
-  :npv_perc: 62
-tensorflow-lr2.v3:
-  :tp: 2487
-  :fp: 1497
-  :tn: 2581
-  :fn: 1515
-  :n: 8080
-  :acc: 0.63
-  :tpr: 0.62
-  :fpr: 0.37
-  :tnr: 0.63
-  :ppv: 0.62
-  :npv: 0.63
-  :acc_perc: 63
-  :tpr_perc: 62
-  :tnr_perc: 63
-  :ppv_perc: 62
-  :npv_perc: 63
-tensorflow-nn.v3:
-  :tp: 2452
-  :fp: 1468
-  :tn: 2610
-  :fn: 1550
-  :n: 8080
-  :acc: 0.63
-  :tpr: 0.61
-  :fpr: 0.36
-  :tnr: 0.64
-  :ppv: 0.63
-  :npv: 0.63
-  :acc_perc: 63
-  :tpr_perc: 61
-  :tnr_perc: 64
-  :ppv_perc: 63
-  :npv_perc: 63
+:cv:
+  lazar-all:
+    :tp: 3326
+    :fp: 833
+    :tn: 3039
+    :fn: 583
+    :n: 7781
+    :acc: 0.82
+    :tpr: 0.85
+    :fpr: 0.22
+    :tnr: 0.78
+    :ppv: 0.8
+    :npv: 0.84
+    :acc_perc: 82
+    :tpr_perc: 85
+    :tnr_perc: 78
+    :ppv_perc: 80
+    :npv_perc: 84
+  lazar-high-confidence:
+    :tp: 2816
+    :fp: 571
+    :tn: 2138
+    :fn: 365
+    :n: 5890
+    :acc: 0.84
+    :tpr: 0.89
+    :fpr: 0.21
+    :tnr: 0.79
+    :ppv: 0.83
+    :npv: 0.85
+    :acc_perc: 84
+    :tpr_perc: 89
+    :tnr_perc: 79
+    :ppv_perc: 83
+    :npv_perc: 85
+  lazar-padel-all:
+    :tp: 593
+    :fp: 466
+    :tn: 1777
+    :fn: 1253
+    :n: 4089
+    :acc: 0.58
+    :tpr: 0.32
+    :fpr: 0.21
+    :tnr: 0.79
+    :ppv: 0.56
+    :npv: 0.59
+    :acc_perc: 58
+    :tpr_perc: 32
+    :tnr_perc: 79
+    :ppv_perc: 56
+    :npv_perc: 59
+  lazar-padel-high-confidence:
+    :tp: 593
+    :fp: 466
+    :tn: 1771
+    :fn: 1251
+    :n: 4081
+    :acc: 0.58
+    :tpr: 0.32
+    :fpr: 0.21
+    :tnr: 0.79
+    :ppv: 0.56
+    :npv: 0.59
+    :acc_perc: 58
+    :tpr_perc: 32
+    :tnr_perc: 79
+    :ppv_perc: 56
+    :npv_perc: 59
+  R-RF:
+    :tp: 2259
+    :fp: 1173
+    :tn: 2897
+    :fn: 1741
+    :n: 8070
+    :acc: 0.64
+    :tpr: 0.56
+    :fpr: 0.29
+    :tnr: 0.71
+    :ppv: 0.66
+    :npv: 0.62
+    :acc_perc: 64
+    :tpr_perc: 56
+    :tnr_perc: 71
+    :ppv_perc: 66
+    :npv_perc: 62
+  R-SVM:
+    :tp: 2243
+    :fp: 1353
+    :tn: 2717
+    :fn: 1757
+    :n: 8070
+    :acc: 0.61
+    :tpr: 0.56
+    :fpr: 0.33
+    :tnr: 0.67
+    :ppv: 0.62
+    :npv: 0.61
+    :acc_perc: 61
+    :tpr_perc: 56
+    :tnr_perc: 67
+    :ppv_perc: 62
+    :npv_perc: 61
+  R-DL:
+    :tp: 3517
+    :fp: 3099
+    :tn: 971
+    :fn: 483
+    :n: 8070
+    :acc: 0.56
+    :tpr: 0.88
+    :fpr: 0.76
+    :tnr: 0.24
+    :ppv: 0.53
+    :npv: 0.67
+    :acc_perc: 56
+    :tpr_perc: 88
+    :tnr_perc: 24
+    :ppv_perc: 53
+    :npv_perc: 67
+  tensorflow-rf.v3:
+    :tp: 2362
+    :fp: 1243
+    :tn: 2835
+    :fn: 1640
+    :n: 8080
+    :acc: 0.64
+    :tpr: 0.59
+    :fpr: 0.3
+    :tnr: 0.7
+    :ppv: 0.66
+    :npv: 0.63
+    :acc_perc: 64
+    :tpr_perc: 59
+    :tnr_perc: 70
+    :ppv_perc: 66
+    :npv_perc: 63
+  tensorflow-lr.v3:
+    :tp: 2395
+    :fp: 1427
+    :tn: 2651
+    :fn: 1607
+    :n: 8080
+    :acc: 0.62
+    :tpr: 0.6
+    :fpr: 0.35
+    :tnr: 0.65
+    :ppv: 0.63
+    :npv: 0.62
+    :acc_perc: 62
+    :tpr_perc: 60
+    :tnr_perc: 65
+    :ppv_perc: 63
+    :npv_perc: 62
+  tensorflow-lr2.v3:
+    :tp: 2487
+    :fp: 1497
+    :tn: 2581
+    :fn: 1515
+    :n: 8080
+    :acc: 0.63
+    :tpr: 0.62
+    :fpr: 0.37
+    :tnr: 0.63
+    :ppv: 0.62
+    :npv: 0.63
+    :acc_perc: 63
+    :tpr_perc: 62
+    :tnr_perc: 63
+    :ppv_perc: 62
+    :npv_perc: 63
+  tensorflow-nn.v3:
+    :tp: 2452
+    :fp: 1468
+    :tn: 2610
+    :fn: 1550
+    :n: 8080
+    :acc: 0.63
+    :tpr: 0.61
+    :fpr: 0.36
+    :tnr: 0.64
+    :ppv: 0.63
+    :npv: 0.63
+    :acc_perc: 63
+    :tpr_perc: 61
+    :tnr_perc: 64
+    :ppv_perc: 63
+    :npv_perc: 63
diff --git a/Makefile b/Makefile
index 5ceb4aa..fb5eb30 100644
--- a/Makefile
+++ b/Makefile
@@ -13,6 +13,7 @@ TEMPLATE_FILE_LATEX   = pandoc-scholar.latex
 # Experiments
 
 # crossvalidations
+
 LAZAR_CONFUSION_MATRIX_DIR = 10-fold-crossvalidations/lazar/crossvalidation/confusion_matrices
 LAZAR_PADEL_CONFUSION_MATRIX_DIR = 10-fold-crossvalidations/lazar-padel/crossvalidation/confusion_matrices
 R_CV_DIR = 10-fold-crossvalidations/R
@@ -22,8 +23,10 @@ CONFUSION_MATRICES_DIR = 10-fold-crossvalidations/confusion-matrices
 CONFUSION_MATRICES = $(CONFUSION_MATRICES_DIR)/lazar-all.csv $(CONFUSION_MATRICES_DIR)/lazar-high-confidence.csv $(CONFUSION_MATRICES_DIR)/lazar-padel-all.csv $(CONFUSION_MATRICES_DIR)/lazar-padel-high-confidence.csv $(CONFUSION_MATRICES_DIR)/R-RF.csv $(CONFUSION_MATRICES_DIR)/R-SVM.csv $(CONFUSION_MATRICES_DIR)/R-DL.csv $(CONFUSION_MATRICES_DIR)/tensorflow-rf.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-lr.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-lr2.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-nn.v3.csv
 
 CV_SUMMARY = 10-fold-crossvalidations/summary.yaml
+PA_SUMMARY = pyrrolizidine-alkaloids/summary.yaml
 
 # PA predictions
+
 PA_DIR = pyrrolizidine-alkaloids
 PA_LAZAR_DIR = $(PA_DIR)/lazar
 PA_R_DIR = $(PA_DIR)/R
@@ -32,13 +35,14 @@ PA_TF_DIR = $(PA_DIR)/tensorflow
 PA_PREDICTIONS = $(PA_LAZAR_DIR)/pa-mp2d-predictions.csv $(PA_LAZAR_DIR)/pa-padel-predictions.csv $(PA_R_DIR)/PA.RF.outcome.csv $(PA_R_DIR)/PA.SVM.outcome.csv $(PA_R_DIR)/PA.DL.outcome.csv $(PA_TF_DIR)/pred.lr.v3-ext-Padel-2D.csv $(PA_TF_DIR)/pred.lr2.v3-ext-Padel-2D.csv $(PA_TF_DIR)/pred.rf.v3-ext-Padel-2D.csv $(PA_TF_DIR)/pred.nn.v3-ext-Padel-2D.csv
 
 # manuscript
+
 TABLES = tables/lazar-summary.csv tables/r-summary.csv tables/tensorflow-summary.csv tables/pa-tab.tex
 FIGURES = figures/roc.png figures/tsne-mp2d.png figures/tsne-padel.png
 
-all: $(TABLES) $(FIGURES) mutagenicity.pdf 
+all: $(TABLES) $(FIGURES) $(CV_SUMMARY) mutagenicity.pdf 
 include $(PANDOC_SCHOLAR_PATH)/Makefile
 
-mutagenicity.mustache.md: $(CV_SUMMARY) mutagenicity.md $(TABLES) $(FIGURES)
+mutagenicity.mustache.md: $(CV_SUMMARY) $(PA_SUMMARY) mutagenicity.md $(TABLES) $(FIGURES)
 	mustache $^ > $@
 
 # figures
@@ -62,6 +66,7 @@ figures/roc.csv: $(CV_SUMMARY)
 	scripts/summary2roc.rb $< > $@
 
 # tables
+
 tables/pa-tab.tex: tables/pa-table.csv
 	scripts/pa-tex-table.rb $< > $@
 
@@ -77,10 +82,10 @@ tables/r-summary.csv: $(CV_SUMMARY)
 tables/tensorflow-summary.csv: $(CV_SUMMARY)
 	scripts/summary2table.rb tensorflow > $@
 
-# crossvalidation summary
+# PA summary
 
-$(CV_SUMMARY): $(CONFUSION_MATRICES)
-	scripts/confusion-matrix-summary.rb $^ > $@
+$(PA_SUMMARY): tables/pa-table.csv
+	scripts/pa-summary.rb $< > $@
 
 # PA predictions
 
@@ -93,9 +98,15 @@ $(PA_LAZAR_DIR)/pa-mp2d-predictions.csv: $(PA_LAZAR_DIR)/pa-smiles.csv
 $(PA_LAZAR_DIR)/pa-smiles.csv: pyrrolizidine-alkaloids/180920_PA_complete_SMILES.csv
 	cut -f1,4 -d ';' $< | sed 's/;/,/' > $@
 
+# crossvalidation summary
+
+$(CV_SUMMARY): $(CONFUSION_MATRICES)
+	scripts/confusion-matrix-summary.rb $^ > $@
+
 # confusion matrices
 
 ## lazar
+
 $(CONFUSION_MATRICES_DIR)/lazar-all.csv: $(LAZAR_CONFUSION_MATRIX_DIR)
 	cp $</all $@
 
@@ -109,6 +120,7 @@ $(CONFUSION_MATRICES_DIR)/lazar-padel-high-confidence.csv: $(LAZAR_PADEL_CONFUSI
 	cp $</high_confidence $@
 
 ## R
+
 $(CONFUSION_MATRICES_DIR)/R-SVM.csv: $(R_CV_DIR)/Sgl-Observations-SVM.csv
 	scripts/cv-r-confusion-matrix.rb $< > $@
 
@@ -119,6 +131,7 @@ $(CONFUSION_MATRICES_DIR)/R-DL.csv: $(R_CV_DIR)/Sgl-Observations-DL.csv
 	scripts/cv-r-confusion-matrix.rb $< > $@
 
 ## tensorflow
+
 $(TENSORFLOW_CV_DIR)/pred.lr.v3.norm.sorted.csv: $(TENSORFLOW_CV_DIR)/pred.lr.v3.norm.csv
 	sort -n $< > $@
 
diff --git a/figures/roc.png b/figures/roc.png
index 732299b..24a9dfb 100644
Binary files a/figures/roc.png and b/figures/roc.png differ
diff --git a/mutagenicity.md b/mutagenicity.md
index 9f7e349..c278142 100644
--- a/mutagenicity.md
+++ b/mutagenicity.md
@@ -42,7 +42,7 @@ Abstract
 Random forest, support vector machine, logistic regression, neural networks and k-nearest neighbor
 (`lazar`) algorithms, were applied to new *Salmonella* mutagenicity dataset
 with 8309 unique chemical structures. The best prediction accuracies in
-10-fold-crossvalidation were obtained with `lazar` models and MolPrint2D descriptors, that gave accuracies ({{lazar-high-confidence.acc_perc}}%)
+10-fold-crossvalidation were obtained with `lazar` models and MolPrint2D descriptors, which gave accuracies ({{cv.lazar-high-confidence.acc_perc}}%)
 similar to the interlaboratory variability of the Ames test.
 
 **TODO**: PA results
@@ -497,13 +497,15 @@ Crossvalidation results are summarized in the following tables: @tbl:lazar shows
 Confusion matrices for all models are available from the git repository http://git.in-silico.ch/mutagenicity-paper/10-fold-crossvalidations/confusion-matrices/, individual predictions can be found in 
 http://git.in-silico.ch/mutagenicity-paper/10-fold-crossvalidations/predictions/.
 
-The most accurate crossvalidation predictions have been obtained with standard `lazar` models using MolPrint2D descriptors ({{lazar-high-confidence.acc}} for predictions with high confidence, {{lazar-all.acc}} for all predictions). Models utilizing PaDEL descriptors have generally lower accuracies ranging from {{R-DL.acc}} (R deep learning) to {{R-RF.acc}} (R/Tensorflow random forests). Sensitivity and specificity is generally well balanced with the exception of `lazar`-PaDEL (low sensitivity) and R deep learning (low specificity) models.
+The most accurate crossvalidation predictions have been obtained with standard `lazar` models using MolPrint2D descriptors ({{cv.lazar-high-confidence.acc}} for predictions with high confidence, {{cv.lazar-all.acc}} for all predictions). Models utilizing PaDEL descriptors have generally lower accuracies ranging from {{cv.R-DL.acc}} (R deep learning) to {{cv.R-RF.acc}} (R/Tensorflow random forests). Sensitivity and specificity are generally well balanced with the exception of `lazar`-PaDEL (low sensitivity) and R deep learning (low specificity) models.
 
 Pyrrolizidine alkaloid mutagenicity predictions 
 -----------------------------------------------
 
 Mutagenicity predictions from all investigated models for 602 pyrrolizidine alkaloids are summarized in Table 4. 
 
+**TODO** **Verena und Philipp** Koennt Ihr bitte stichprobenweise die Tabelle ueberpruefen, mir verrutscht bei der Auswertung immer gerne etwas.
+
 \input{tables/pa-tab.tex}
 
 Training data and 
@@ -546,16 +548,16 @@ models have low specificity.
 The accuracy of `lazar` *in-silico* predictions are comparable to the
 interlaboratory variability of the Ames test (80-85% according to
 @Benigni1988), especially for predictions with high confidence
-({{lazar-high-confidence.acc_perc}}%). This is a clear indication that
+({{cv.lazar-high-confidence.acc_perc}}%). This is a clear indication that
 *in-silico* predictions can be as reliable as the bioassays, if the compounds
 are close to the applicability domain. This conclusion is also supported by our
 analysis of `lazar` lowest observed effect level predictions, which are also
 similar to the experimental variability (@Helma2018).
 
-The lowest number of predictions ({{lazar-padel-high-confidence.n}}) has been
+The lowest number of predictions ({{cv.lazar-padel-high-confidence.n}}) has been
 obtained from `lazar`-PaDEL high confidence predictions, the largest number of
-predictions comes from Tensorflow models ({{tensorflow-rf.v3.n}}). Standard
-`lazar` give a slightly lower number of predictions ({{lazar-all.n}}) than R
+predictions comes from Tensorflow models ({{cv.tensorflow-rf.v3.n}}). Standard
+`lazar` models give a slightly lower number of predictions ({{cv.lazar-all.n}}) than R
 and Tensorflow models. This is not necessarily a disadvantage, because `lazar`
 abstains from predictions, if the query compound is very dissimilar from the
 compounds in the training set and thus avoids to make predictions for compounds
@@ -751,7 +753,7 @@ A new public *Salmonella* mutagenicity training dataset with 8309 compounds was
 created and used it to train `lazar`, R and Tensorflow models with MolPrint2D
 and PaDEL descriptors. The best performance was obtained with `lazar` models
 using MolPrint2D descriptors, with prediction accuracies
-({{lazar-high-confidence.acc_perc}}%) comparable to the interlaboratory variability
+({{cv.lazar-high-confidence.acc_perc}}%) comparable to the interlaboratory variability
 of the Ames test (80-85%). Models based on PaDEL descriptors had lower
 accuracies than MolPrint2D models, but only the `lazar` algorithm could use
 MolPrint2D descriptors.
diff --git a/pyrrolizidine-alkaloids/summary.yaml b/pyrrolizidine-alkaloids/summary.yaml
new file mode 100644
index 0000000..66c5030
--- /dev/null
+++ b/pyrrolizidine-alkaloids/summary.yaml
@@ -0,0 +1,83 @@
+---
+:pa:
+  :n: 602
+  :lazar:
+    :mp2d:
+      :all:
+        :n: 560
+        :mut: 111
+        :non_mut: 449
+        :n_perc: 93
+        :mut_perc: 19
+        :non_mut_perc: 80
+      :high_confidence:
+        :n: 301
+        :mut: 76
+        :non_mut: 225
+        :n_perc: 50
+        :mut_perc: 25
+        :non_mut_perc: 74
+    :padel:
+      :all:
+        :n: 600
+        :mut: 83
+        :non_mut: 517
+        :n_perc: 99
+        :mut_perc: 13
+        :non_mut_perc: 86
+      :high_confidence:
+        :n: 0
+        :mut: 0
+        :non_mut: 0
+        :n_perc: 0
+  :r:
+    :rf:
+      :n: 602
+      :mut: 18
+      :non_mut: 584
+      :n_perc: 100
+      :mut_perc: 2
+      :non_mut_perc: 97
+    :svm:
+      :n: 602
+      :mut: 11
+      :non_mut: 591
+      :n_perc: 100
+      :mut_perc: 1
+      :non_mut_perc: 98
+    :dl:
+      :n: 602
+      :mut: 521
+      :non_mut: 81
+      :n_perc: 100
+      :mut_perc: 86
+      :non_mut_perc: 13
+  :tf:
+    :rf:
+      :n: 602
+      :mut: 186
+      :non_mut: 416
+      :n_perc: 100
+      :mut_perc: 30
+      :non_mut_perc: 69
+    :lr_sgd:
+      :n: 602
+      :mut: 286
+      :non_mut: 316
+      :n_perc: 100
+      :mut_perc: 47
+      :non_mut_perc: 52
+    :lr_scikit:
+      :n: 602
+      :mut: 395
+      :non_mut: 207
+      :n_perc: 100
+      :mut_perc: 65
+      :non_mut_perc: 34
+    :nn:
+      :n: 602
+      :mut: 295
+      :non_mut: 307
+      :n_perc: 100
+      :mut_perc: 49
+      :non_mut_perc: 50
diff --git a/scripts/confusion-matrix-summary.rb b/scripts/confusion-matrix-summary.rb
index 129d69a..8a32f79 100755
--- a/scripts/confusion-matrix-summary.rb
+++ b/scripts/confusion-matrix-summary.rb
@@ -30,5 +30,5 @@ ARGV.each do |f|
   }
   results[File.basename(f,".csv")] = result
 end
-
+results = {:cv => results}
 puts results.to_yaml
diff --git a/scripts/summary2roc.rb b/scripts/summary2roc.rb
index e50d97a..e692d74 100755
--- a/scripts/summary2roc.rb
+++ b/scripts/summary2roc.rb
@@ -1,7 +1,7 @@
 #!/usr/bin/env ruby
 require "yaml"
 
-data = YAML.load(File.read ARGV[0])
+data = YAML.load(File.read ARGV[0])[:cv]
 puts "tpr,fpr"
 data.each do |algo,values|
   algo = algo.sub("tensorflow","Tensorflow").sub("selected","FS").sub(".v3","").sub("-all"," (all)").sub("-high-confidence"," (high confidence)").sub("padel","PaDEL").sub("lazar ","lazar-MP2D ").sub("lr2","LR-scikit").sub("lr","LR-sgd").sub("nn","NN").sub("-rf","-RF")
diff --git a/scripts/summary2table.rb b/scripts/summary2table.rb
index 555097c..267bb97 100755
--- a/scripts/summary2table.rb
+++ b/scripts/summary2table.rb
@@ -2,7 +2,7 @@
 require 'yaml'
 
 rows = {:acc => "Accuracy", :tpr => "True positive rate/Sensitivity", :tnr => "True negative rate/Specificity", :ppv => "Positive predictive value/Precision", :npv => "Negative predictive value", :n => "Nr. predictions"}
-data = YAML.load_file "10-fold-crossvalidations/summary.yaml"
+data = YAML.load_file("10-fold-crossvalidations/summary.yaml")[:cv]
 
 case ARGV[0]
 when "R"
-- 
cgit v1.2.3