summary refs log tree commit diff
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2020-10-16 19:39:25 +0200
committer Christoph Helma <helma@in-silico.ch> 2020-10-16 19:39:25 +0200
commit 0b34eeae710600c2e145f5257eec08a785811adb (patch)
tree d93917a428674f64fe59d9c2ec511f6e5430aac9
parent b4f6729dfaf0cf2c9a2b4a6fcb5f4f0660000afe (diff)
ROC plot updated for readability
-rw-r--r-- 10-fold-crossvalidations/summary.yaml 136
-rw-r--r-- Makefile 8
-rw-r--r-- figures/roc.csv 18
-rw-r--r-- figures/roc.png bin 75557 -> 149175 bytes
-rw-r--r-- mutagenicity.md 11
-rwxr-xr-x scripts/results2csv.rb 9
-rwxr-xr-x scripts/roc.R 4
-rwxr-xr-x scripts/summary2roc.rb 9
8 files changed, 97 insertions, 98 deletions
diff --git a/10-fold-crossvalidations/summary.yaml b/10-fold-crossvalidations/summary.yaml
index 2c6f98b..c05db63 100644
--- a/10-fold-crossvalidations/summary.yaml
+++ b/10-fold-crossvalidations/summary.yaml
@@ -1,55 +1,4 @@
---
-R-SVM:
- :tp: 2243
- :fp: 1353
- :tn: 2717
- :fn: 1757
- :n: 8070
- :acc: 0.61
- :tpr: 0.56
- :fpr: 0.33
- :tnr: 0.67
- :ppv: 0.62
- :npv: 0.61
- :acc_perc: 61
- :tpr_perc: 56
- :tnr_perc: 67
- :ppv_perc: 62
- :npv_perc: 61
-R-RF:
- :tp: 2259
- :fp: 1173
- :tn: 2897
- :fn: 1741
- :n: 8070
- :acc: 0.64
- :tpr: 0.56
- :fpr: 0.29
- :tnr: 0.71
- :ppv: 0.66
- :npv: 0.62
- :acc_perc: 64
- :tpr_perc: 56
- :tnr_perc: 71
- :ppv_perc: 66
- :npv_perc: 62
-R-DL:
- :tp: 3517
- :fp: 3099
- :tn: 971
- :fn: 483
- :n: 8070
- :acc: 0.56
- :tpr: 0.88
- :fpr: 0.76
- :tnr: 0.24
- :ppv: 0.53
- :npv: 0.67
- :acc_perc: 56
- :tpr_perc: 88
- :tnr_perc: 24
- :ppv_perc: 53
- :npv_perc: 67
lazar-all:
:tp: 3326
:fp: 833
@@ -118,6 +67,74 @@ lazar-padel-high-confidence:
:tnr_perc: 79
:ppv_perc: 56
:npv_perc: 59
+R-RF:
+ :tp: 2259
+ :fp: 1173
+ :tn: 2897
+ :fn: 1741
+ :n: 8070
+ :acc: 0.64
+ :tpr: 0.56
+ :fpr: 0.29
+ :tnr: 0.71
+ :ppv: 0.66
+ :npv: 0.62
+ :acc_perc: 64
+ :tpr_perc: 56
+ :tnr_perc: 71
+ :ppv_perc: 66
+ :npv_perc: 62
+R-SVM:
+ :tp: 2243
+ :fp: 1353
+ :tn: 2717
+ :fn: 1757
+ :n: 8070
+ :acc: 0.61
+ :tpr: 0.56
+ :fpr: 0.33
+ :tnr: 0.67
+ :ppv: 0.62
+ :npv: 0.61
+ :acc_perc: 61
+ :tpr_perc: 56
+ :tnr_perc: 67
+ :ppv_perc: 62
+ :npv_perc: 61
+R-DL:
+ :tp: 3517
+ :fp: 3099
+ :tn: 971
+ :fn: 483
+ :n: 8070
+ :acc: 0.56
+ :tpr: 0.88
+ :fpr: 0.76
+ :tnr: 0.24
+ :ppv: 0.53
+ :npv: 0.67
+ :acc_perc: 56
+ :tpr_perc: 88
+ :tnr_perc: 24
+ :ppv_perc: 53
+ :npv_perc: 67
+tensorflow-rf.v3:
+ :tp: 2362
+ :fp: 1243
+ :tn: 2835
+ :fn: 1640
+ :n: 8080
+ :acc: 0.64
+ :tpr: 0.59
+ :fpr: 0.3
+ :tnr: 0.7
+ :ppv: 0.66
+ :npv: 0.63
+ :acc_perc: 64
+ :tpr_perc: 59
+ :tnr_perc: 70
+ :ppv_perc: 66
+ :npv_perc: 63
tensorflow-lr.v3:
:tp: 2395
:fp: 1427
@@ -169,20 +186,3 @@ tensorflow-nn.v3:
:tnr_perc: 64
:ppv_perc: 63
:npv_perc: 63
-tensorflow-rf.v3:
- :tp: 2362
- :fp: 1243
- :tn: 2835
- :fn: 1640
- :n: 8080
- :acc: 0.64
- :tpr: 0.59
- :fpr: 0.3
- :tnr: 0.7
- :ppv: 0.66
- :npv: 0.63
- :acc_perc: 64
- :tpr_perc: 59
- :tnr_perc: 70
- :ppv_perc: 66
- :npv_perc: 63
diff --git a/Makefile b/Makefile
index 89153e1..911aa52 100644
--- a/Makefile
+++ b/Makefile
@@ -19,19 +19,17 @@ R_CV_DIR = 10-fold-crossvalidations/R
TENSORFLOW_CV_DIR = 10-fold-crossvalidations/tensorflow
CONFUSION_MATRICES_DIR = 10-fold-crossvalidations/confusion-matrices
-CONFUSION_MATRICES = $(CONFUSION_MATRICES_DIR)/R-SVM.csv $(CONFUSION_MATRICES_DIR)/R-RF.csv $(CONFUSION_MATRICES_DIR)/R-DL.csv $(CONFUSION_MATRICES_DIR)/lazar-all.csv $(CONFUSION_MATRICES_DIR)/lazar-high-confidence.csv $(CONFUSION_MATRICES_DIR)/lazar-padel-all.csv $(CONFUSION_MATRICES_DIR)/lazar-padel-high-confidence.csv $(CONFUSION_MATRICES_DIR)/tensorflow-lr.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-lr2.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-nn.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-rf.v3.csv
+CONFUSION_MATRICES = $(CONFUSION_MATRICES_DIR)/lazar-all.csv $(CONFUSION_MATRICES_DIR)/lazar-high-confidence.csv $(CONFUSION_MATRICES_DIR)/lazar-padel-all.csv $(CONFUSION_MATRICES_DIR)/lazar-padel-high-confidence.csv $(CONFUSION_MATRICES_DIR)/R-RF.csv $(CONFUSION_MATRICES_DIR)/R-SVM.csv $(CONFUSION_MATRICES_DIR)/R-DL.csv $(CONFUSION_MATRICES_DIR)/tensorflow-rf.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-lr.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-lr2.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-nn.v3.csv
CV_SUMMARY = 10-fold-crossvalidations/summary.yaml
# PA predictions
PA_DIR = pyrrolizidine-alkaloids
PA_LAZAR_DIR = $(PA_DIR)/lazar
-#PA_LAZAR_MP2D_DIR = $(PA_LAZAR_DIR)/mp2d
-#PA_LAZAR_PADEL_DIR = $(PA_LAZAR_DIR)/padel
# manuscript
TABLES = tables/lazar-summary.csv tables/r-summary.csv tables/tensorflow-summary.csv tables/pa-tab.tex
-FIGURES = figures/roc.png figures/tsne-mp2d.png figures/tsne-padel.png #figures/pa-predictions.png
+FIGURES = figures/roc.png figures/tsne-mp2d.png figures/tsne-padel.png
all: $(TABLES) $(FIGURES) mutagenicity.pdf
include $(PANDOC_SCHOLAR_PATH)/Makefile
@@ -57,7 +55,7 @@ figures/roc.png: figures/roc.csv
scripts/roc.R
figures/roc.csv: $(CV_SUMMARY)
- scripts/results2csv.rb $< > $@
+ scripts/summary2roc.rb $< > $@
# tables
tables/pa-tab.tex: scripts/pa-table.rb
diff --git a/figures/roc.csv b/figures/roc.csv
index ac79a2e..10a2f39 100644
--- a/figures/roc.csv
+++ b/figures/roc.csv
@@ -1,12 +1,12 @@
tpr,fpr
-R-SVM,0.56,0.33
+lazar-MP2D (all),0.85,0.22
+lazar-MP2D (high confidence),0.89,0.21
+lazar-PaDEL (all),0.32,0.21
+lazar-PaDEL (high confidence),0.32,0.21
R-RF,0.56,0.29
+R-SVM,0.56,0.33
R-DL,0.88,0.76
-L,0.85,0.22
-L-HC,0.89,0.21
-L-P,0.32,0.21
-L-P-HC,0.32,0.21
-TF-lr,0.6,0.35
-TF-lr2,0.62,0.37
-TF-nn,0.61,0.36
-TF-rf,0.59,0.3
+Tensorflow-RF,0.59,0.3
+Tensorflow-LR (SGD),0.6,0.35
+Tensorflow-LR (scikit),0.62,0.37
+Tensorflow-NN,0.61,0.36
diff --git a/figures/roc.png b/figures/roc.png
index 2e71ac3..9ccf2fa 100644
--- a/figures/roc.png
+++ b/figures/roc.png
Binary files differ
diff --git a/mutagenicity.md b/mutagenicity.md
index 4ac5a32..274519e 100644
--- a/mutagenicity.md
+++ b/mutagenicity.md
@@ -1,6 +1,5 @@
---
-title: A comparison of twelve machine learning models based on an expanded mutagenicity dataset and their application for predicting pyrrolizidine alkaloid mutagenicity
-# TODO check # algorithms
+title: A comparison of nine machine learning models based on an expanded mutagenicity dataset and their application for predicting pyrrolizidine alkaloid mutagenicity
#title: A comparison of random forest, support vector machine, linear regression, deep learning and lazar algorithms for predicting the mutagenic potential of different pyrrolizidine alkaloids
#subtitle: Performance comparison with a new expanded dataset
@@ -26,7 +25,7 @@ institute:
name: Berlin Institute for Medical Systems Biology, Max Delbrück Center for Molecular Medicine in the Helmholtz Association
address: "Robert-Rössle-Strasse 10, Berlin, 13125, Germany"
bibliography: bibliography.bib
-keywords: mutagenicity, QSAR, lazar, random forest, support vector machine, deep learning
+keywords: mutagenicity, QSAR, lazar, random forest, support vector machine, linear regression, neural nets, deep learning
documentclass: scrartcl
tblPrefix: Table
@@ -435,13 +434,13 @@ R scripts for these experiments can be found in https://git.in-silico.ch/mutagen
TODO: **Philipp** bitte ergaenzen
+#### Random forests
+
#### Logistic regression (SGD)
#### Logistic regression (scikit)
-#### Random forests
-
-#### Deep Learning
+#### Neural Nets
Alternatively, a DL model was established with Python-based Tensorflow
program (<https://www.tensorflow.org/>) using the high-level API Keras
diff --git a/scripts/results2csv.rb b/scripts/results2csv.rb
deleted file mode 100755
index cb4d550..0000000
--- a/scripts/results2csv.rb
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/usr/bin/env ruby
-require "yaml"
-
-data = YAML.load(File.read ARGV[0])
-puts "tpr,fpr"
-data.each do |algo,values|
- algo = algo.sub("tensorflow","TF").sub("selected","FS").sub("lazar","L").sub("padel","P").sub("high-confidence","HC").sub("-all","").sub(".v3","")
- puts [algo,values[:tpr],values[:fpr]].join(",")
-end
diff --git a/scripts/roc.R b/scripts/roc.R
index afc8293..459252a 100755
--- a/scripts/roc.R
+++ b/scripts/roc.R
@@ -2,7 +2,9 @@
library(ggplot2)
data <- read.csv("figures/roc.csv",header=T)
p <- ggplot(data, aes(x=fpr, y=tpr)) + geom_abline()
-p <- p + geom_label(label=rownames(data) )
+#p <- p + geom_label(label=rownames(data) )
+p <- p + geom_point(aes(color=rownames(data)))
+p <- p + theme(legend.title=element_blank())
p <- p + expand_limits(x=c(0,1),y=c(0,1))
p <- p + labs(x = "False positive rate", y = "True positive rate")
ggsave("figures/roc.png")
diff --git a/scripts/summary2roc.rb b/scripts/summary2roc.rb
new file mode 100755
index 0000000..dbac2f4
--- /dev/null
+++ b/scripts/summary2roc.rb
@@ -0,0 +1,9 @@
+#!/usr/bin/env ruby
+require "yaml"
+
+data = YAML.load(File.read ARGV[0])
+puts "tpr,fpr"
+data.each do |algo,values|
+ algo = algo.sub("tensorflow","Tensorflow").sub("selected","FS").sub(".v3","").sub("-all"," (all)").sub("-high-confidence"," (high confidence)").sub("padel","PaDEL").sub("lazar ","lazar-MP2D ").sub("lr2","LR (scikit)").sub("lr","LR (SGD)").sub("nn","NN").sub("-rf","-RF")
+ puts [algo,values[:tpr],values[:fpr]].join(",")
+end