summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2020-10-10 17:05:41 +0200
committerChristoph Helma <helma@in-silico.ch>2020-10-10 17:05:41 +0200
commite451d812f3b63d1987c8f1e7f5557156fdab984f (patch)
treef5b4e1730f0b75593925b3287d3a37fa70fa507e
parent23ce84a7da69104fa763d5a3911b7b0ad98fbdbc (diff)
Makefile and scripts cleanup; lazar, R and tensorflow tables
-rw-r--r--10-fold-crossvalidations/summaries/R-DL.json1
-rw-r--r--10-fold-crossvalidations/summaries/R-RF.json1
-rw-r--r--10-fold-crossvalidations/summaries/R-SVM.json1
-rw-r--r--10-fold-crossvalidations/summaries/lazar-all.json1
-rw-r--r--10-fold-crossvalidations/summaries/lazar-high-confidence.json1
-rw-r--r--10-fold-crossvalidations/summaries/lazar-padel-all.json1
-rw-r--r--10-fold-crossvalidations/summaries/lazar-padel-high-confidence.json1
-rw-r--r--10-fold-crossvalidations/summaries/results.json1
-rw-r--r--10-fold-crossvalidations/summaries/tensorflow-all.json1
-rw-r--r--10-fold-crossvalidations/summaries/tensorflow-selected.json1
-rw-r--r--10-fold-crossvalidations/summary.yaml (renamed from results.yaml)34
-rw-r--r--Makefile132
-rw-r--r--figures/roc.csv (renamed from figures/results.csv)2
-rw-r--r--figures/roc.pngbin75787 -> 76764 bytes
-rw-r--r--mutagenicity.md12
-rw-r--r--pyrrolizidine-alkaloids/pa_carcinogenicity.csv (renamed from export/pa_carcinogenicity.csv)0
-rw-r--r--pyrrolizidine-alkaloids/pa_mutagenicity.csv (renamed from export/pa_mutagenicity.csv)0
-rwxr-xr-xscripts/confusion-matrix2table.rb10
-rwxr-xr-xscripts/crossvalidation-summary.rb9
-rwxr-xr-xscripts/json2csv.rb29
-rwxr-xr-xscripts/results.rb36
-rwxr-xr-xscripts/roc.R2
-rwxr-xr-xscripts/summaries2table.rb32
-rwxr-xr-xscripts/tsne-mp2d.R (renamed from scripts/mp2d-tsne.R)0
-rwxr-xr-xscripts/tsne-padel.R (renamed from scripts/padel-tsne.R)0
-rw-r--r--tables/R-DL.csv4
-rw-r--r--tables/R-RF.csv4
-rw-r--r--tables/R-SVM.csv4
-rw-r--r--tables/lazar-all.csv4
-rw-r--r--tables/lazar-high-confidence.csv4
-rw-r--r--tables/lazar-padel-all.csv4
-rw-r--r--tables/lazar-padel-high-confidence.csv4
-rw-r--r--tables/lazar-summary.csv0
-rw-r--r--tables/r-summary.csv0
-rw-r--r--tables/tensorflow-all.csv4
-rw-r--r--tables/tensorflow-selected.csv4
-rw-r--r--tables/tensorflow-summary.csv7
37 files changed, 79 insertions, 272 deletions
diff --git a/10-fold-crossvalidations/summaries/R-DL.json b/10-fold-crossvalidations/summaries/R-DL.json
deleted file mode 100644
index 8a48d30..0000000
--- a/10-fold-crossvalidations/summaries/R-DL.json
+++ /dev/null
@@ -1 +0,0 @@
-{"accuracy":0.5561338289962825,"true_positive_rate":0.87925,"true_negative_rate":0.23857493857493858,"positive_predictive_value":0.531590084643289,"negative_predictive_value":0.6678129298486932}
diff --git a/10-fold-crossvalidations/summaries/R-RF.json b/10-fold-crossvalidations/summaries/R-RF.json
deleted file mode 100644
index ab7d6e7..0000000
--- a/10-fold-crossvalidations/summaries/R-RF.json
+++ /dev/null
@@ -1 +0,0 @@
-{"accuracy":0.638909541511772,"true_positive_rate":0.56475,"true_negative_rate":0.7117936117936118,"positive_predictive_value":0.6582167832167832,"negative_predictive_value":0.6246226821905994}
diff --git a/10-fold-crossvalidations/summaries/R-SVM.json b/10-fold-crossvalidations/summaries/R-SVM.json
deleted file mode 100644
index a038447..0000000
--- a/10-fold-crossvalidations/summaries/R-SVM.json
+++ /dev/null
@@ -1 +0,0 @@
-{"accuracy":0.6146220570012392,"true_positive_rate":0.56075,"true_negative_rate":0.6675675675675675,"positive_predictive_value":0.6237486095661846,"negative_predictive_value":0.6072865444792133}
diff --git a/10-fold-crossvalidations/summaries/lazar-all.json b/10-fold-crossvalidations/summaries/lazar-all.json
deleted file mode 100644
index e68ff79..0000000
--- a/10-fold-crossvalidations/summaries/lazar-all.json
+++ /dev/null
@@ -1 +0,0 @@
-{"accuracy":0.818018249582316,"true_positive_rate":0.8508569966743412,"true_negative_rate":0.7848657024793388,"positive_predictive_value":0.7997114691031498,"negative_predictive_value":0.8390392048591938}
diff --git a/10-fold-crossvalidations/summaries/lazar-high-confidence.json b/10-fold-crossvalidations/summaries/lazar-high-confidence.json
deleted file mode 100644
index a9f852e..0000000
--- a/10-fold-crossvalidations/summaries/lazar-high-confidence.json
+++ /dev/null
@@ -1 +0,0 @@
-{"accuracy":0.8410865874363328,"true_positive_rate":0.8852562087393901,"true_negative_rate":0.7892211148025101,"positive_predictive_value":0.8314142308827871,"negative_predictive_value":0.8541749900119856}
diff --git a/10-fold-crossvalidations/summaries/lazar-padel-all.json b/10-fold-crossvalidations/summaries/lazar-padel-all.json
deleted file mode 100644
index d8ce18a..0000000
--- a/10-fold-crossvalidations/summaries/lazar-padel-all.json
+++ /dev/null
@@ -1 +0,0 @@
-{"accuracy":0.5796038151137197,"true_positive_rate":0.32123510292524377,"true_negative_rate":0.792242532322782,"positive_predictive_value":0.5599622285174694,"negative_predictive_value":0.5864686468646865}
diff --git a/10-fold-crossvalidations/summaries/lazar-padel-high-confidence.json b/10-fold-crossvalidations/summaries/lazar-padel-high-confidence.json
deleted file mode 100644
index 7ec0b1e..0000000
--- a/10-fold-crossvalidations/summaries/lazar-padel-high-confidence.json
+++ /dev/null
@@ -1 +0,0 @@
-{"accuracy":0.5792697868169566,"true_positive_rate":0.3215835140997831,"true_negative_rate":0.7916852928028609,"positive_predictive_value":0.5599622285174694,"negative_predictive_value":0.586035737921906}
diff --git a/10-fold-crossvalidations/summaries/results.json b/10-fold-crossvalidations/summaries/results.json
deleted file mode 100644
index 033c728..0000000
--- a/10-fold-crossvalidations/summaries/results.json
+++ /dev/null
@@ -1 +0,0 @@
-{"programs":[{"name":"R","algos":[{"accuracy":0.61,"true_positive_rate":0.56,"true_negative_rate":0.67,"positive_predictive_value":0.62,"negative_predictive_value":0.61,"accuracy_perc":61,"true_positive_rate_perc":56,"true_negative_rate_perc":67,"positive_predictive_value_perc":62,"negative_predictive_value_perc":61,"name":"SVM","abbrev":"R-SVM"},{"accuracy":0.64,"true_positive_rate":0.56,"true_negative_rate":0.71,"positive_predictive_value":0.66,"negative_predictive_value":0.62,"accuracy_perc":64,"true_positive_rate_perc":56,"true_negative_rate_perc":71,"positive_predictive_value_perc":66,"negative_predictive_value_perc":62,"name":"RF","abbrev":"R-RF"},{"accuracy":0.56,"true_positive_rate":0.88,"true_negative_rate":0.24,"positive_predictive_value":0.53,"negative_predictive_value":0.67,"accuracy_perc":56,"true_positive_rate_perc":88,"true_negative_rate_perc":24,"positive_predictive_value_perc":53,"negative_predictive_value_perc":67,"name":"DL","abbrev":"R-DL"}]},{"name":"tensorflow","algos":[{"accuracy":0.63,"true_positive_rate":0.63,"true_negative_rate":0.63,"positive_predictive_value":0.62,"negative_predictive_value":0.63,"accuracy_perc":63,"true_positive_rate_perc":63,"true_negative_rate_perc":63,"positive_predictive_value_perc":62,"negative_predictive_value_perc":63,"name":"without feature selection","abbrev":"tensorflow-without feature selection"},{"accuracy":0.63,"true_positive_rate":0.61,"true_negative_rate":0.64,"positive_predictive_value":0.63,"negative_predictive_value":0.63,"accuracy_perc":63,"true_positive_rate_perc":61,"true_negative_rate_perc":64,"positive_predictive_value_perc":63,"negative_predictive_value_perc":63,"name":"with feature selection","abbrev":"tensorflow-with feature 
selection"}]},{"name":"lazar","algos":[{"accuracy":0.82,"true_positive_rate":0.85,"true_negative_rate":0.78,"positive_predictive_value":0.8,"negative_predictive_value":0.84,"accuracy_perc":82,"true_positive_rate_perc":85,"true_negative_rate_perc":78,"positive_predictive_value_perc":80,"negative_predictive_value_perc":84,"name":"all","abbrev":"lazar-all"},{"accuracy":0.84,"true_positive_rate":0.89,"true_negative_rate":0.79,"positive_predictive_value":0.83,"negative_predictive_value":0.85,"accuracy_perc":84,"true_positive_rate_perc":89,"true_negative_rate_perc":79,"positive_predictive_value_perc":83,"negative_predictive_value_perc":85,"name":"high-confidence","abbrev":"lazar-high-confidence"},{"accuracy":0.58,"true_positive_rate":0.32,"true_negative_rate":0.79,"positive_predictive_value":0.56,"negative_predictive_value":0.59,"accuracy_perc":58,"true_positive_rate_perc":32,"true_negative_rate_perc":79,"positive_predictive_value_perc":56,"negative_predictive_value_perc":59,"name":"PaDEL all","abbrev":"lazar-PaDEL all"},{"accuracy":0.58,"true_positive_rate":0.32,"true_negative_rate":0.79,"positive_predictive_value":0.56,"negative_predictive_value":0.59,"accuracy_perc":58,"true_positive_rate_perc":32,"true_negative_rate_perc":79,"positive_predictive_value_perc":56,"negative_predictive_value_perc":59,"name":"PaDEL high-confidence","abbrev":"lazar-PaDEL high-confidence"}]}]}
diff --git a/10-fold-crossvalidations/summaries/tensorflow-all.json b/10-fold-crossvalidations/summaries/tensorflow-all.json
deleted file mode 100644
index a605a4d..0000000
--- a/10-fold-crossvalidations/summaries/tensorflow-all.json
+++ /dev/null
@@ -1 +0,0 @@
-{"accuracy":0.6258663366336633,"true_positive_rate":0.6264367816091954,"true_negative_rate":0.6253065228052967,"positive_predictive_value":0.6213135068153656,"negative_predictive_value":0.630407911001236}
diff --git a/10-fold-crossvalidations/summaries/tensorflow-selected.json b/10-fold-crossvalidations/summaries/tensorflow-selected.json
deleted file mode 100644
index 93c54ef..0000000
--- a/10-fold-crossvalidations/summaries/tensorflow-selected.json
+++ /dev/null
@@ -1 +0,0 @@
-{"accuracy":0.6283415841584158,"true_positive_rate":0.612943528235882,"true_negative_rate":0.6434526728788622,"positive_predictive_value":0.6278474532889685,"negative_predictive_value":0.6288042175892643}
diff --git a/results.yaml b/10-fold-crossvalidations/summary.yaml
index 5952b39..2c6f98b 100644
--- a/results.yaml
+++ b/10-fold-crossvalidations/summary.yaml
@@ -50,40 +50,6 @@ R-DL:
:tnr_perc: 24
:ppv_perc: 53
:npv_perc: 67
-tensorflow-all:
- :tp: 2507
- :fp: 1528
- :tn: 2550
- :fn: 1495
- :n: 8080
- :acc: 0.63
- :tpr: 0.63
- :fpr: 0.37
- :tnr: 0.63
- :ppv: 0.62
- :npv: 0.63
- :acc_perc: 63
- :tpr_perc: 63
- :tnr_perc: 63
- :ppv_perc: 62
- :npv_perc: 63
-tensorflow-selected:
- :tp: 2453
- :fp: 1454
- :tn: 2624
- :fn: 1549
- :n: 8080
- :acc: 0.63
- :tpr: 0.61
- :fpr: 0.36
- :tnr: 0.64
- :ppv: 0.63
- :npv: 0.63
- :acc_perc: 63
- :tpr_perc: 61
- :tnr_perc: 64
- :ppv_perc: 63
- :npv_perc: 63
lazar-all:
:tp: 3326
:fp: 833
diff --git a/Makefile b/Makefile
index 8b39538..b2a4708 100644
--- a/Makefile
+++ b/Makefile
@@ -7,122 +7,69 @@ ARTICLE_FILE = mutagenicity.mustache.md
PANDOC_SCHOLAR_PATH = pandoc-scholar
OUTFILE_PREFIX = mutagenicity
DEFAULT_EXTENSIONS = pdf #latex docx html #odt epub
-#PANDOC_WRITER_OPTIONS = --filter=panpipe --filter=pandoc-placetable --filter=pandoc-citeproc -M tmpvar=test
PANDOC_WRITER_OPTIONS = --filter=pandoc-placetable --filter=pandoc-crossref --filter=pandoc-citeproc
TEMPLATE_FILE_LATEX = pandoc-scholar.latex
-# Lazar
-
-LAZAR_DIR = ../lazar
-LAZAR_MODEL_DIR = $(LAZAR_DIR)/models/mutagenicity
-LAZAR_PADEL_MODEL_DIR = $(LAZAR_DIR)/models/mutagenicity-padel
-LAZAR_SUMMARY_DIR = $(LAZAR_MODEL_DIR)/crossvalidation/summaries
-LAZAR_PADEL_SUMMARY_DIR = $(LAZAR_PADEL_MODEL_DIR)/crossvalidation/summaries
-LAZAR_CONFUSION_MATRIX_DIR = $(LAZAR_MODEL_DIR)/crossvalidation/confusion_matrices
-LAZAR_PADEL_CONFUSION_MATRIX_DIR = $(LAZAR_PADEL_MODEL_DIR)/crossvalidation/confusion_matrices
-
# Experiments
-SUMMARIES_DIR = 10-fold-crossvalidations/summaries
-CONFUSION_MATRICES_DIR = 10-fold-crossvalidations/confusion-matrices
+LAZAR_CONFUSION_MATRIX_DIR = 10-fold-crossvalidations/lazar/crossvalidation/confusion_matrices
+LAZAR_PADEL_CONFUSION_MATRIX_DIR = 10-fold-crossvalidations/lazar-padel/crossvalidation/confusion_matrices
R_CV_DIR = 10-fold-crossvalidations/R
TENSORFLOW_CV_DIR = 10-fold-crossvalidations/tensorflow
-#TABLES = tables/r-summary.csv tables/tf-summary.csv tables/lazar-summary.csv tables/R-SVM.csv tables/R-RF.csv tables/R-DL.csv tables/tensorflow-all.csv tables/tensorflow-selected.csv tables/lazar-all.csv tables/lazar-high-confidence.csv tables/lazar-padel-all.csv tables/lazar-padel-high-confidence.csv
-TABLES = tables/R-SVM.csv tables/R-RF.csv tables/R-DL.csv tables/tensorflow-all.csv tables/tensorflow-selected.csv tables/lazar-all.csv tables/lazar-high-confidence.csv tables/lazar-padel-all.csv tables/lazar-padel-high-confidence.csv #tables/pred.rf.v3.csv tables/pred.lr.v3.csv tables/pred.lr2.v3.csv tables/pred.nn.v3.csv
-
-R_SUMMARIES = $(SUMMARIES_DIR)/R-SVM.json $(SUMMARIES_DIR)/R-RF.json $(SUMMARIES_DIR)/R-DL.json
-TF_SUMMARIES = $(SUMMARIES_DIR)/tensorflow-all.json $(SUMMARIES_DIR)/tensorflow-selected.json $(SUMMARIES_DIR)/pred.lr.v3.json $(SUMMARIES_DIR)/pred.lr2.v3.json $(SUMMARIES_DIR)/pred.nn.v3.json $(SUMMARIES_DIR)/pred.rf.v3.json
-LAZAR_SUMMARIES = $(SUMMARIES_DIR)/lazar-all.json $(SUMMARIES_DIR)/lazar-high-confidence.json $(SUMMARIES_DIR)/lazar-padel-all.json $(SUMMARIES_DIR)/lazar-padel-high-confidence.json
+CONFUSION_MATRICES_DIR = 10-fold-crossvalidations/confusion-matrices
+CONFUSION_MATRICES = $(CONFUSION_MATRICES_DIR)/R-SVM.csv $(CONFUSION_MATRICES_DIR)/R-RF.csv $(CONFUSION_MATRICES_DIR)/R-DL.csv $(CONFUSION_MATRICES_DIR)/lazar-all.csv $(CONFUSION_MATRICES_DIR)/lazar-high-confidence.csv $(CONFUSION_MATRICES_DIR)/lazar-padel-all.csv $(CONFUSION_MATRICES_DIR)/lazar-padel-high-confidence.csv $(CONFUSION_MATRICES_DIR)/tensorflow-lr.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-lr2.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-nn.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-rf.v3.csv
-#SUMMARIES = $(R_SUMMARIES) $(TF_SUMMARIES) $(LAZAR_SUMMARIES)
+CV_SUMMARY = 10-fold-crossvalidations/summary.yaml
+TABLES = tables/lazar-summary.csv tables/r-summary.csv tables/tensorflow-summary.csv
+FIGURES = figures/roc.png figures/tsne-mp2d.png figures/tsne-padel.png #figures/pa-predictions.png
-CONFUSION_MATRICES = $(CONFUSION_MATRICES_DIR)/R-SVM.csv $(CONFUSION_MATRICES_DIR)/R-RF.csv $(CONFUSION_MATRICES_DIR)/R-DL.csv $(CONFUSION_MATRICES_DIR)/tensorflow-all.csv $(CONFUSION_MATRICES_DIR)/tensorflow-selected.csv $(CONFUSION_MATRICES_DIR)/lazar-all.csv $(CONFUSION_MATRICES_DIR)/lazar-high-confidence.csv $(CONFUSION_MATRICES_DIR)/lazar-padel-all.csv $(CONFUSION_MATRICES_DIR)/lazar-padel-high-confidence.csv $(CONFUSION_MATRICES_DIR)/tensorflow-lr.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-lr2.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-nn.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-rf.v3.csv
-DATA = data/mutagenicity.sdf data/mutagenicity.csv data/mutagenicity-fingerprints.csv
-FIGURES = figures/roc.png figures/tsne-mp2d.png figures/tsne-padel.png
+# manuscript
-all: $(DATA) $(TABLES) $(FIGURES) mutagenicity.pdf
+all: $(TABLES) $(FIGURES) mutagenicity.pdf
include $(PANDOC_SCHOLAR_PATH)/Makefile
-export: $(DATA)
-mutagenicity.mustache.md: results.yaml mutagenicity.md $(FIGURES)
+mutagenicity.mustache.md: $(CV_SUMMARY) mutagenicity.md $(TABLES) $(FIGURES)
mustache $^ > $@
# figures
+
figures/tsne-padel.png: figures/tsne-padel.csv
- scripts/padel-tsne.R
+ scripts/tsne-padel.R
figures/tsne-padel.csv: data/GenoTox-database.csv pyrrolizidine-alkaloids/PA-Padel-2D_m2.csv
scripts/padel-descriptors.rb $^ > $@
figures/tsne-mp2d.png: figures/tsne-mp2d.csv
- scripts/mp2d-tsne.R
+ scripts/tsne-mp2d.R
-figures/tsne-mp2d.csv: ../lazar/models/mutagenicity/independent_variables
+figures/tsne-mp2d.csv: 10-fold-crossvalidations/lazar/independent_variables
scripts/mp2d-distances.rb > figures/tsne-mp2d.csv
-figures/roc.png: figures/results.csv
+figures/roc.png: figures/roc.csv
scripts/roc.R
-figures/results.csv: results.yaml
+figures/roc.csv: $(CV_SUMMARY)
scripts/results2csv.rb $< > $@
# tables
-tables/r-summary.csv: $(R_SUMMARIES)
- scripts/summaries2table.rb $^ > $@
-
-tables/tf-summary.csv: $(TF_SUMMARIES)
- scripts/summaries2table.rb $^ > $@
+tables/lazar-summary.csv: $(CV_SUMMARY)
+ scripts/summaries2table.rb lazar > $@
-tables/lazar-summary.csv: $(LAZAR_SUMMARIES)
- scripts/summaries2table.rb $^ > $@
+tables/r-summary.csv: $(CV_SUMMARY)
+ scripts/summaries2table.rb R > $@
-tables/%.csv: $(CONFUSION_MATRICES_DIR)/%.csv
- scripts/confusion-matrix2table.rb $< > $@
+tables/tensorflow-summary.csv: $(CV_SUMMARY)
+ scripts/summaries2table.rb tensorflow > $@
-# summaries
+# crossvalidation summary
-#$(SUMMARIES_DIR)/results.json: $(SUMMARIES)
- #scripts/results.rb $^ > $@
-
-#$(SUMMARIES_DIR)/%.json: $(CONFUSION_MATRICES_DIR)/%.csv
- #scripts/confusion-matrix-summary.rb $< > $@
-
-results.yaml: $(CONFUSION_MATRICES)
+$(CV_SUMMARY): $(CONFUSION_MATRICES)
scripts/confusion-matrix-summary.rb $^ > $@
# confusion matrices
-## tensorflow
-$(CONFUSION_MATRICES_DIR)/tensorflow-selected.csv: $(TENSORFLOW_CV_DIR)/pred.sorted.csv
- scripts/cv-tensorflow-confusion-matrix.rb $< > $@
-
-$(CONFUSION_MATRICES_DIR)/tensorflow-all.csv: $(TENSORFLOW_CV_DIR)/pred_ext.sorted.csv
- scripts/cv-tensorflow-confusion-matrix.rb $< > $@
-
-$(CONFUSION_MATRICES_DIR)/tensorflow-lr.v3.csv: $(TENSORFLOW_CV_DIR)/pred.lr.v3.sorted.csv
- scripts/cv-tensorflow-confusion-matrix.rb $< > $@
-
-$(CONFUSION_MATRICES_DIR)/tensorflow-lr2.v3.csv: $(TENSORFLOW_CV_DIR)/pred.lr2.v3.sorted.csv
- scripts/cv-tensorflow-confusion-matrix.rb $< > $@
-
-$(CONFUSION_MATRICES_DIR)/tensorflow-nn.v3.csv: $(TENSORFLOW_CV_DIR)/pred.nn.v3.sorted.csv
- scripts/cv-tensorflow-confusion-matrix.rb $< > $@
-
-$(CONFUSION_MATRICES_DIR)/tensorflow-rf.v3.csv: $(TENSORFLOW_CV_DIR)/pred.rf.v3.sorted.csv
- scripts/cv-tensorflow-confusion-matrix.rb $< > $@
-
-## R
-$(CONFUSION_MATRICES_DIR)/R-SVM.csv: $(R_CV_DIR)/Sgl-Observations-SVM.csv
- scripts/cv-r-confusion-matrix.rb $< > $@
-
-$(CONFUSION_MATRICES_DIR)/R-RF.csv: $(R_CV_DIR)/Sgl-Observations-RF.csv
- scripts/cv-r-confusion-matrix.rb $< > $@
-
-$(CONFUSION_MATRICES_DIR)/R-DL.csv: $(R_CV_DIR)/Sgl-Observations-DL.csv
- scripts/cv-r-confusion-matrix.rb $< > $@
-
## lazar
$(CONFUSION_MATRICES_DIR)/lazar-all.csv: $(LAZAR_CONFUSION_MATRIX_DIR)
cp $</all $@
@@ -136,21 +83,26 @@ $(CONFUSION_MATRICES_DIR)/lazar-padel-all.csv: $(LAZAR_PADEL_CONFUSION_MATRIX_DI
$(CONFUSION_MATRICES_DIR)/lazar-padel-high-confidence.csv: $(LAZAR_PADEL_CONFUSION_MATRIX_DIR)
cp $</high_confidence $@
-# exports
+## R
+$(CONFUSION_MATRICES_DIR)/R-SVM.csv: $(R_CV_DIR)/Sgl-Observations-SVM.csv
+ scripts/cv-r-confusion-matrix.rb $< > $@
-data/mutagenicity-fingerprints.csv: $(LAZAR_DIR)/models/mutagenicity
- $(LAZAR_DIR)/bin/export-fingerprints.rb $< > $@
+$(CONFUSION_MATRICES_DIR)/R-RF.csv: $(R_CV_DIR)/Sgl-Observations-RF.csv
+ scripts/cv-r-confusion-matrix.rb $< > $@
-data/mutagenicity.csv: $(LAZAR_DIR)/models/mutagenicity/Mutagenicity-Salmonella_typhimurium.csv
- cp $< > $@
+$(CONFUSION_MATRICES_DIR)/R-DL.csv: $(R_CV_DIR)/Sgl-Observations-DL.csv
+ scripts/cv-r-confusion-matrix.rb $< > $@
-data/mutagenicity.sdf: $(LAZAR_DIR)/models/mutagenicity/Mutagenicity-Salmonella_typhimurium.csv
- $(LAZAR_DIR)/bin/export-sdf.rb $< > $@
-
-# lazar models and crossvalidations
+## tensorflow
+$(CONFUSION_MATRICES_DIR)/tensorflow-lr.v3.csv: $(TENSORFLOW_CV_DIR)/pred.lr.v3.sorted.csv
+ scripts/cv-tensorflow-confusion-matrix.rb $< > $@
-$(LAZAR_SUMMARY_DIR):
- make -C $(LAZAR_MODEL_DIR)
+$(CONFUSION_MATRICES_DIR)/tensorflow-lr2.v3.csv: $(TENSORFLOW_CV_DIR)/pred.lr2.v3.sorted.csv
+ scripts/cv-tensorflow-confusion-matrix.rb $< > $@
+
+$(CONFUSION_MATRICES_DIR)/tensorflow-nn.v3.csv: $(TENSORFLOW_CV_DIR)/pred.nn.v3.sorted.csv
+ scripts/cv-tensorflow-confusion-matrix.rb $< > $@
+
+$(CONFUSION_MATRICES_DIR)/tensorflow-rf.v3.csv: $(TENSORFLOW_CV_DIR)/pred.rf.v3.sorted.csv
+ scripts/cv-tensorflow-confusion-matrix.rb $< > $@
-$(LAZAR_PADEL_SUMMARY_DIR):
- make -C $(LAZAR_PADEL_MODEL_DIR)
diff --git a/figures/results.csv b/figures/roc.csv
index a9a3676..ac79a2e 100644
--- a/figures/results.csv
+++ b/figures/roc.csv
@@ -2,8 +2,6 @@ tpr,fpr
R-SVM,0.56,0.33
R-RF,0.56,0.29
R-DL,0.88,0.76
-TF,0.63,0.37
-TF-FS,0.61,0.36
L,0.85,0.22
L-HC,0.89,0.21
L-P,0.32,0.21
diff --git a/figures/roc.png b/figures/roc.png
index a7cb04f..4ced78d 100644
--- a/figures/roc.png
+++ b/figures/roc.png
Binary files differ
diff --git a/mutagenicity.md b/mutagenicity.md
index 418c2d1..9012ce5 100644
--- a/mutagenicity.md
+++ b/mutagenicity.md
@@ -476,6 +476,16 @@ Results
Crossvalidation results are summarized in the following tables: @tbl:lazar shows `lazar` results with MolPrint2D and PaDEL descriptors, @tbl:R summarizes R results and @tbl:tensorflow Tensorflow results.
+
+```{#tbl:lazar .table file="tables/lazar-summary.csv" caption="Summary of lazar crossvalidation results"}
+```
+
+```{#tbl:R .table file="tables/r-summary.csv" caption="Summary of R crossvalidation results"}
+```
+
+```{#tbl:tensorflow .table file="tables/tensorflow-summary.csv" caption="Summary of tensorflow crossvalidation results"}
+```
+
@fig:roc depicts the position of all crossvalidation results in receiver operating characteristic (ROC) space.
Confusion matrices for all models are available from the git repository http://git.in-silico.ch/mutagenicity-paper/10-fold-crossvalidations/confusion-matrices/, individual predictions can be found in
@@ -483,6 +493,7 @@ http://git.in-silico.ch/mutagenicity-paper/10-fold-crossvalidations/predictions/
The most accurate crossvalidation predictions have been obtained with `lazar` models with MolPrint2D descriptors ({{lazar-high-confidence.acc}} for predictions with high confidence, {{lazar-all.acc}} for all predictions). Models utilizing PaDEL descriptors have generally lower accuracies ranging from TODO to TODO. Sensitivity and specificity are generally well balanced with the exception of `lazar`-PaDEL (low sensitivity) and R deep learning (low specificity) models.
+<!--
| |R-RF | R-SVM | R-DL | TF | TF-FS | L | L-HC | L-P | L-P-HC|
|-|-----|-------|------|----|-------|---|------|------|--------|
|Accuracy|{{R-RF.acc}}|{{R-SVM.acc}}|{{R-DL.acc}}|{{tensorflow-all.acc}}|{{tensorflow-selected.acc}}|{{lazar-all.acc}}|{{lazar-high-confidence.acc}}|{{lazar-padel-all.acc}}|{{lazar-padel-high-confidence.acc}}|
@@ -496,7 +507,6 @@ The most accurate crossvalidation predictions have been obtained with `lazar` mo
![ROC plot of crossvalidation results. *R-RF*: R Random Forests, *R-SVM*: R Support Vector Machines, *R-DL*: R Deep Learning, *TF*: Tensorflow without feature selection, *TF-FS*: Tensorflow with feature selection, *L*: lazar, *L-HC*: lazar high confidence predictions, *L-P*: lazar with PaDEL descriptors, *L-P-HC*: lazar PaDEL high confidence predictions (overlaps with L-P)](figures/roc.png){#fig:roc}
-<!--
R Models
--------
diff --git a/export/pa_carcinogenicity.csv b/pyrrolizidine-alkaloids/pa_carcinogenicity.csv
index 1a3815b..1a3815b 100644
--- a/export/pa_carcinogenicity.csv
+++ b/pyrrolizidine-alkaloids/pa_carcinogenicity.csv
diff --git a/export/pa_mutagenicity.csv b/pyrrolizidine-alkaloids/pa_mutagenicity.csv
index 7b0d6ea..7b0d6ea 100644
--- a/export/pa_mutagenicity.csv
+++ b/pyrrolizidine-alkaloids/pa_mutagenicity.csv
diff --git a/scripts/confusion-matrix2table.rb b/scripts/confusion-matrix2table.rb
deleted file mode 100755
index ccb4817..0000000
--- a/scripts/confusion-matrix2table.rb
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/usr/bin/env ruby
-
-mat = []
-File.readlines(ARGV[0]).each do |l|
- mat << l.chomp.split(",")
-end
-puts ",,Predictions,"
-puts ",,mutagenic,non-mutagenic"
-puts "Measurements,mutagenic,#{mat[0][0]},#{mat[0][1]}"
-puts ",non-mutagenic,#{mat[1][0]},#{mat[1][1]}"
diff --git a/scripts/crossvalidation-summary.rb b/scripts/crossvalidation-summary.rb
deleted file mode 100755
index 13b0dfa..0000000
--- a/scripts/crossvalidation-summary.rb
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/usr/bin/env ruby
-require_relative '../../lazar/lib/lazar'
-include OpenTox
-
-summary = []
-model = Model::Validation.find(File.read(ARGV[0]).chomp).crossvalidations.each do |cv|
- summary << cv.statistics
-end
-puts JSON.pretty_generate(summary)
diff --git a/scripts/json2csv.rb b/scripts/json2csv.rb
deleted file mode 100755
index 03191de..0000000
--- a/scripts/json2csv.rb
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/env ruby
-require_relative '../../lazar/lib/lazar'
-include OpenTox
-
-results = JSON.parse File.read(ARGV[0])
-puts "SMILES,#{File.basename(ARGV[0],".json").sub("pa_","").capitalize},Probability(0),Probability(1),Nr Neighbors,Warnings"
-results.each do |id,r|
- s = Compound.find(id).smiles
- if r["value"]
- puts [
- s,
- r["value"],
- r["probabilities"]["0"],
- r["probabilities"]["1"],
- r["neighbors"].size,
- r["warnings"],
- ].join(",")
- else
- r["neighbors"] ? n = r["neighbors"].size : n = nil
- puts [
- s,
- r["value"],
- nil,
- nil,
- n,
- r["warnings"],
- ].join(",")
- end
-end
diff --git a/scripts/results.rb b/scripts/results.rb
deleted file mode 100755
index 1a36278..0000000
--- a/scripts/results.rb
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/env ruby
-require 'json'
-
-result = {}
-ARGV.each do |f|
- fname = File.basename(f,".json")
- program,algo = fname.split('-')
- case program
- when "tensorflow"
- algo == "all" ? algo = "without feature selection" : algo = "with feature selection"
- when "lazar"
- algo = "high-confidence" if algo == "high"
- if algo == "padel"
- algo = "PaDEL"
- fname.match("high") ? algo += " high-confidence" : algo += " all"
- end
- end
- result[program] ||= {}
- result[program][algo] = JSON.parse(File.read(f)).collect{|k,v| [k,v.round(2)]}.to_h
-end
-
-out = {:programs => []}
-result.keys.each do |prog|
- out[:programs] << {:name => prog, :algos => []}
- result[prog].keys.each do |algo|
- r = result[prog][algo].dup
- result[prog][algo].each do |k,v|
- r[k+"_perc"] = (v*100).round
- end
- r[:name] = algo
- r[:abbrev] = prog+"-"+algo
- out[:programs].last[:algos] << r
- end
-end
-
-puts out.to_json
diff --git a/scripts/roc.R b/scripts/roc.R
index cb219fc..afc8293 100755
--- a/scripts/roc.R
+++ b/scripts/roc.R
@@ -1,6 +1,6 @@
#!/usr/bin/env Rscript
library(ggplot2)
-data <- read.csv("figures/results.csv",header=T)
+data <- read.csv("figures/roc.csv",header=T)
p <- ggplot(data, aes(x=fpr, y=tpr)) + geom_abline()
p <- p + geom_label(label=rownames(data) )
p <- p + expand_limits(x=c(0,1),y=c(0,1))
diff --git a/scripts/summaries2table.rb b/scripts/summaries2table.rb
index 5470b26..f98ec54 100755
--- a/scripts/summaries2table.rb
+++ b/scripts/summaries2table.rb
@@ -1,19 +1,23 @@
#!/usr/bin/env ruby
-require 'json'
+require 'yaml'
-results = {}
+rows = {:acc => "Accuracy", :tpr => "True positive rate/Sensitivity", :tnr => "True negative rate/Specificity", :ppv => "Positive predictive value/Precision", :npv => "Negative predictive value", :n => "Nr. predictions"}
+data = YAML.load_file "10-fold-crossvalidations/summary.yaml"
-ARGV.each do |f|
- results[File.basename(f,".json")] = JSON.parse(File.read(f))
+case ARGV[0] # program name selects the column headers and the matching summary.yaml keys
+when "R"
+ header = ["RF","SVM","DL"]
+ keys = header.collect{|h| "R-"+h}
+when "tensorflow"
+ header = ["RF","LR (SGD)","LR (SCIKIT)","NN"]
+ keys = ["rf","lr","lr2","nn"].collect{|n| "tensorflow-"+n+".v3"} # one key per header column, in header order
+when "lazar"
+ header = ["lazar-mp2d (all)","lazar-mp2d (high confidence)", "lazar-padel (all)","lazar-padel (high confidence)"]
+ keys = ["lazar-all","lazar-high-confidence", "lazar-padel-all","lazar-padel-high-confidence"]
end
-
-print ","
-puts results.keys.collect{|k| k.sub("tensorflow","TF")}.join(",")
-["accuracy","true_positive_rate","true_negative_rate","positive_predictive_value","negative_predictive_value"].each do |m|
- line = [m.gsub("_"," ")]
- results.each do |k,v|
- line << v[m].round(2)
- end
- puts line.join(",")
+puts ","+header.join(",")
+rows.each do |short,long|
+ print long+","
+ puts keys.collect{|k| data[k][short]}.join(",")
end
-
+exit
diff --git a/scripts/mp2d-tsne.R b/scripts/tsne-mp2d.R
index 0877622..0877622 100755
--- a/scripts/mp2d-tsne.R
+++ b/scripts/tsne-mp2d.R
diff --git a/scripts/padel-tsne.R b/scripts/tsne-padel.R
index b8e9763..b8e9763 100755
--- a/scripts/padel-tsne.R
+++ b/scripts/tsne-padel.R
diff --git a/tables/R-DL.csv b/tables/R-DL.csv
deleted file mode 100644
index d622e0e..0000000
--- a/tables/R-DL.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-,,Predictions,
-,,mutagenic,non-mutagenic
-Measurements,mutagenic,3517,3099
-,non-mutagenic,483,971
diff --git a/tables/R-RF.csv b/tables/R-RF.csv
deleted file mode 100644
index d81cff6..0000000
--- a/tables/R-RF.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-,,Predictions,
-,,mutagenic,non-mutagenic
-Measurements,mutagenic,2259,1173
-,non-mutagenic,1741,2897
diff --git a/tables/R-SVM.csv b/tables/R-SVM.csv
deleted file mode 100644
index 9aaf85f..0000000
--- a/tables/R-SVM.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-,,Predictions,
-,,mutagenic,non-mutagenic
-Measurements,mutagenic,2243,1353
-,non-mutagenic,1757,2717
diff --git a/tables/lazar-all.csv b/tables/lazar-all.csv
deleted file mode 100644
index c4db6a1..0000000
--- a/tables/lazar-all.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-,,Predictions,
-,,mutagenic,non-mutagenic
-Measurements,mutagenic,3326,833
-,non-mutagenic,583,3039
diff --git a/tables/lazar-high-confidence.csv b/tables/lazar-high-confidence.csv
deleted file mode 100644
index 049c73b..0000000
--- a/tables/lazar-high-confidence.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-,,Predictions,
-,,mutagenic,non-mutagenic
-Measurements,mutagenic,2816,571
-,non-mutagenic,365,2138
diff --git a/tables/lazar-padel-all.csv b/tables/lazar-padel-all.csv
deleted file mode 100644
index e2436f7..0000000
--- a/tables/lazar-padel-all.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-,,Predictions,
-,,mutagenic,non-mutagenic
-Measurements,mutagenic,593,466
-,non-mutagenic,1253,1777
diff --git a/tables/lazar-padel-high-confidence.csv b/tables/lazar-padel-high-confidence.csv
deleted file mode 100644
index 1b551d9..0000000
--- a/tables/lazar-padel-high-confidence.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-,,Predictions,
-,,mutagenic,non-mutagenic
-Measurements,mutagenic,593,466
-,non-mutagenic,1251,1771
diff --git a/tables/lazar-summary.csv b/tables/lazar-summary.csv
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tables/lazar-summary.csv
diff --git a/tables/r-summary.csv b/tables/r-summary.csv
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tables/r-summary.csv
diff --git a/tables/tensorflow-all.csv b/tables/tensorflow-all.csv
deleted file mode 100644
index afd74c5..0000000
--- a/tables/tensorflow-all.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-,,Predictions,
-,,mutagenic,non-mutagenic
-Measurements,mutagenic,2507,1528
-,non-mutagenic,1495,2550
diff --git a/tables/tensorflow-selected.csv b/tables/tensorflow-selected.csv
deleted file mode 100644
index 6c0f6e5..0000000
--- a/tables/tensorflow-selected.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-,,Predictions,
-,,mutagenic,non-mutagenic
-Measurements,mutagenic,2453,1454
-,non-mutagenic,1549,2624
diff --git a/tables/tensorflow-summary.csv b/tables/tensorflow-summary.csv
new file mode 100644
index 0000000..bbd4885
--- /dev/null
+++ b/tables/tensorflow-summary.csv
@@ -0,0 +1,7 @@
+,LR (SGD),LR (SCIKIT),NN
+Accuracy,0.62,0.63,0.63
+True positive rate/Sensitivity,0.6,0.62,0.61
+True negative rate/Specificity,0.65,0.63,0.64
+Positive predictive value/Precision,0.63,0.62,0.63
+Negative predictive value,0.62,0.63,0.63
+Nr. predictions,8080,8080,8080