summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2019-10-21 17:29:52 +0200
committer Christoph Helma <helma@in-silico.ch> 2019-10-21 17:29:52 +0200
commit 93f2fb17788b9d02b00935e0d1be7cd1d81ff555 (patch)
tree 95ea869bf48bd41bb0d6d341e6cee7f3e01d2c81
parent 1035124b854e21998d3fd9de4935780a19a2d3d3 (diff)
mustache preprocessing
-rw-r--r--10-fold-crossvalidations/confusion-matrices/tensorflow-all.csv4
-rw-r--r--10-fold-crossvalidations/confusion-matrices/tensorflow-selected.csv4
-rw-r--r--10-fold-crossvalidations/summaries/R-DL.json (renamed from 10-fold-crossvalidations/summaries/R-DL.csv)0
-rw-r--r--10-fold-crossvalidations/summaries/R-RF.json (renamed from 10-fold-crossvalidations/summaries/R-RF.csv)0
-rw-r--r--10-fold-crossvalidations/summaries/R-SVM.json (renamed from 10-fold-crossvalidations/summaries/R-SVM.csv)0
-rw-r--r--10-fold-crossvalidations/summaries/lazar-all.json (renamed from 10-fold-crossvalidations/summaries/lazar-all.csv)0
-rw-r--r--10-fold-crossvalidations/summaries/lazar-high-confidence.json (renamed from 10-fold-crossvalidations/summaries/lazar-high-confidence.csv)0
-rw-r--r--10-fold-crossvalidations/summaries/lazar-padel-all.json (renamed from 10-fold-crossvalidations/summaries/lazar-padel-all.csv)0
-rw-r--r--10-fold-crossvalidations/summaries/lazar-padel-high-confidence.json (renamed from 10-fold-crossvalidations/summaries/lazar-padel-high-confidence.csv)0
-rw-r--r--10-fold-crossvalidations/summaries/results.json1
-rw-r--r--10-fold-crossvalidations/summaries/tensorflow-all.csv1
-rw-r--r--10-fold-crossvalidations/summaries/tensorflow-all.json1
-rw-r--r--10-fold-crossvalidations/summaries/tensorflow-selected.csv1
-rw-r--r--10-fold-crossvalidations/summaries/tensorflow-selected.json1
-rw-r--r--Makefile52
-rw-r--r--mutagenicity.md139
-rwxr-xr-xscripts/confusion-matrix2table.rb10
-rwxr-xr-xscripts/cv-tensorflow-confusion-matrix.rb2
-rwxr-xr-xscripts/results.rb36
-rwxr-xr-xscripts/summaries2table.rb19
20 files changed, 176 insertions, 95 deletions
diff --git a/10-fold-crossvalidations/confusion-matrices/tensorflow-all.csv b/10-fold-crossvalidations/confusion-matrices/tensorflow-all.csv
index 329eae9..f4b80d7 100644
--- a/10-fold-crossvalidations/confusion-matrices/tensorflow-all.csv
+++ b/10-fold-crossvalidations/confusion-matrices/tensorflow-all.csv
@@ -1,2 +1,2 @@
-1991,2044
-2011,2034
+2507,1528
+1495,2550
diff --git a/10-fold-crossvalidations/confusion-matrices/tensorflow-selected.csv b/10-fold-crossvalidations/confusion-matrices/tensorflow-selected.csv
index 93b18af..9d5102e 100644
--- a/10-fold-crossvalidations/confusion-matrices/tensorflow-selected.csv
+++ b/10-fold-crossvalidations/confusion-matrices/tensorflow-selected.csv
@@ -1,2 +1,2 @@
-1928,1979
-2074,2099
+2453,1454
+1549,2624
diff --git a/10-fold-crossvalidations/summaries/R-DL.csv b/10-fold-crossvalidations/summaries/R-DL.json
index 8a48d30..8a48d30 100644
--- a/10-fold-crossvalidations/summaries/R-DL.csv
+++ b/10-fold-crossvalidations/summaries/R-DL.json
diff --git a/10-fold-crossvalidations/summaries/R-RF.csv b/10-fold-crossvalidations/summaries/R-RF.json
index ab7d6e7..ab7d6e7 100644
--- a/10-fold-crossvalidations/summaries/R-RF.csv
+++ b/10-fold-crossvalidations/summaries/R-RF.json
diff --git a/10-fold-crossvalidations/summaries/R-SVM.csv b/10-fold-crossvalidations/summaries/R-SVM.json
index a038447..a038447 100644
--- a/10-fold-crossvalidations/summaries/R-SVM.csv
+++ b/10-fold-crossvalidations/summaries/R-SVM.json
diff --git a/10-fold-crossvalidations/summaries/lazar-all.csv b/10-fold-crossvalidations/summaries/lazar-all.json
index e68ff79..e68ff79 100644
--- a/10-fold-crossvalidations/summaries/lazar-all.csv
+++ b/10-fold-crossvalidations/summaries/lazar-all.json
diff --git a/10-fold-crossvalidations/summaries/lazar-high-confidence.csv b/10-fold-crossvalidations/summaries/lazar-high-confidence.json
index a9f852e..a9f852e 100644
--- a/10-fold-crossvalidations/summaries/lazar-high-confidence.csv
+++ b/10-fold-crossvalidations/summaries/lazar-high-confidence.json
diff --git a/10-fold-crossvalidations/summaries/lazar-padel-all.csv b/10-fold-crossvalidations/summaries/lazar-padel-all.json
index d8ce18a..d8ce18a 100644
--- a/10-fold-crossvalidations/summaries/lazar-padel-all.csv
+++ b/10-fold-crossvalidations/summaries/lazar-padel-all.json
diff --git a/10-fold-crossvalidations/summaries/lazar-padel-high-confidence.csv b/10-fold-crossvalidations/summaries/lazar-padel-high-confidence.json
index 7ec0b1e..7ec0b1e 100644
--- a/10-fold-crossvalidations/summaries/lazar-padel-high-confidence.csv
+++ b/10-fold-crossvalidations/summaries/lazar-padel-high-confidence.json
diff --git a/10-fold-crossvalidations/summaries/results.json b/10-fold-crossvalidations/summaries/results.json
new file mode 100644
index 0000000..033c728
--- /dev/null
+++ b/10-fold-crossvalidations/summaries/results.json
@@ -0,0 +1 @@
+{"programs":[{"name":"R","algos":[{"accuracy":0.61,"true_positive_rate":0.56,"true_negative_rate":0.67,"positive_predictive_value":0.62,"negative_predictive_value":0.61,"accuracy_perc":61,"true_positive_rate_perc":56,"true_negative_rate_perc":67,"positive_predictive_value_perc":62,"negative_predictive_value_perc":61,"name":"SVM","abbrev":"R-SVM"},{"accuracy":0.64,"true_positive_rate":0.56,"true_negative_rate":0.71,"positive_predictive_value":0.66,"negative_predictive_value":0.62,"accuracy_perc":64,"true_positive_rate_perc":56,"true_negative_rate_perc":71,"positive_predictive_value_perc":66,"negative_predictive_value_perc":62,"name":"RF","abbrev":"R-RF"},{"accuracy":0.56,"true_positive_rate":0.88,"true_negative_rate":0.24,"positive_predictive_value":0.53,"negative_predictive_value":0.67,"accuracy_perc":56,"true_positive_rate_perc":88,"true_negative_rate_perc":24,"positive_predictive_value_perc":53,"negative_predictive_value_perc":67,"name":"DL","abbrev":"R-DL"}]},{"name":"tensorflow","algos":[{"accuracy":0.63,"true_positive_rate":0.63,"true_negative_rate":0.63,"positive_predictive_value":0.62,"negative_predictive_value":0.63,"accuracy_perc":63,"true_positive_rate_perc":63,"true_negative_rate_perc":63,"positive_predictive_value_perc":62,"negative_predictive_value_perc":63,"name":"without feature selection","abbrev":"tensorflow-without feature selection"},{"accuracy":0.63,"true_positive_rate":0.61,"true_negative_rate":0.64,"positive_predictive_value":0.63,"negative_predictive_value":0.63,"accuracy_perc":63,"true_positive_rate_perc":61,"true_negative_rate_perc":64,"positive_predictive_value_perc":63,"negative_predictive_value_perc":63,"name":"with feature selection","abbrev":"tensorflow-with feature 
selection"}]},{"name":"lazar","algos":[{"accuracy":0.82,"true_positive_rate":0.85,"true_negative_rate":0.78,"positive_predictive_value":0.8,"negative_predictive_value":0.84,"accuracy_perc":82,"true_positive_rate_perc":85,"true_negative_rate_perc":78,"positive_predictive_value_perc":80,"negative_predictive_value_perc":84,"name":"all","abbrev":"lazar-all"},{"accuracy":0.84,"true_positive_rate":0.89,"true_negative_rate":0.79,"positive_predictive_value":0.83,"negative_predictive_value":0.85,"accuracy_perc":84,"true_positive_rate_perc":89,"true_negative_rate_perc":79,"positive_predictive_value_perc":83,"negative_predictive_value_perc":85,"name":"high-confidence","abbrev":"lazar-high-confidence"},{"accuracy":0.58,"true_positive_rate":0.32,"true_negative_rate":0.79,"positive_predictive_value":0.56,"negative_predictive_value":0.59,"accuracy_perc":58,"true_positive_rate_perc":32,"true_negative_rate_perc":79,"positive_predictive_value_perc":56,"negative_predictive_value_perc":59,"name":"PaDEL all","abbrev":"lazar-PaDEL all"},{"accuracy":0.58,"true_positive_rate":0.32,"true_negative_rate":0.79,"positive_predictive_value":0.56,"negative_predictive_value":0.59,"accuracy_perc":58,"true_positive_rate_perc":32,"true_negative_rate_perc":79,"positive_predictive_value_perc":56,"negative_predictive_value_perc":59,"name":"PaDEL high-confidence","abbrev":"lazar-PaDEL high-confidence"}]}]}
diff --git a/10-fold-crossvalidations/summaries/tensorflow-all.csv b/10-fold-crossvalidations/summaries/tensorflow-all.csv
deleted file mode 100644
index 804b900..0000000
--- a/10-fold-crossvalidations/summaries/tensorflow-all.csv
+++ /dev/null
@@ -1 +0,0 @@
-{"accuracy":0.49814356435643564,"true_positive_rate":0.49750124937531237,"true_negative_rate":0.49877390877881317,"positive_predictive_value":0.49343246592317225,"negative_predictive_value":0.5028430160692212}
diff --git a/10-fold-crossvalidations/summaries/tensorflow-all.json b/10-fold-crossvalidations/summaries/tensorflow-all.json
new file mode 100644
index 0000000..a605a4d
--- /dev/null
+++ b/10-fold-crossvalidations/summaries/tensorflow-all.json
@@ -0,0 +1 @@
+{"accuracy":0.6258663366336633,"true_positive_rate":0.6264367816091954,"true_negative_rate":0.6253065228052967,"positive_predictive_value":0.6213135068153656,"negative_predictive_value":0.630407911001236}
diff --git a/10-fold-crossvalidations/summaries/tensorflow-selected.csv b/10-fold-crossvalidations/summaries/tensorflow-selected.csv
deleted file mode 100644
index 321dfc3..0000000
--- a/10-fold-crossvalidations/summaries/tensorflow-selected.csv
+++ /dev/null
@@ -1 +0,0 @@
-{"accuracy":0.4983910891089109,"true_positive_rate":0.4817591204397801,"true_negative_rate":0.5147130946542423,"positive_predictive_value":0.493473253135398,"negative_predictive_value":0.5029954469206805}
diff --git a/10-fold-crossvalidations/summaries/tensorflow-selected.json b/10-fold-crossvalidations/summaries/tensorflow-selected.json
new file mode 100644
index 0000000..93c54ef
--- /dev/null
+++ b/10-fold-crossvalidations/summaries/tensorflow-selected.json
@@ -0,0 +1 @@
+{"accuracy":0.6283415841584158,"true_positive_rate":0.612943528235882,"true_negative_rate":0.6434526728788622,"positive_predictive_value":0.6278474532889685,"negative_predictive_value":0.6288042175892643}
diff --git a/Makefile b/Makefile
index 421d75e..a95eb32 100644
--- a/Makefile
+++ b/Makefile
@@ -1,16 +1,16 @@
# Manuscript
-# please install pandoc-scholar (https://github.com/pandoc-scholar/pandoc-scholar) in the pandoc-scholar directory or point PANDOC_SCHOLAR_PATH to your installation
+# Requirements:
+# pandoc-scholar (https://github.com/pandoc-scholar/pandoc-scholar) in PANDOC_SCHOLAR_PATH
+# pandoc-placetable (https://github.com/mb21/pandoc-placetable)
-ARTICLE_FILE = mutagenicity.md
+ARTICLE_FILE = mutagenicity.mustache.md
PANDOC_SCHOLAR_PATH = pandoc-scholar
OUTFILE_PREFIX = mutagenicity
-DEFAULT_EXTENSIONS = latex pdf docx #odt epub html
-PANDOC_WRITER_OPTIONS = --filter=pandoc-citeproc
-#PANDOC_WRITER_OPTIONS = --filter=pandoc-placetable --filter=pandoc-citeproc
+DEFAULT_EXTENSIONS = pdf #latex docx html #odt epub
+#PANDOC_WRITER_OPTIONS = --filter=panpipe --filter=pandoc-placetable --filter=pandoc-citeproc -M tmpvar=test
+PANDOC_WRITER_OPTIONS = --filter=pandoc-crossref --filter=pandoc-placetable --filter=pandoc-citeproc
TEMPLATE_FILE_LATEX = pandoc-scholar.latex
-include $(PANDOC_SCHOLAR_PATH)/Makefile
-
# Lazar
LAZAR_DIR = ../lazar
@@ -28,29 +28,55 @@ CONFUSION_MATRICES_DIR = 10-fold-crossvalidations/confusion-matrices
R_CV_DIR = 10-fold-crossvalidations/R
TENSORFLOW_CV_DIR = 10-fold-crossvalidations/tensorflow
-#tables = tables/R-SVM.csv tables/R-RF.csv tables/R-DL.csv
+TABLES = tables/r-summary.csv tables/tf-summary.csv tables/lazar-summary.csv tables/R-SVM.csv tables/R-RF.csv tables/R-DL.csv tables/tensorflow-all.csv tables/tensorflow-selected.csv tables/lazar-all.csv tables/lazar-high-confidence.csv tables/lazar-padel-all.csv tables/lazar-padel-high-confidence.csv
+
+R_SUMMARIES = $(SUMMARIES_DIR)/R-SVM.json $(SUMMARIES_DIR)/R-RF.json $(SUMMARIES_DIR)/R-DL.json
+TF_SUMMARIES = $(SUMMARIES_DIR)/tensorflow-all.json $(SUMMARIES_DIR)/tensorflow-selected.json
+LAZAR_SUMMARIES = $(SUMMARIES_DIR)/lazar-all.json $(SUMMARIES_DIR)/lazar-high-confidence.json $(SUMMARIES_DIR)/lazar-padel-all.json $(SUMMARIES_DIR)/lazar-padel-high-confidence.json
-SUMMARIES = $(SUMMARIES_DIR)/R-SVM.csv $(SUMMARIES_DIR)/R-RF.csv $(SUMMARIES_DIR)/R-DL.csv $(SUMMARIES_DIR)/tensorflow-all.csv $(SUMMARIES_DIR)/tensorflow-selected.csv $(SUMMARIES_DIR)/lazar-all.csv $(SUMMARIES_DIR)/lazar-high-confidence.csv $(SUMMARIES_DIR)/lazar-padel-all.csv $(SUMMARIES_DIR)/lazar-padel-high-confidence.csv
+SUMMARIES = $(R_SUMMARIES) $(TF_SUMMARIES) $(LAZAR_SUMMARIES)
CONFUSION_MATRICES = $(CONFUSION_MATRICES_DIR)/R-SVM.csv $(CONFUSION_MATRICES_DIR)/R-RF.csv $(CONFUSION_MATRICES_DIR)/R-DL.csv $(CONFUSION_MATRICES_DIR)/tensorflow-all.csv $(CONFUSION_MATRICES_DIR)/tensorflow-selected.csv $(CONFUSION_MATRICES_DIR)/lazar-all.csv $(CONFUSION_MATRICES_DIR)/lazar-high-confidence.csv $(CONFUSION_MATRICES_DIR)/lazar-padel-all.csv $(CONFUSION_MATRICES_DIR)/lazar-padel-high-confidence.csv
DATA = data/mutagenicity.sdf data/mutagenicity.csv data/mutagenicity-fingerprints.csv
-all: $(SUMMARIES) $(DATA) #$(tables)
+all: $(DATA) $(TABLES) mutagenicity.pdf $(SUMMARIES_DIR)/results.json
+#all: $(SUMMARIES) $(DATA) $(TABLES) mutagenicity.pdf
+include $(PANDOC_SCHOLAR_PATH)/Makefile
export: $(DATA)
+mutagenicity.mustache.md: $(SUMMARIES_DIR)/results.json mutagenicity.md
+ mustache $^ > $@
+
+# tables
+
+tables/r-summary.csv: $(R_SUMMARIES)
+ scripts/summaries2table.rb $^ > $@
+
+tables/tf-summary.csv: $(TF_SUMMARIES)
+ scripts/summaries2table.rb $^ > $@
+
+tables/lazar-summary.csv: $(LAZAR_SUMMARIES)
+ scripts/summaries2table.rb $^ > $@
+
+tables/%.csv: $(CONFUSION_MATRICES_DIR)/%.csv
+ scripts/confusion-matrix2table.rb $< > $@
+
# summaries
-$(SUMMARIES_DIR)/%.csv: $(CONFUSION_MATRICES_DIR)/%.csv
+$(SUMMARIES_DIR)/results.json: $(SUMMARIES)
+ scripts/results.rb $^ > $@
+
+$(SUMMARIES_DIR)/%.json: $(CONFUSION_MATRICES_DIR)/%.csv
scripts/confusion-matrix-summary.rb $< > $@
# confusion matrices
## tensorflow
-$(CONFUSION_MATRICES_DIR)/tensorflow-selected.csv: $(TENSORFLOW_CV_DIR)/pred.csv
+$(CONFUSION_MATRICES_DIR)/tensorflow-selected.csv: $(TENSORFLOW_CV_DIR)/pred.sorted.csv
scripts/cv-tensorflow-confusion-matrix.rb $< > $@
-$(CONFUSION_MATRICES_DIR)/tensorflow-all.csv: $(TENSORFLOW_CV_DIR)/pred_ext.csv
+$(CONFUSION_MATRICES_DIR)/tensorflow-all.csv: $(TENSORFLOW_CV_DIR)/pred_ext.sorted.csv
scripts/cv-tensorflow-confusion-matrix.rb $< > $@
## R
diff --git a/mutagenicity.md b/mutagenicity.md
index bf4f6d1..2f80bad 100644
--- a/mutagenicity.md
+++ b/mutagenicity.md
@@ -134,8 +134,8 @@ of a compound can be constructed that can be used to calculate chemical
similarities.
The chemical similarity between two compounds a and b is expressed as
-the proportion between atom environments common in both structures A ∩ B
-and the total number of atom environments A U B (Jaccard/Tanimoto
+the proportion between atom environments common in both structures $A \cap B$
+and the total number of atom environments $A \cup B$ (Jaccard/Tanimoto
index).
$$sim = \frac{\left| A\ \cap B \right|}{\left| A\ \cup B \right|}$$
@@ -335,117 +335,106 @@ Validation
Results
=======
-`lazar`
------
+{{#programs}}
+{{name}} Models
+--------
+{{#algos}}
-Random Forest
--------------
+### {{name}}
-The validation showed that the RF model has an accuracy of 64%, a
-sensitivity of 66% and a specificity of 63%. The confusion matrix of the
+10-fold crossvalidation of the {{abbrev}} model gave an accuracy of
+{{accuracy_perc}}%,
+a sensitivity of
+{{true_positive_rate_perc}}%
+and a specificity of
+{{true_negative_rate_perc}}%.
+The confusion matrix of the
model, calculated for 8080 instances, is provided in Table 1.
-Table 1: Confusion matrix of the RF model
+```{.table file="tables/R-RF.csv" caption="Confusion matrix for R Random Forest predictions"}
+```
+{{/algos}}
+{{/programs}}
- Predicted genotoxicity
- ----------------------- ------------------------ ---------- ---------- -------------
- Measured genotoxicity ***PP*** ***PN*** ***Total***
- ***TP*** 2274 1163 3437
- ***TN*** 1736 2907 4643
- ***Total*** 4010 4070 8080
+R Models
+--------
-PP: Predicted positive; PN: Predicted negative, TP: True positive, TN:
-True negative
+### Random Forest
-Support Vector Machines
------------------------
+The validation showed that the RF model has an accuracy of
+{{R-RF.accuracy}}%
+`cat /home/ch/src/mutagenicity-paper/10-fold-crossvalidations/summaries/R-RF.json|jq '.accuracy * 100 | round'`{pipe="sh"}%,
+a sensitivity of
+`cat /home/ch/src/mutagenicity-paper/10-fold-crossvalidations/summaries/R-RF.json|jq '.true_positive_rate * 100 | round'`{pipe="sh"}%,
+and a specificity of
+`cat /home/ch/src/mutagenicity-paper/10-fold-crossvalidations/summaries/R-RF.json|jq '.true_negative_rate * 100 | round'`{pipe="sh"}%.
+The confusion matrix of the
+model, calculated for 8080 instances, is provided in Table 1.
+
+```{.table file="tables/R-RF.csv" caption="Confusion matrix for R Random Forest predictions"}
+```
+
+### Support Vector Machines
The validation showed that the SVM model has an accuracy of 62%, a
sensitivity of 65% and a specificity of 60%. The confusion matrix of the SVM
model, calculated for 8080 instances, is provided in Table 2.
-Table 2: Confusion matrix of the SVM model
-
- Predicted genotoxicity
- ----------------------- ------------------------ ---------- ---------- -------------
- Measured genotoxicity ***PP*** ***PN*** ***Total***
- ***TP*** 2057 1107 3164
- ***TN*** 1953 2963 4916
- ***Total*** 4010 4070 8080
-PP: Predicted positive; PN: Predicted negative, TP: True positive, TN:
-True negative
+```{.table file="tables/R-SVM.csv" caption="Confusion matrix for R Support Vector Machine predictions"}
+```
-Deep Learning (R-project)
--------------------------
+### Deep Learning
The validation showed that the DL model generated in R has an accuracy
of 59%, a sensitivity of 89% and a specificity of 30%. The confusion
matrix of the model, normalised to 8080 instances, is provided in Table
3.
-Table 3: Confusion matrix of the DL model (R-project)
+```{.table file="tables/R-DL.csv" caption="Confusion matrix for R Deep Learning predictions"}
+```
- Predicted genotoxicity
- ----------------------- ------------------------ ---------- ---------- -------------
- Measured genotoxicity ***PP*** ***PN*** ***Total***
- ***TP*** 3575 435 4010
- ***TN*** 2853 1217 4070
- ***Total*** 6428 1652 8080
+```{.table file="tables/r-summary.csv" caption="Summary of R model validations"}
+```
-PP: Predicted positive; PN: Predicted negative, TP: True positive, TN:
-True negative
-
-DL model (TensorFlow)
----------------------
+TensorFlow Models
+-----------------
The validation showed that the DL model generated in TensorFlow has an
accuracy of 68%, a sensitivity of 70% and a specificity of 46%. The
confusion matrix of the model, normalised to 8080 instances, is provided
in Table 4.
-Table 4: Confusion matrix of the DL model (TensorFlow)
-
- Predicted genotoxicity
- ----------------------- ------------------------ ---------- ---------- -------------
- Measured genotoxicity ***PP*** ***PN*** ***Total***
- ***TP*** 2851 1227 4078
- ***TN*** 1825 2177 4002
- ***Total*** 4676 3404 8080
-
-PP: Predicted positive; PN: Predicted negative, TP: True positive, TN:
-True negative
-
-The ROC curves from the 6-fold validation are shown in Figure 7.
+```{.table file="tables/tensorflow-all.csv" caption="Confusion matrix for TensorFlow predictions without variable selection"}
+```
-![](figures/image7.png){width="3.825in"
-height="2.7327045056867894in"}
+```{.table file="tables/tensorflow-selected.csv" caption="Confusion matrix for TensorFlow predictions with variable selection"}
+```
-Figure 7: Six-fold cross-validation of TensorFlow DL model show an
-average area under the ROC-curve (ROC-AUC; measure of accuracy) of 68%.
+```{.table file="tables/tf-summary.csv" caption="Summary of TensorFlow model validations"}
+```
-In summary, the validation results of the four methods are presented in
-the following table.
+`lazar` Models
+--------------
-Table 5 Results of the cross-validation of the four models and after
-y-randomisation
+### MolPrint2D Descriptors
- ----------------------------------------------------------------------
- Accuracy CCR Sensitivity Specificity
- ----------------------- ---------- ------- ------------- -------------
- RF model 64.1% 64.4% 66.2% 62.6%
+```{.table file="tables/lazar-all.csv" caption="Confusion matrix for lazar predictions with MolPrint2D descriptors"}
+```
- SVM model 62.1% 62.6% 65.0% 60.3%
+```{.table file="tables/lazar-high-confidence.csv" caption="Confusion matrix for high confidence lazar predictions with MolPrint2D descriptors"}
+```
- DL model\ 59.3% 59.5% 89.2% 29.9%
- (R-project)
+### PaDEL Descriptors
- DL model (TensorFlow) 68% 62.2% 69.9% 45.6%
+```{.table file="tables/lazar-padel-all.csv" caption="Confusion matrix for lazar predictions with PaDEL descriptors"}
+```
- y-randomisation 50.5% 50.4% 50.3% 50.6%
- ----------------------------------------------------------------------
+```{.table file="tables/lazar-padel-high-confidence.csv" caption="Confusion matrix for high confidence lazar predictions with PaDEL descriptors"}
+```
-CCR (correct classification rate)
+```{.table file="tables/lazar-summary.csv" caption="Summary of lazar model validations"}
+```
Discussion
==========
diff --git a/scripts/confusion-matrix2table.rb b/scripts/confusion-matrix2table.rb
new file mode 100755
index 0000000..ccb4817
--- /dev/null
+++ b/scripts/confusion-matrix2table.rb
@@ -0,0 +1,10 @@
+#!/usr/bin/env ruby
+
+mat = []
+File.readlines(ARGV[0]).each do |l|
+ mat << l.chomp.split(",")
+end
+puts ",,Predictions,"
+puts ",,mutagenic,non-mutagenic"
+puts "Measurements,mutagenic,#{mat[0][0]},#{mat[0][1]}"
+puts ",non-mutagenic,#{mat[1][0]},#{mat[1][1]}"
diff --git a/scripts/cv-tensorflow-confusion-matrix.rb b/scripts/cv-tensorflow-confusion-matrix.rb
index 067519b..2b0ee58 100755
--- a/scripts/cv-tensorflow-confusion-matrix.rb
+++ b/scripts/cv-tensorflow-confusion-matrix.rb
@@ -7,7 +7,7 @@ tn = 0
fn = 0
pred = CSV.read(ARGV[0],headers: true,:col_sep => ",")
-act = CSV.read(File.join(File.dirname(ARGV[0]),"GenoTox-database.csv"),headers: true,:col_sep => ",")
+act = CSV.read(File.join("data","GenoTox-database.csv"),headers: true,:col_sep => ",")
pred.each_with_index do |row,i|
diff --git a/scripts/results.rb b/scripts/results.rb
new file mode 100755
index 0000000..1a36278
--- /dev/null
+++ b/scripts/results.rb
@@ -0,0 +1,36 @@
+#!/usr/bin/env ruby
+require 'json'
+
+result = {}
+ARGV.each do |f|
+ fname = File.basename(f,".json")
+ program,algo = fname.split('-')
+ case program
+ when "tensorflow"
+ algo == "all" ? algo = "without feature selection" : algo = "with feature selection"
+ when "lazar"
+ algo = "high-confidence" if algo == "high"
+ if algo == "padel"
+ algo = "PaDEL"
+ fname.match("high") ? algo += " high-confidence" : algo += " all"
+ end
+ end
+ result[program] ||= {}
+ result[program][algo] = JSON.parse(File.read(f)).collect{|k,v| [k,v.round(2)]}.to_h
+end
+
+out = {:programs => []}
+result.keys.each do |prog|
+ out[:programs] << {:name => prog, :algos => []}
+ result[prog].keys.each do |algo|
+ r = result[prog][algo].dup
+ result[prog][algo].each do |k,v|
+ r[k+"_perc"] = (v*100).round
+ end
+ r[:name] = algo
+ r[:abbrev] = prog+"-"+algo
+ out[:programs].last[:algos] << r
+ end
+end
+
+puts out.to_json
diff --git a/scripts/summaries2table.rb b/scripts/summaries2table.rb
new file mode 100755
index 0000000..5470b26
--- /dev/null
+++ b/scripts/summaries2table.rb
@@ -0,0 +1,19 @@
+#!/usr/bin/env ruby
+require 'json'
+
+results = {}
+
+ARGV.each do |f|
+ results[File.basename(f,".json")] = JSON.parse(File.read(f))
+end
+
+print ","
+puts results.keys.collect{|k| k.sub("tensorflow","TF")}.join(",")
+["accuracy","true_positive_rate","true_negative_rate","positive_predictive_value","negative_predictive_value"].each do |m|
+ line = [m.gsub("_"," ")]
+ results.each do |k,v|
+ line << v[m].round(2)
+ end
+ puts line.join(",")
+end
+