diff options
author | Christoph Helma <helma@in-silico.ch> | 2020-10-10 17:05:41 +0200 |
---|---|---|
committer | Christoph Helma <helma@in-silico.ch> | 2020-10-10 17:05:41 +0200 |
commit | e451d812f3b63d1987c8f1e7f5557156fdab984f (patch) | |
tree | f5b4e1730f0b75593925b3287d3a37fa70fa507e | |
parent | 23ce84a7da69104fa763d5a3911b7b0ad98fbdbc (diff) |
Makefile and scripts cleanup; lazar, R and tensorflow tables
37 files changed, 79 insertions, 272 deletions
diff --git a/10-fold-crossvalidations/summaries/R-DL.json b/10-fold-crossvalidations/summaries/R-DL.json deleted file mode 100644 index 8a48d30..0000000 --- a/10-fold-crossvalidations/summaries/R-DL.json +++ /dev/null @@ -1 +0,0 @@ -{"accuracy":0.5561338289962825,"true_positive_rate":0.87925,"true_negative_rate":0.23857493857493858,"positive_predictive_value":0.531590084643289,"negative_predictive_value":0.6678129298486932} diff --git a/10-fold-crossvalidations/summaries/R-RF.json b/10-fold-crossvalidations/summaries/R-RF.json deleted file mode 100644 index ab7d6e7..0000000 --- a/10-fold-crossvalidations/summaries/R-RF.json +++ /dev/null @@ -1 +0,0 @@ -{"accuracy":0.638909541511772,"true_positive_rate":0.56475,"true_negative_rate":0.7117936117936118,"positive_predictive_value":0.6582167832167832,"negative_predictive_value":0.6246226821905994} diff --git a/10-fold-crossvalidations/summaries/R-SVM.json b/10-fold-crossvalidations/summaries/R-SVM.json deleted file mode 100644 index a038447..0000000 --- a/10-fold-crossvalidations/summaries/R-SVM.json +++ /dev/null @@ -1 +0,0 @@ -{"accuracy":0.6146220570012392,"true_positive_rate":0.56075,"true_negative_rate":0.6675675675675675,"positive_predictive_value":0.6237486095661846,"negative_predictive_value":0.6072865444792133} diff --git a/10-fold-crossvalidations/summaries/lazar-all.json b/10-fold-crossvalidations/summaries/lazar-all.json deleted file mode 100644 index e68ff79..0000000 --- a/10-fold-crossvalidations/summaries/lazar-all.json +++ /dev/null @@ -1 +0,0 @@ -{"accuracy":0.818018249582316,"true_positive_rate":0.8508569966743412,"true_negative_rate":0.7848657024793388,"positive_predictive_value":0.7997114691031498,"negative_predictive_value":0.8390392048591938} diff --git a/10-fold-crossvalidations/summaries/lazar-high-confidence.json b/10-fold-crossvalidations/summaries/lazar-high-confidence.json deleted file mode 100644 index a9f852e..0000000 --- a/10-fold-crossvalidations/summaries/lazar-high-confidence.json +++ /dev/null @@ -1 +0,0 @@ -{"accuracy":0.8410865874363328,"true_positive_rate":0.8852562087393901,"true_negative_rate":0.7892211148025101,"positive_predictive_value":0.8314142308827871,"negative_predictive_value":0.8541749900119856} diff --git a/10-fold-crossvalidations/summaries/lazar-padel-all.json b/10-fold-crossvalidations/summaries/lazar-padel-all.json deleted file mode 100644 index d8ce18a..0000000 --- a/10-fold-crossvalidations/summaries/lazar-padel-all.json +++ /dev/null @@ -1 +0,0 @@ -{"accuracy":0.5796038151137197,"true_positive_rate":0.32123510292524377,"true_negative_rate":0.792242532322782,"positive_predictive_value":0.5599622285174694,"negative_predictive_value":0.5864686468646865} diff --git a/10-fold-crossvalidations/summaries/lazar-padel-high-confidence.json b/10-fold-crossvalidations/summaries/lazar-padel-high-confidence.json deleted file mode 100644 index 7ec0b1e..0000000 --- a/10-fold-crossvalidations/summaries/lazar-padel-high-confidence.json +++ /dev/null @@ -1 +0,0 @@ -{"accuracy":0.5792697868169566,"true_positive_rate":0.3215835140997831,"true_negative_rate":0.7916852928028609,"positive_predictive_value":0.5599622285174694,"negative_predictive_value":0.586035737921906} diff --git a/10-fold-crossvalidations/summaries/results.json b/10-fold-crossvalidations/summaries/results.json deleted file mode 100644 index 033c728..0000000 --- a/10-fold-crossvalidations/summaries/results.json +++ /dev/null @@ -1 +0,0 @@ -{"programs":[{"name":"R","algos":[{"accuracy":0.61,"true_positive_rate":0.56,"true_negative_rate":0.67,"positive_predictive_value":0.62,"negative_predictive_value":0.61,"accuracy_perc":61,"true_positive_rate_perc":56,"true_negative_rate_perc":67,"positive_predictive_value_perc":62,"negative_predictive_value_perc":61,"name":"SVM","abbrev":"R-SVM"},{"accuracy":0.64,"true_positive_rate":0.56,"true_negative_rate":0.71,"positive_predictive_value":0.66,"negative_predictive_value":0.62,"accuracy_perc":64,"true_positive_rate_perc":56,"true_negative_rate_perc":71,"positive_predictive_value_perc":66,"negative_predictive_value_perc":62,"name":"RF","abbrev":"R-RF"},{"accuracy":0.56,"true_positive_rate":0.88,"true_negative_rate":0.24,"positive_predictive_value":0.53,"negative_predictive_value":0.67,"accuracy_perc":56,"true_positive_rate_perc":88,"true_negative_rate_perc":24,"positive_predictive_value_perc":53,"negative_predictive_value_perc":67,"name":"DL","abbrev":"R-DL"}]},{"name":"tensorflow","algos":[{"accuracy":0.63,"true_positive_rate":0.63,"true_negative_rate":0.63,"positive_predictive_value":0.62,"negative_predictive_value":0.63,"accuracy_perc":63,"true_positive_rate_perc":63,"true_negative_rate_perc":63,"positive_predictive_value_perc":62,"negative_predictive_value_perc":63,"name":"without feature selection","abbrev":"tensorflow-without feature selection"},{"accuracy":0.63,"true_positive_rate":0.61,"true_negative_rate":0.64,"positive_predictive_value":0.63,"negative_predictive_value":0.63,"accuracy_perc":63,"true_positive_rate_perc":61,"true_negative_rate_perc":64,"positive_predictive_value_perc":63,"negative_predictive_value_perc":63,"name":"with feature selection","abbrev":"tensorflow-with feature selection"}]},{"name":"lazar","algos":[{"accuracy":0.82,"true_positive_rate":0.85,"true_negative_rate":0.78,"positive_predictive_value":0.8,"negative_predictive_value":0.84,"accuracy_perc":82,"true_positive_rate_perc":85,"true_negative_rate_perc":78,"positive_predictive_value_perc":80,"negative_predictive_value_perc":84,"name":"all","abbrev":"lazar-all"},{"accuracy":0.84,"true_positive_rate":0.89,"true_negative_rate":0.79,"positive_predictive_value":0.83,"negative_predictive_value":0.85,"accuracy_perc":84,"true_positive_rate_perc":89,"true_negative_rate_perc":79,"positive_predictive_value_perc":83,"negative_predictive_value_perc":85,"name":"high-confidence","abbrev":"lazar-high-confidence"},{"accuracy":0.58,"true_positive_rate":0.32,"true_negative_rate":0.79,"positive_predictive_value":0.56,"negative_predictive_value":0.59,"accuracy_perc":58,"true_positive_rate_perc":32,"true_negative_rate_perc":79,"positive_predictive_value_perc":56,"negative_predictive_value_perc":59,"name":"PaDEL all","abbrev":"lazar-PaDEL all"},{"accuracy":0.58,"true_positive_rate":0.32,"true_negative_rate":0.79,"positive_predictive_value":0.56,"negative_predictive_value":0.59,"accuracy_perc":58,"true_positive_rate_perc":32,"true_negative_rate_perc":79,"positive_predictive_value_perc":56,"negative_predictive_value_perc":59,"name":"PaDEL high-confidence","abbrev":"lazar-PaDEL high-confidence"}]}]} diff --git a/10-fold-crossvalidations/summaries/tensorflow-all.json b/10-fold-crossvalidations/summaries/tensorflow-all.json deleted file mode 100644 index a605a4d..0000000 --- a/10-fold-crossvalidations/summaries/tensorflow-all.json +++ /dev/null @@ -1 +0,0 @@ -{"accuracy":0.6258663366336633,"true_positive_rate":0.6264367816091954,"true_negative_rate":0.6253065228052967,"positive_predictive_value":0.6213135068153656,"negative_predictive_value":0.630407911001236} diff --git a/10-fold-crossvalidations/summaries/tensorflow-selected.json b/10-fold-crossvalidations/summaries/tensorflow-selected.json deleted file mode 100644 index 93c54ef..0000000 --- a/10-fold-crossvalidations/summaries/tensorflow-selected.json +++ /dev/null @@ -1 +0,0 @@ -{"accuracy":0.6283415841584158,"true_positive_rate":0.612943528235882,"true_negative_rate":0.6434526728788622,"positive_predictive_value":0.6278474532889685,"negative_predictive_value":0.6288042175892643} diff --git a/results.yaml b/10-fold-crossvalidations/summary.yaml index 5952b39..2c6f98b 100644 --- a/results.yaml +++ b/10-fold-crossvalidations/summary.yaml @@ -50,40 +50,6 @@ R-DL: :tnr_perc: 24 :ppv_perc: 53 :npv_perc: 67 -tensorflow-all: - :tp: 2507 - :fp: 1528 - :tn: 2550 - :fn: 1495 - :n: 8080 - :acc: 0.63 - :tpr: 0.63 - :fpr: 0.37 - :tnr: 0.63 - :ppv: 0.62 - :npv: 0.63 - :acc_perc: 63 - :tpr_perc: 63 - :tnr_perc: 63 - :ppv_perc: 62 - :npv_perc: 63 -tensorflow-selected: - :tp: 2453 - :fp: 1454 - :tn: 2624 - :fn: 1549 - :n: 8080 - :acc: 0.63 - :tpr: 0.61 - :fpr: 0.36 - :tnr: 0.64 - :ppv: 0.63 - :npv: 0.63 - :acc_perc: 63 - :tpr_perc: 61 - :tnr_perc: 64 - :ppv_perc: 63 - :npv_perc: 63 lazar-all: :tp: 3326 :fp: 833 @@ -7,122 +7,69 @@ ARTICLE_FILE = mutagenicity.mustache.md PANDOC_SCHOLAR_PATH = pandoc-scholar OUTFILE_PREFIX = mutagenicity DEFAULT_EXTENSIONS = pdf #latex docx html #odt epub -#PANDOC_WRITER_OPTIONS = --filter=panpipe --filter=pandoc-placetable --filter=pandoc-citeproc -M tmpvar=test PANDOC_WRITER_OPTIONS = --filter=pandoc-placetable --filter=pandoc-crossref --filter=pandoc-citeproc TEMPLATE_FILE_LATEX = pandoc-scholar.latex -# Lazar - -LAZAR_DIR = ../lazar -LAZAR_MODEL_DIR = $(LAZAR_DIR)/models/mutagenicity -LAZAR_PADEL_MODEL_DIR = $(LAZAR_DIR)/models/mutagenicity-padel -LAZAR_SUMMARY_DIR = $(LAZAR_MODEL_DIR)/crossvalidation/summaries -LAZAR_PADEL_SUMMARY_DIR = $(LAZAR_PADEL_MODEL_DIR)/crossvalidation/summaries -LAZAR_CONFUSION_MATRIX_DIR = $(LAZAR_MODEL_DIR)/crossvalidation/confusion_matrices -LAZAR_PADEL_CONFUSION_MATRIX_DIR = $(LAZAR_PADEL_MODEL_DIR)/crossvalidation/confusion_matrices - # Experiments -SUMMARIES_DIR = 10-fold-crossvalidations/summaries -CONFUSION_MATRICES_DIR = 10-fold-crossvalidations/confusion-matrices +LAZAR_CONFUSION_MATRIX_DIR = 10-fold-crossvalidations/lazar/crossvalidation/confusion_matrices +LAZAR_PADEL_CONFUSION_MATRIX_DIR = 10-fold-crossvalidations/lazar-padel/crossvalidation/confusion_matrices R_CV_DIR = 10-fold-crossvalidations/R TENSORFLOW_CV_DIR = 10-fold-crossvalidations/tensorflow -#TABLES = tables/r-summary.csv tables/tf-summary.csv tables/lazar-summary.csv tables/R-SVM.csv tables/R-RF.csv tables/R-DL.csv tables/tensorflow-all.csv tables/tensorflow-selected.csv tables/lazar-all.csv tables/lazar-high-confidence.csv tables/lazar-padel-all.csv tables/lazar-padel-high-confidence.csv -TABLES = tables/R-SVM.csv tables/R-RF.csv tables/R-DL.csv tables/tensorflow-all.csv tables/tensorflow-selected.csv tables/lazar-all.csv tables/lazar-high-confidence.csv tables/lazar-padel-all.csv tables/lazar-padel-high-confidence.csv #tables/pred.rf.v3.csv tables/pred.lr.v3.csv tables/pred.lr2.v3.csv tables/pred.nn.v3.csv - -R_SUMMARIES = $(SUMMARIES_DIR)/R-SVM.json $(SUMMARIES_DIR)/R-RF.json $(SUMMARIES_DIR)/R-DL.json -TF_SUMMARIES = $(SUMMARIES_DIR)/tensorflow-all.json $(SUMMARIES_DIR)/tensorflow-selected.json $(SUMMARIES_DIR)/pred.lr.v3.json $(SUMMARIES_DIR)/pred.lr2.v3.json $(SUMMARIES_DIR)/pred.nn.v3.json $(SUMMARIES_DIR)/pred.rf.v3.json -LAZAR_SUMMARIES = $(SUMMARIES_DIR)/lazar-all.json $(SUMMARIES_DIR)/lazar-high-confidence.json $(SUMMARIES_DIR)/lazar-padel-all.json $(SUMMARIES_DIR)/lazar-padel-high-confidence.json +CONFUSION_MATRICES_DIR = 10-fold-crossvalidations/confusion-matrices +CONFUSION_MATRICES = $(CONFUSION_MATRICES_DIR)/R-SVM.csv $(CONFUSION_MATRICES_DIR)/R-RF.csv $(CONFUSION_MATRICES_DIR)/R-DL.csv $(CONFUSION_MATRICES_DIR)/lazar-all.csv $(CONFUSION_MATRICES_DIR)/lazar-high-confidence.csv $(CONFUSION_MATRICES_DIR)/lazar-padel-all.csv $(CONFUSION_MATRICES_DIR)/lazar-padel-high-confidence.csv $(CONFUSION_MATRICES_DIR)/tensorflow-lr.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-lr2.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-nn.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-rf.v3.csv -#SUMMARIES = $(R_SUMMARIES) $(TF_SUMMARIES) $(LAZAR_SUMMARIES) +CV_SUMMARY = 10-fold-crossvalidations/summary.yaml +TABLES = tables/lazar-summary.csv tables/r-summary.csv tables/tensorflow-summary.csv +FIGURES = figures/roc.png figures/tsne-mp2d.png figures/tsne-padel.png #figures/pa-predictions.png -CONFUSION_MATRICES = $(CONFUSION_MATRICES_DIR)/R-SVM.csv $(CONFUSION_MATRICES_DIR)/R-RF.csv $(CONFUSION_MATRICES_DIR)/R-DL.csv $(CONFUSION_MATRICES_DIR)/tensorflow-all.csv $(CONFUSION_MATRICES_DIR)/tensorflow-selected.csv $(CONFUSION_MATRICES_DIR)/lazar-all.csv $(CONFUSION_MATRICES_DIR)/lazar-high-confidence.csv $(CONFUSION_MATRICES_DIR)/lazar-padel-all.csv $(CONFUSION_MATRICES_DIR)/lazar-padel-high-confidence.csv $(CONFUSION_MATRICES_DIR)/tensorflow-lr.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-lr2.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-nn.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-rf.v3.csv -DATA = data/mutagenicity.sdf data/mutagenicity.csv data/mutagenicity-fingerprints.csv -FIGURES = figures/roc.png figures/tsne-mp2d.png figures/tsne-padel.png +# manuscript -all: $(DATA) $(TABLES) $(FIGURES) mutagenicity.pdf +all: $(TABLES) $(FIGURES) mutagenicity.pdf include $(PANDOC_SCHOLAR_PATH)/Makefile -export: $(DATA) -mutagenicity.mustache.md: results.yaml mutagenicity.md $(FIGURES) +mutagenicity.mustache.md: $(CV_SUMMARY) mutagenicity.md $(TABLES) $(FIGURES) mustache $^ > $@ # figures + figures/tsne-padel.png: figures/tsne-padel.csv - scripts/padel-tsne.R + scripts/tsne-padel.R figures/tsne-padel.csv: data/GenoTox-database.csv pyrrolizidine-alkaloids/PA-Padel-2D_m2.csv scripts/padel-descriptors.rb $^ > $@ figures/tsne-mp2d.png: figures/tsne-mp2d.csv - scripts/mp2d-tsne.R + scripts/tsne-mp2d.R -figures/tsne-mp2d.csv: ../lazar/models/mutagenicity/independent_variables +figures/tsne-mp2d.csv: 10-fold-crossvalidations/lazar/independent_variables scripts/mp2d-distances.rb > figures/tsne-mp2d.csv -figures/roc.png: figures/results.csv +figures/roc.png: figures/roc.csv scripts/roc.R -figures/results.csv: results.yaml +figures/roc.csv: $(CV_SUMMARY) scripts/results2csv.rb $< > $@ # tables -tables/r-summary.csv: $(R_SUMMARIES) - scripts/summaries2table.rb $^ > $@ - -tables/tf-summary.csv: $(TF_SUMMARIES) - scripts/summaries2table.rb $^ > $@ +tables/lazar-summary.csv: $(CV_SUMMARY) + scripts/summaries2table.rb lazar > $@ -tables/lazar-summary.csv: $(LAZAR_SUMMARIES) - scripts/summaries2table.rb $^ > $@ +tables/r-summary.csv: $(CV_SUMMARY) + scripts/summaries2table.rb R > $@ -tables/%.csv: $(CONFUSION_MATRICES_DIR)/%.csv - scripts/confusion-matrix2table.rb $< > $@ +tables/tensorflow-summary.csv: $(CV_SUMMARY) + scripts/summaries2table.rb tensorflow > $@ -# summaries +# crossvalidation summary -#$(SUMMARIES_DIR)/results.json: $(SUMMARIES) - #scripts/results.rb $^ > $@ - -#$(SUMMARIES_DIR)/%.json: $(CONFUSION_MATRICES_DIR)/%.csv - #scripts/confusion-matrix-summary.rb $< > $@ - -results.yaml: $(CONFUSION_MATRICES) +$(CV_SUMMARY): $(CONFUSION_MATRICES) scripts/confusion-matrix-summary.rb $^ > $@ # confusion matrices -## tensorflow -$(CONFUSION_MATRICES_DIR)/tensorflow-selected.csv: $(TENSORFLOW_CV_DIR)/pred.sorted.csv - scripts/cv-tensorflow-confusion-matrix.rb $< > $@ - -$(CONFUSION_MATRICES_DIR)/tensorflow-all.csv: $(TENSORFLOW_CV_DIR)/pred_ext.sorted.csv - scripts/cv-tensorflow-confusion-matrix.rb $< > $@ - -$(CONFUSION_MATRICES_DIR)/tensorflow-lr.v3.csv: $(TENSORFLOW_CV_DIR)/pred.lr.v3.sorted.csv - scripts/cv-tensorflow-confusion-matrix.rb $< > $@ - -$(CONFUSION_MATRICES_DIR)/tensorflow-lr2.v3.csv: $(TENSORFLOW_CV_DIR)/pred.lr2.v3.sorted.csv - scripts/cv-tensorflow-confusion-matrix.rb $< > $@ - -$(CONFUSION_MATRICES_DIR)/tensorflow-nn.v3.csv: $(TENSORFLOW_CV_DIR)/pred.nn.v3.sorted.csv - scripts/cv-tensorflow-confusion-matrix.rb $< > $@ - -$(CONFUSION_MATRICES_DIR)/tensorflow-rf.v3.csv: $(TENSORFLOW_CV_DIR)/pred.rf.v3.sorted.csv - scripts/cv-tensorflow-confusion-matrix.rb $< > $@ - -## R -$(CONFUSION_MATRICES_DIR)/R-SVM.csv: $(R_CV_DIR)/Sgl-Observations-SVM.csv - scripts/cv-r-confusion-matrix.rb $< > $@ - -$(CONFUSION_MATRICES_DIR)/R-RF.csv: $(R_CV_DIR)/Sgl-Observations-RF.csv - scripts/cv-r-confusion-matrix.rb $< > $@ - -$(CONFUSION_MATRICES_DIR)/R-DL.csv: $(R_CV_DIR)/Sgl-Observations-DL.csv - scripts/cv-r-confusion-matrix.rb $< > $@ - ## lazar $(CONFUSION_MATRICES_DIR)/lazar-all.csv: $(LAZAR_CONFUSION_MATRIX_DIR) cp $</all $@ @@ -136,21 +83,26 @@ $(CONFUSION_MATRICES_DIR)/lazar-padel-all.csv: $(LAZAR_PADEL_CONFUSION_MATRIX_DI $(CONFUSION_MATRICES_DIR)/lazar-padel-high-confidence.csv: $(LAZAR_PADEL_CONFUSION_MATRIX_DIR) cp $</high_confidence $@ -# exports +## R +$(CONFUSION_MATRICES_DIR)/R-SVM.csv: $(R_CV_DIR)/Sgl-Observations-SVM.csv + scripts/cv-r-confusion-matrix.rb $< > $@ -data/mutagenicity-fingerprints.csv: $(LAZAR_DIR)/models/mutagenicity - $(LAZAR_DIR)/bin/export-fingerprints.rb $< > $@ +$(CONFUSION_MATRICES_DIR)/R-RF.csv: $(R_CV_DIR)/Sgl-Observations-RF.csv + scripts/cv-r-confusion-matrix.rb $< > $@ -data/mutagenicity.csv: $(LAZAR_DIR)/models/mutagenicity/Mutagenicity-Salmonella_typhimurium.csv - cp $< > $@ +$(CONFUSION_MATRICES_DIR)/R-DL.csv: $(R_CV_DIR)/Sgl-Observations-DL.csv + scripts/cv-r-confusion-matrix.rb $< > $@ -data/mutagenicity.sdf: $(LAZAR_DIR)/models/mutagenicity/Mutagenicity-Salmonella_typhimurium.csv - $(LAZAR_DIR)/bin/export-sdf.rb $< > $@ - -# lazar models and crossvalidations +## tensorflow +$(CONFUSION_MATRICES_DIR)/tensorflow-lr.v3.csv: $(TENSORFLOW_CV_DIR)/pred.lr.v3.sorted.csv + scripts/cv-tensorflow-confusion-matrix.rb $< > $@ -$(LAZAR_SUMMARY_DIR): - make -C $(LAZAR_MODEL_DIR) +$(CONFUSION_MATRICES_DIR)/tensorflow-lr2.v3.csv: $(TENSORFLOW_CV_DIR)/pred.lr2.v3.sorted.csv + scripts/cv-tensorflow-confusion-matrix.rb $< > $@ + +$(CONFUSION_MATRICES_DIR)/tensorflow-nn.v3.csv: $(TENSORFLOW_CV_DIR)/pred.nn.v3.sorted.csv + scripts/cv-tensorflow-confusion-matrix.rb $< > $@ + +$(CONFUSION_MATRICES_DIR)/tensorflow-rf.v3.csv: $(TENSORFLOW_CV_DIR)/pred.rf.v3.sorted.csv + scripts/cv-tensorflow-confusion-matrix.rb $< > $@ -$(LAZAR_PADEL_SUMMARY_DIR): - make -C $(LAZAR_PADEL_MODEL_DIR) diff --git a/figures/results.csv b/figures/roc.csv index a9a3676..ac79a2e 100644 --- a/figures/results.csv +++ b/figures/roc.csv @@ -2,8 +2,6 @@ tpr,fpr R-SVM,0.56,0.33 R-RF,0.56,0.29 R-DL,0.88,0.76 -TF,0.63,0.37 -TF-FS,0.61,0.36 L,0.85,0.22 L-HC,0.89,0.21 L-P,0.32,0.21 diff --git a/figures/roc.png b/figures/roc.png Binary files differindex a7cb04f..4ced78d 100644 --- a/figures/roc.png +++ b/figures/roc.png diff --git a/mutagenicity.md b/mutagenicity.md index 418c2d1..9012ce5 100644 --- a/mutagenicity.md +++ b/mutagenicity.md @@ -476,6 +476,16 @@ Results Crossvalidation results are summarized in the following tables: @tbl:lazar shows `lazar` results with MolPrint2D and PaDEL descriptors, @tbl:R summarizes R results and @tbl:tensorflow Tensorflow results. + +```{#tbl:lazar .table file="tables/lazar-summary.csv" caption="Summary of lazar crossvalidation results"} +``` + +```{#tbl:R .table file="tables/r-summary.csv" caption="Summary of R crossvalidation results"} +``` + +```{#tbl:tensorflow .table file="tables/tensorflow-summary.csv" caption="Summary of tensorflow crossvalidation results"} +``` + @fig:roc depicts the position of all crossvalidation results in receiver operating characteristic (ROC) space. Confusion matrices for all models are available from the git repository http://git.in-silico.ch/mutagenicity-paper/10-fold-crossvalidations/confusion-matrices/, individual predictions can be found in @@ -483,6 +493,7 @@ http://git.in-silico.ch/mutagenicity-paper/10-fold-crossvalidations/predictions/ The most accurate crossvalidation predictions have been obtained with `lazar` models with MolPrint2D descriptors ({{lazar-high-confidence.acc}} for predictions with high confidence, {{lazar-all.acc}} for all predictions). Models utilizing PaDEL descriptors have generally lower accuracies ranging from TODO to TODO. Sensitivity and specificity is generally well balanced with the exception of `lazar`-PaDEL (low sensitivity) and R deep learning (low specificity) models. +<!-- | |R-RF | R-SVM | R-DL | TF | TF-FS | L | L-HC | L-P | L-P-HC| |-|-----|-------|------|----|-------|---|------|------|--------| |Accuracy|{{R-RF.acc}}|{{R-SVM.acc}}|{{R-DL.acc}}|{{tensorflow-all.acc}}|{{tensorflow-selected.acc}}|{{lazar-all.acc}}|{{lazar-high-confidence.acc}}|{{lazar-padel-all.acc}}|{{lazar-padel-high-confidence.acc}}| @@ -496,7 +507,6 @@ The most accurate crossvalidation predictions have been obtained with `lazar` mo ![ROC plot of crossvalidation results. *R-RF*: R Random Forests, *R-SVM*: R Support Vector Machines, *R-DL*: R Deep Learning, *TF*: Tensorflow without feature selection, *TF-FS*: Tensorflow with feature selection, *L*: lazar, *L-HC*: lazar high confidence predictions, *L-P*: lazar with PaDEL descriptors, *L-P-HC*: lazar PaDEL high confidence predictions (overlaps with L-P)](figures/roc.png){#fig:roc} -<!-- R Models -------- diff --git a/export/pa_carcinogenicity.csv b/pyrrolizidine-alkaloids/pa_carcinogenicity.csv index 1a3815b..1a3815b 100644 --- a/export/pa_carcinogenicity.csv +++ b/pyrrolizidine-alkaloids/pa_carcinogenicity.csv diff --git a/export/pa_mutagenicity.csv b/pyrrolizidine-alkaloids/pa_mutagenicity.csv index 7b0d6ea..7b0d6ea 100644 --- a/export/pa_mutagenicity.csv +++ b/pyrrolizidine-alkaloids/pa_mutagenicity.csv diff --git a/scripts/confusion-matrix2table.rb b/scripts/confusion-matrix2table.rb deleted file mode 100755 index ccb4817..0000000 --- a/scripts/confusion-matrix2table.rb +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env ruby - -mat = [] -File.readlines(ARGV[0]).each do |l| - mat << l.chomp.split(",") -end -puts ",,Predictions," -puts ",,mutagenic,non-mutagenic" -puts "Measurements,mutagenic,#{mat[0][0]},#{mat[0][1]}" -puts ",non-mutagenic,#{mat[1][0]},#{mat[1][1]}" diff --git a/scripts/crossvalidation-summary.rb b/scripts/crossvalidation-summary.rb deleted file mode 100755 index 13b0dfa..0000000 --- a/scripts/crossvalidation-summary.rb +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env ruby -require_relative '../../lazar/lib/lazar' -include OpenTox - -summary = [] -model = Model::Validation.find(File.read(ARGV[0]).chomp).crossvalidations.each do |cv| - summary << cv.statistics -end -puts JSON.pretty_generate(summary) diff --git a/scripts/json2csv.rb b/scripts/json2csv.rb deleted file mode 100755 index 03191de..0000000 --- a/scripts/json2csv.rb +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env ruby -require_relative '../../lazar/lib/lazar' -include OpenTox - -results = JSON.parse File.read(ARGV[0]) -puts "SMILES,#{File.basename(ARGV[0],".json").sub("pa_","").capitalize},Probability(0),Probability(1),Nr Neighbors,Warnings" -results.each do |id,r| - s = Compound.find(id).smiles - if r["value"] - puts [ - s, - r["value"], - r["probabilities"]["0"], - r["probabilities"]["1"], - r["neighbors"].size, - r["warnings"], - ].join(",") - else - r["neighbors"] ? n = r["neighbors"].size : n = nil - puts [ - s, - r["value"], - nil, - nil, - n, - r["warnings"], - ].join(",") - end -end diff --git a/scripts/results.rb b/scripts/results.rb deleted file mode 100755 index 1a36278..0000000 --- a/scripts/results.rb +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env ruby -require 'json' - -result = {} -ARGV.each do |f| - fname = File.basename(f,".json") - program,algo = fname.split('-') - case program - when "tensorflow" - algo == "all" ? algo = "without feature selection" : algo = "with feature selection" - when "lazar" - algo = "high-confidence" if algo == "high" - if algo == "padel" - algo = "PaDEL" - fname.match("high") ? algo += " high-confidence" : algo += " all" - end - end - result[program] ||= {} - result[program][algo] = JSON.parse(File.read(f)).collect{|k,v| [k,v.round(2)]}.to_h -end - -out = {:programs => []} -result.keys.each do |prog| - out[:programs] << {:name => prog, :algos => []} - result[prog].keys.each do |algo| - r = result[prog][algo].dup - result[prog][algo].each do |k,v| - r[k+"_perc"] = (v*100).round - end - r[:name] = algo - r[:abbrev] = prog+"-"+algo - out[:programs].last[:algos] << r - end -end - -puts out.to_json diff --git a/scripts/roc.R b/scripts/roc.R index cb219fc..afc8293 100755 --- a/scripts/roc.R +++ b/scripts/roc.R @@ -1,6 +1,6 @@ #!/usr/bin/env Rscript library(ggplot2) -data <- read.csv("figures/results.csv",header=T) +data <- read.csv("figures/roc.csv",header=T) p <- ggplot(data, aes(x=fpr, y=tpr)) + geom_abline() p <- p + geom_label(label=rownames(data) ) p <- p + expand_limits(x=c(0,1),y=c(0,1)) diff --git a/scripts/summaries2table.rb b/scripts/summaries2table.rb index 5470b26..f98ec54 100755 --- a/scripts/summaries2table.rb +++ b/scripts/summaries2table.rb @@ -1,19 +1,23 @@ #!/usr/bin/env ruby -require 'json' +require 'yaml' -results = {} +rows = {:acc => "Accuracy", :tpr => "True positive rate/Sensitivity", :tnr => "True negative rate/Specificity", :ppv => "Positive predictive value/Precision", :npv => "Negative predictive value", :n => "Nr. predictions"} +data = YAML.load_file "10-fold-crossvalidations/summary.yaml" -ARGV.each do |f| - results[File.basename(f,".json")] = JSON.parse(File.read(f)) +case ARGV[0] +when "R" + header = ["RF","SVM","DL"] + keys = header.collect{|h| "R-"+h} +when "tensorflow" + header = ["RF","LR (SGD)","LR (SCIKIT)","NN"] + keys = ["lr","lr2","nn"].collect{|n| "tensorflow-"+n+".v3"} +when "lazar" + header = ["lazar-mp2d (all)","lazar-mp2d (high confidence)", "lazar-padel (all)","lazar-padel (high confidence)"] + keys = ["lazar-all","lazar-high-confidence", "lazar-padel-all","lazar-padel-high-confidence"] end - -print "," -puts results.keys.collect{|k| k.sub("tensorflow","TF")}.join(",") -["accuracy","true_positive_rate","true_negative_rate","positive_predictive_value","negative_predictive_value"].each do |m| - line = [m.gsub("_"," ")] - results.each do |k,v| - line << v[m].round(2) - end - puts line.join(",") +puts ","+header.join(",") +rows.each do |short,long| + print long+"," + puts keys.collect{|k| data[k][short]}.join(",") end - +exit diff --git a/scripts/mp2d-tsne.R b/scripts/tsne-mp2d.R index 0877622..0877622 100755 --- a/scripts/mp2d-tsne.R +++ b/scripts/tsne-mp2d.R diff --git a/scripts/padel-tsne.R b/scripts/tsne-padel.R index b8e9763..b8e9763 100755 --- a/scripts/padel-tsne.R +++ b/scripts/tsne-padel.R diff --git a/tables/R-DL.csv b/tables/R-DL.csv deleted file mode 100644 index d622e0e..0000000 --- a/tables/R-DL.csv +++ /dev/null @@ -1,4 +0,0 @@ -,,Predictions, -,,mutagenic,non-mutagenic -Measurements,mutagenic,3517,3099 -,non-mutagenic,483,971 diff --git a/tables/R-RF.csv b/tables/R-RF.csv deleted file mode 100644 index d81cff6..0000000 --- a/tables/R-RF.csv +++ /dev/null @@ -1,4 +0,0 @@ -,,Predictions, -,,mutagenic,non-mutagenic -Measurements,mutagenic,2259,1173 -,non-mutagenic,1741,2897 diff --git a/tables/R-SVM.csv b/tables/R-SVM.csv deleted file mode 100644 index 9aaf85f..0000000 --- a/tables/R-SVM.csv +++ /dev/null @@ -1,4 +0,0 @@ -,,Predictions, -,,mutagenic,non-mutagenic -Measurements,mutagenic,2243,1353 -,non-mutagenic,1757,2717 diff --git a/tables/lazar-all.csv b/tables/lazar-all.csv deleted file mode 100644 index c4db6a1..0000000 --- a/tables/lazar-all.csv +++ /dev/null @@ -1,4 +0,0 @@ -,,Predictions, -,,mutagenic,non-mutagenic -Measurements,mutagenic,3326,833 -,non-mutagenic,583,3039 diff --git a/tables/lazar-high-confidence.csv b/tables/lazar-high-confidence.csv deleted file mode 100644 index 049c73b..0000000 --- a/tables/lazar-high-confidence.csv +++ /dev/null @@ -1,4 +0,0 @@ -,,Predictions, -,,mutagenic,non-mutagenic -Measurements,mutagenic,2816,571 -,non-mutagenic,365,2138 diff --git a/tables/lazar-padel-all.csv b/tables/lazar-padel-all.csv deleted file mode 100644 index e2436f7..0000000 --- a/tables/lazar-padel-all.csv +++ /dev/null @@ -1,4 +0,0 @@ -,,Predictions, -,,mutagenic,non-mutagenic -Measurements,mutagenic,593,466 -,non-mutagenic,1253,1777 diff --git a/tables/lazar-padel-high-confidence.csv b/tables/lazar-padel-high-confidence.csv deleted file mode 100644 index 1b551d9..0000000 --- a/tables/lazar-padel-high-confidence.csv +++ /dev/null @@ -1,4 +0,0 @@ -,,Predictions, -,,mutagenic,non-mutagenic -Measurements,mutagenic,593,466 -,non-mutagenic,1251,1771 diff --git a/tables/lazar-summary.csv b/tables/lazar-summary.csv new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/tables/lazar-summary.csv diff --git a/tables/r-summary.csv b/tables/r-summary.csv new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/tables/r-summary.csv diff --git a/tables/tensorflow-all.csv b/tables/tensorflow-all.csv deleted file mode 100644 index afd74c5..0000000 --- a/tables/tensorflow-all.csv +++ /dev/null @@ -1,4 +0,0 @@ -,,Predictions, -,,mutagenic,non-mutagenic -Measurements,mutagenic,2507,1528 -,non-mutagenic,1495,2550 diff --git a/tables/tensorflow-selected.csv b/tables/tensorflow-selected.csv deleted file mode 100644 index 6c0f6e5..0000000 --- a/tables/tensorflow-selected.csv +++ /dev/null @@ -1,4 +0,0 @@ -,,Predictions, -,,mutagenic,non-mutagenic -Measurements,mutagenic,2453,1454 -,non-mutagenic,1549,2624 diff --git a/tables/tensorflow-summary.csv b/tables/tensorflow-summary.csv new file mode 100644 index 0000000..bbd4885 --- /dev/null +++ b/tables/tensorflow-summary.csv @@ -0,0 +1,7 @@ +,RF,LR (SGD),LR (SCIKIT),NN +Accuracy,0.62,0.63,0.63 +True positive rate/Sensitivity,0.6,0.62,0.61 +True negative rate/Specificity,0.65,0.63,0.64 +Positive predictive value/Precision,0.63,0.62,0.63 +Negative predictive value,0.62,0.63,0.63 +Nr. predictions,8080,8080,8080 |