From 2c3bc133700f7e1e1ea8d038d87da1f3095ed103 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 19 Oct 2020 23:54:19 +0200 Subject: PA prediction summary --- 10-fold-crossvalidations/summary.yaml | 375 +++++++++++++++++----------------- Makefile | 23 ++- figures/roc.png | Bin 134226 -> 146544 bytes mutagenicity.md | 16 +- pyrrolizidine-alkaloids/summary.yaml | 83 ++++++++ scripts/confusion-matrix-summary.rb | 2 +- scripts/summary2roc.rb | 2 +- scripts/summary2table.rb | 2 +- 8 files changed, 301 insertions(+), 202 deletions(-) create mode 100644 pyrrolizidine-alkaloids/summary.yaml diff --git a/10-fold-crossvalidations/summary.yaml b/10-fold-crossvalidations/summary.yaml index c05db63..08c0b40 100644 --- a/10-fold-crossvalidations/summary.yaml +++ b/10-fold-crossvalidations/summary.yaml @@ -1,188 +1,189 @@ --- -lazar-all: - :tp: 3326 - :fp: 833 - :tn: 3039 - :fn: 583 - :n: 7781 - :acc: 0.82 - :tpr: 0.85 - :fpr: 0.22 - :tnr: 0.78 - :ppv: 0.8 - :npv: 0.84 - :acc_perc: 82 - :tpr_perc: 85 - :tnr_perc: 78 - :ppv_perc: 80 - :npv_perc: 84 -lazar-high-confidence: - :tp: 2816 - :fp: 571 - :tn: 2138 - :fn: 365 - :n: 5890 - :acc: 0.84 - :tpr: 0.89 - :fpr: 0.21 - :tnr: 0.79 - :ppv: 0.83 - :npv: 0.85 - :acc_perc: 84 - :tpr_perc: 89 - :tnr_perc: 79 - :ppv_perc: 83 - :npv_perc: 85 -lazar-padel-all: - :tp: 593 - :fp: 466 - :tn: 1777 - :fn: 1253 - :n: 4089 - :acc: 0.58 - :tpr: 0.32 - :fpr: 0.21 - :tnr: 0.79 - :ppv: 0.56 - :npv: 0.59 - :acc_perc: 58 - :tpr_perc: 32 - :tnr_perc: 79 - :ppv_perc: 56 - :npv_perc: 59 -lazar-padel-high-confidence: - :tp: 593 - :fp: 466 - :tn: 1771 - :fn: 1251 - :n: 4081 - :acc: 0.58 - :tpr: 0.32 - :fpr: 0.21 - :tnr: 0.79 - :ppv: 0.56 - :npv: 0.59 - :acc_perc: 58 - :tpr_perc: 32 - :tnr_perc: 79 - :ppv_perc: 56 - :npv_perc: 59 -R-RF: - :tp: 2259 - :fp: 1173 - :tn: 2897 - :fn: 1741 - :n: 8070 - :acc: 0.64 - :tpr: 0.56 - :fpr: 0.29 - :tnr: 0.71 - :ppv: 0.66 - :npv: 0.62 - :acc_perc: 64 - :tpr_perc: 56 - :tnr_perc: 71 - :ppv_perc: 66 - :npv_perc: 62 -R-SVM: - :tp: 2243 - :fp: 1353 - :tn: 2717 - :fn: 1757 - :n: 8070 - :acc: 0.61 - :tpr: 0.56 - :fpr: 0.33 - :tnr: 0.67 - :ppv: 0.62 - :npv: 0.61 - :acc_perc: 61 - :tpr_perc: 56 - :tnr_perc: 67 - :ppv_perc: 62 - :npv_perc: 61 -R-DL: - :tp: 3517 - :fp: 3099 - :tn: 971 - :fn: 483 - :n: 8070 - :acc: 0.56 - :tpr: 0.88 - :fpr: 0.76 - :tnr: 0.24 - :ppv: 0.53 - :npv: 0.67 - :acc_perc: 56 - :tpr_perc: 88 - :tnr_perc: 24 - :ppv_perc: 53 - :npv_perc: 67 -tensorflow-rf.v3: - :tp: 2362 - :fp: 1243 - :tn: 2835 - :fn: 1640 - :n: 8080 - :acc: 0.64 - :tpr: 0.59 - :fpr: 0.3 - :tnr: 0.7 - :ppv: 0.66 - :npv: 0.63 - :acc_perc: 64 - :tpr_perc: 59 - :tnr_perc: 70 - :ppv_perc: 66 - :npv_perc: 63 -tensorflow-lr.v3: - :tp: 2395 - :fp: 1427 - :tn: 2651 - :fn: 1607 - :n: 8080 - :acc: 0.62 - :tpr: 0.6 - :fpr: 0.35 - :tnr: 0.65 - :ppv: 0.63 - :npv: 0.62 - :acc_perc: 62 - :tpr_perc: 60 - :tnr_perc: 65 - :ppv_perc: 63 - :npv_perc: 62 -tensorflow-lr2.v3: - :tp: 2487 - :fp: 1497 - :tn: 2581 - :fn: 1515 - :n: 8080 - :acc: 0.63 - :tpr: 0.62 - :fpr: 0.37 - :tnr: 0.63 - :ppv: 0.62 - :npv: 0.63 - :acc_perc: 63 - :tpr_perc: 62 - :tnr_perc: 63 - :ppv_perc: 62 - :npv_perc: 63 -tensorflow-nn.v3: - :tp: 2452 - :fp: 1468 - :tn: 2610 - :fn: 1550 - :n: 8080 - :acc: 0.63 - :tpr: 0.61 - :fpr: 0.36 - :tnr: 0.64 - :ppv: 0.63 - :npv: 0.63 - :acc_perc: 63 - :tpr_perc: 61 - :tnr_perc: 64 - :ppv_perc: 63 - :npv_perc: 63 +:cv: + lazar-all: + :tp: 3326 + :fp: 833 + :tn: 3039 + :fn: 583 + :n: 7781 + :acc: 0.82 + :tpr: 0.85 + :fpr: 0.22 + :tnr: 0.78 + :ppv: 0.8 + :npv: 0.84 + :acc_perc: 82 + :tpr_perc: 85 + :tnr_perc: 78 + :ppv_perc: 80 + :npv_perc: 84 + lazar-high-confidence: + :tp: 2816 + :fp: 571 + :tn: 2138 + :fn: 365 + :n: 5890 + :acc: 0.84 + :tpr: 0.89 + :fpr: 0.21 + :tnr: 0.79 + :ppv: 0.83 + :npv: 0.85 + :acc_perc: 84 + :tpr_perc: 89 + :tnr_perc: 79 + :ppv_perc: 83 + :npv_perc: 85 + lazar-padel-all: + :tp: 593 + :fp: 466 + :tn: 1777 + :fn: 1253 + :n: 4089 + :acc: 0.58 + :tpr: 0.32 + :fpr: 0.21 + :tnr: 0.79 + :ppv: 0.56 + :npv: 0.59 + :acc_perc: 58 + :tpr_perc: 32 + :tnr_perc: 79 + :ppv_perc: 56 + :npv_perc: 59 + lazar-padel-high-confidence: + :tp: 593 + :fp: 466 + :tn: 1771 + :fn: 1251 + :n: 4081 + :acc: 0.58 + :tpr: 0.32 + :fpr: 0.21 + :tnr: 0.79 + :ppv: 0.56 + :npv: 0.59 + :acc_perc: 58 + :tpr_perc: 32 + :tnr_perc: 79 + :ppv_perc: 56 + :npv_perc: 59 + R-RF: + :tp: 2259 + :fp: 1173 + :tn: 2897 + :fn: 1741 + :n: 8070 + :acc: 0.64 + :tpr: 0.56 + :fpr: 0.29 + :tnr: 0.71 + :ppv: 0.66 + :npv: 0.62 + :acc_perc: 64 + :tpr_perc: 56 + :tnr_perc: 71 + :ppv_perc: 66 + :npv_perc: 62 + R-SVM: + :tp: 2243 + :fp: 1353 + :tn: 2717 + :fn: 1757 + :n: 8070 + :acc: 0.61 + :tpr: 0.56 + :fpr: 0.33 + :tnr: 0.67 + :ppv: 0.62 + :npv: 0.61 + :acc_perc: 61 + :tpr_perc: 56 + :tnr_perc: 67 + :ppv_perc: 62 + :npv_perc: 61 + R-DL: + :tp: 3517 + :fp: 3099 + :tn: 971 + :fn: 483 + :n: 8070 + :acc: 0.56 + :tpr: 0.88 + :fpr: 0.76 + :tnr: 0.24 + :ppv: 0.53 + :npv: 0.67 + :acc_perc: 56 + :tpr_perc: 88 + :tnr_perc: 24 + :ppv_perc: 53 + :npv_perc: 67 + tensorflow-rf.v3: + :tp: 2362 + :fp: 1243 + :tn: 2835 + :fn: 1640 + :n: 8080 + :acc: 0.64 + :tpr: 0.59 + :fpr: 0.3 + :tnr: 0.7 + :ppv: 0.66 + :npv: 0.63 + :acc_perc: 64 + :tpr_perc: 59 + :tnr_perc: 70 + :ppv_perc: 66 + :npv_perc: 63 + tensorflow-lr.v3: + :tp: 2395 + :fp: 1427 + :tn: 2651 + :fn: 1607 + :n: 8080 + :acc: 0.62 + :tpr: 0.6 + :fpr: 0.35 + :tnr: 0.65 + :ppv: 0.63 + :npv: 0.62 + :acc_perc: 62 + :tpr_perc: 60 + :tnr_perc: 65 + :ppv_perc: 63 + :npv_perc: 62 + tensorflow-lr2.v3: + :tp: 2487 + :fp: 1497 + :tn: 2581 + :fn: 1515 + :n: 8080 + :acc: 0.63 + :tpr: 0.62 + :fpr: 0.37 + :tnr: 0.63 + :ppv: 0.62 + :npv: 0.63 + :acc_perc: 63 + :tpr_perc: 62 + :tnr_perc: 63 + :ppv_perc: 62 + :npv_perc: 63 + tensorflow-nn.v3: + :tp: 2452 + :fp: 1468 + :tn: 2610 + :fn: 1550 + :n: 8080 + :acc: 0.63 + :tpr: 0.61 + :fpr: 0.36 + :tnr: 0.64 + :ppv: 0.63 + :npv: 0.63 + :acc_perc: 63 + :tpr_perc: 61 + :tnr_perc: 64 + :ppv_perc: 63 + :npv_perc: 63 diff --git a/Makefile b/Makefile index 5ceb4aa..fb5eb30 100644 --- a/Makefile +++ b/Makefile @@ -13,6 +13,7 @@ TEMPLATE_FILE_LATEX = pandoc-scholar.latex # Experiments # crossvalidations + LAZAR_CONFUSION_MATRIX_DIR = 10-fold-crossvalidations/lazar/crossvalidation/confusion_matrices LAZAR_PADEL_CONFUSION_MATRIX_DIR = 10-fold-crossvalidations/lazar-padel/crossvalidation/confusion_matrices R_CV_DIR = 10-fold-crossvalidations/R @@ -22,8 +23,10 @@ CONFUSION_MATRICES_DIR = 10-fold-crossvalidations/confusion-matrices CONFUSION_MATRICES = $(CONFUSION_MATRICES_DIR)/lazar-all.csv $(CONFUSION_MATRICES_DIR)/lazar-high-confidence.csv $(CONFUSION_MATRICES_DIR)/lazar-padel-all.csv $(CONFUSION_MATRICES_DIR)/lazar-padel-high-confidence.csv $(CONFUSION_MATRICES_DIR)/R-RF.csv $(CONFUSION_MATRICES_DIR)/R-SVM.csv $(CONFUSION_MATRICES_DIR)/R-DL.csv $(CONFUSION_MATRICES_DIR)/tensorflow-rf.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-lr.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-lr2.v3.csv $(CONFUSION_MATRICES_DIR)/tensorflow-nn.v3.csv CV_SUMMARY = 10-fold-crossvalidations/summary.yaml +PA_SUMMARY = pyrrolizidine-alkaloids/summary.yaml # PA predictions + PA_DIR = pyrrolizidine-alkaloids PA_LAZAR_DIR = $(PA_DIR)/lazar PA_R_DIR = $(PA_DIR)/R @@ -32,13 +35,14 @@ PA_TF_DIR = $(PA_DIR)/tensorflow PA_PREDICTIONS = $(PA_LAZAR_DIR)/pa-mp2d-predictions.csv $(PA_LAZAR_DIR)/pa-padel-predictions.csv $(PA_R_DIR)/PA.RF.outcome.csv $(PA_R_DIR)/PA.SVM.outcome.csv $(PA_R_DIR)/PA.DL.outcome.csv $(PA_TF_DIR)/pred.lr.v3-ext-Padel-2D.csv $(PA_TF_DIR)/pred.lr2.v3-ext-Padel-2D.csv $(PA_TF_DIR)/pred.rf.v3-ext-Padel-2D.csv $(PA_TF_DIR)/pred.nn.v3-ext-Padel-2D.csv # manuscript + TABLES = tables/lazar-summary.csv tables/r-summary.csv tables/tensorflow-summary.csv tables/pa-tab.tex FIGURES = figures/roc.png figures/tsne-mp2d.png figures/tsne-padel.png -all: $(TABLES) $(FIGURES) mutagenicity.pdf +all: $(TABLES) $(FIGURES) $(CV_SUMMARY) mutagenicity.pdf include $(PANDOC_SCHOLAR_PATH)/Makefile -mutagenicity.mustache.md: $(CV_SUMMARY) mutagenicity.md $(TABLES) $(FIGURES) +mutagenicity.mustache.md: $(CV_SUMMARY) $(PA_SUMMARY) mutagenicity.md $(TABLES) $(FIGURES) mustache $^ > $@ # figures @@ -62,6 +66,7 @@ figures/roc.csv: $(CV_SUMMARY) scripts/summary2roc.rb $< > $@ # tables + tables/pa-tab.tex: tables/pa-table.csv scripts/pa-tex-table.rb $< > $@ @@ -77,10 +82,10 @@ tables/r-summary.csv: $(CV_SUMMARY) tables/tensorflow-summary.csv: $(CV_SUMMARY) scripts/summary2table.rb tensorflow > $@ -# crossvalidation summary +# PA summary -$(CV_SUMMARY): $(CONFUSION_MATRICES) - scripts/confusion-matrix-summary.rb $^ > $@ +$(PA_SUMMARY): tables/pa-table.csv + scripts/pa-summary.rb $< > $@ # PA predictions @@ -93,9 +98,15 @@ $(PA_LAZAR_DIR)/pa-mp2d-predictions.csv: $(PA_LAZAR_DIR)/pa-smiles.csv $(PA_LAZAR_DIR)/pa-smiles.csv: pyrrolizidine-alkaloids/180920_PA_complete_SMILES.csv cut -f1,4 -d ';' $< | sed 's/;/,/' > $@ +# crossvalidation summary + +$(CV_SUMMARY): $(CONFUSION_MATRICES) + scripts/confusion-matrix-summary.rb $^ > $@ + # confusion matrices ## lazar + $(CONFUSION_MATRICES_DIR)/lazar-all.csv: $(LAZAR_CONFUSION_MATRIX_DIR) cp $ $@ @@ -119,6 +131,7 @@ $(CONFUSION_MATRICES_DIR)/R-DL.csv: $(R_CV_DIR)/Sgl-Observations-DL.csv scripts/cv-r-confusion-matrix.rb $< > $@ ## tensorflow + $(TENSORFLOW_CV_DIR)/pred.lr.v3.norm.sorted.csv: $(TENSORFLOW_CV_DIR)/pred.lr.v3.norm.csv sort -n $< > $@ diff --git a/figures/roc.png b/figures/roc.png index 732299b..24a9dfb 100644 Binary files a/figures/roc.png and b/figures/roc.png differ diff --git a/mutagenicity.md b/mutagenicity.md index 9f7e349..c278142 100644 --- a/mutagenicity.md +++ b/mutagenicity.md @@ -42,7 +42,7 @@ Abstract Random forest, support vector machine, logistic regression, neural networks and k-nearest neighbor (`lazar`) algorithms, were applied to new *Salmonella* mutagenicity dataset with 8309 unique chemical structures. The best prediction accuracies in -10-fold-crossvalidation were obtained with `lazar` models and MolPrint2D descriptors, that gave accuracies ({{lazar-high-confidence.acc_perc}}%) +10-fold-crossvalidation were obtained with `lazar` models and MolPrint2D descriptors, that gave accuracies ({{cv.lazar-high-confidence.acc_perc}}%) similar to the interlaboratory variability of the Ames test. **TODO**: PA results @@ -497,13 +497,15 @@ Crossvalidation results are summarized in the following tables: @tbl:lazar shows Confusion matrices for all models are available from the git repository http://git.in-silico.ch/mutagenicity-paper/10-fold-crossvalidations/confusion-matrices/, individual predictions can be found in http://git.in-silico.ch/mutagenicity-paper/10-fold-crossvalidations/predictions/. -The most accurate crossvalidation predictions have been obtained with standard `lazar` models using MolPrint2D descriptors ({{lazar-high-confidence.acc}} for predictions with high confidence, {{lazar-all.acc}} for all predictions). Models utilizing PaDEL descriptors have generally lower accuracies ranging from {{R-DL.acc}} (R deep learning) to {{R-RF.acc}} (R/Tensorflow random forests). Sensitivity and specificity is generally well balanced with the exception of `lazar`-PaDEL (low sensitivity) and R deep learning (low specificity) models. +The most accurate crossvalidation predictions have been obtained with standard `lazar` models using MolPrint2D descriptors ({{cv.lazar-high-confidence.acc}} for predictions with high confidence, {{cv.lazar-all.acc}} for all predictions). Models utilizing PaDEL descriptors have generally lower accuracies ranging from {{cv.R-DL.acc}} (R deep learning) to {{cv.R-RF.acc}} (R/Tensorflow random forests). Sensitivity and specificity is generally well balanced with the exception of `lazar`-PaDEL (low sensitivity) and R deep learning (low specificity) models. Pyrrolizidine alkaloid mutagenicity predictions ----------------------------------------------- Mutagenicity predictions from all investigated models for 602 pyrrolizidine alkaloids are summarized in Table 4. +**TODO** **Verena und Philipp** Koennt Ihr bitte stichprobenweise die Tabelle ueberpruefen, mir verrutscht bei der Auswertung immer gerne etwas. + \input{tables/pa-tab.tex} Training data and @@ -546,16 +548,16 @@ models have low specificity. The accuracy of `lazar` *in-silico* predictions are comparable to the interlaboratory variability of the Ames test (80-85% according to @Benigni1988), especially for predictions with high confidence -({{lazar-high-confidence.acc_perc}}%). This is a clear indication that +({{cv.lazar-high-confidence.acc_perc}}%). This is a clear indication that *in-silico* predictions can be as reliable as the bioassays, if the compounds are close to the applicability domain. This conclusion is also supported by our analysis of `lazar` lowest observed effect level predictions, which are also similar to the experimental variability (@Helma2018). -The lowest number of predictions ({{lazar-padel-high-confidence.n}}) has been +The lowest number of predictions ({{cv.lazar-padel-high-confidence.n}}) has been obtained from `lazar`-PaDEL high confidence predictions, the largest number of -predictions comes from Tensorflow models ({{tensorflow-rf.v3.n}}). Standard -`lazar` give a slightly lower number of predictions ({{lazar-all.n}}) than R +predictions comes from Tensorflow models ({{cv.tensorflow-rf.v3.n}}). Standard +`lazar` give a slightly lower number of predictions ({{cv.lazar-all.n}}) than R and Tensorflow models. This is not necessarily a disadvantage, because `lazar` abstains from predictions, if the query compound is very dissimilar from the compounds in the training set and thus avoids to make predictions for compounds @@ -751,7 +753,7 @@ A new public *Salmonella* mutagenicity training dataset with 8309 compounds was created and used it to train `lazar`, R and Tensorflow models with MolPrint2D and PaDEL descriptors. The best performance was obtained with `lazar` models using MolPrint2D descriptors, with prediction accuracies -({{lazar-high-confidence.acc_perc}}%) comparable to the interlaboratory variability +({{cv.lazar-high-confidence.acc_perc}}%) comparable to the interlaboratory variability of the Ames test (80-85%). Models based on PaDEL descriptors had lower accuracies than MolPrint2D models, but only the `lazar` algorithm could use MolPrint2D descriptors. diff --git a/pyrrolizidine-alkaloids/summary.yaml b/pyrrolizidine-alkaloids/summary.yaml new file mode 100644 index 0000000..66c5030 --- /dev/null +++ b/pyrrolizidine-alkaloids/summary.yaml @@ -0,0 +1,83 @@ +--- +:pa: + :n: 602 + :lazar: + :mp2d: + :all: + :n: 560 + :mut: 111 + :non_mut: 449 + :n_perc: 93 + :mut_perc: 19 + :non_mut_perc: 80 + :high_confidence: + :n: 301 + :mut: 76 + :non_mut: 225 + :n_perc: 50 + :mut_perc: 25 + :non_mut_perc: 74 + :padel: + :all: + :n: 600 + :mut: 83 + :non_mut: 517 + :n_perc: 99 + :mut_perc: 13 + :non_mut_perc: 86 + :high_confidence: + :n: 0 + :mut: 0 + :non_mut: 0 + :n_perc: 0 + :r: + :rf: + :n: 602 + :mut: 18 + :non_mut: 584 + :n_perc: 100 + :mut_perc: 2 + :non_mut_perc: 97 + :svm: + :n: 602 + :mut: 11 + :non_mut: 591 + :n_perc: 100 + :mut_perc: 1 + :non_mut_perc: 98 + :dl: + :n: 602 + :mut: 521 + :non_mut: 81 + :n_perc: 100 + :mut_perc: 86 + :non_mut_perc: 13 + :tf: + :rf: + :n: 602 + :mut: 186 + :non_mut: 416 + :n_perc: 100 + :mut_perc: 30 + :non_mut_perc: 69 + :lr_sgd: + :n: 602 + :mut: 286 + :non_mut: 316 + :n_perc: 100 + :mut_perc: 47 + :non_mut_perc: 52 + :lr_scikit: + :n: 602 + :mut: 395 + :non_mut: 207 + :n_perc: 100 + :mut_perc: 65 + :non_mut_perc: 34 + :nn: + :n: 602 + :mut: 295 + :non_mut: 307 + :n_perc: 100 + :mut_perc: 49 + :non_mut_perc: 50 diff --git a/scripts/confusion-matrix-summary.rb b/scripts/confusion-matrix-summary.rb index 129d69a..8a32f79 100755 --- a/scripts/confusion-matrix-summary.rb +++ b/scripts/confusion-matrix-summary.rb @@ -30,5 +30,5 @@ ARGV.each do |f| } results[File.basename(f,".csv")] = result end - +results = {:cv => results} puts results.to_yaml diff --git a/scripts/summary2roc.rb b/scripts/summary2roc.rb index e50d97a..e692d74 100755 --- a/scripts/summary2roc.rb +++ b/scripts/summary2roc.rb @@ -1,7 +1,7 @@ #!/usr/bin/env ruby require "yaml" -data = YAML.load(File.read ARGV[0]) +data = YAML.load(File.read ARGV[0])[:cv] puts "tpr,fpr" data.each do |algo,values| algo = algo.sub("tensorflow","Tensorflow").sub("selected","FS").sub(".v3","").sub("-all"," (all)").sub("-high-confidence"," (high confidence)").sub("padel","PaDEL").sub("lazar ","lazar-MP2D ").sub("lr2","LR-scikit").sub("lr","LR-sgd").sub("nn","NN").sub("-rf","-RF") diff --git a/scripts/summary2table.rb b/scripts/summary2table.rb index 555097c..267bb97 100755 --- a/scripts/summary2table.rb +++ b/scripts/summary2table.rb @@ -2,7 +2,7 @@ require 'yaml' rows = {:acc => "Accuracy", :tpr => "True positive rate/Sensitivity", :tnr => "True negative rate/Specificity", :ppv => "Positive predictive value/Precision", :npv => "Negative predictive value", :n => "Nr. predictions"} -data = YAML.load_file "10-fold-crossvalidations/summary.yaml" +data = YAML.load_file("10-fold-crossvalidations/summary.yaml")[:cv] case ARGV[0] when "R" -- cgit v1.2.3