From 0b686f924a42105f2516aea44c27b6d3f75e1672 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 20 Oct 2020 20:42:54 +0200 Subject: Summary table of PA predictions --- Makefile | 6 +- mutagenicity.md | 38 +++++-- pyrrolizidine-alkaloids/summary.yaml | 24 +++-- scripts/pa-summary-table.rb | 18 ++++ scripts/pa-summary.rb | 190 +++++++++++++++++++++++++++++++++++ tables/pa-summary.csv | 12 +++ 6 files changed, 266 insertions(+), 22 deletions(-) create mode 100755 scripts/pa-summary-table.rb create mode 100755 scripts/pa-summary.rb create mode 100644 tables/pa-summary.csv diff --git a/Makefile b/Makefile index fb5eb30..c63cdf5 100644 --- a/Makefile +++ b/Makefile @@ -36,7 +36,7 @@ PA_PREDICTIONS = $(PA_LAZAR_DIR)/pa-mp2d-predictions.csv $(PA_LAZAR_DIR)/pa-pade # manuscript -TABLES = tables/lazar-summary.csv tables/r-summary.csv tables/tensorflow-summary.csv tables/pa-tab.tex +TABLES = tables/lazar-summary.csv tables/r-summary.csv tables/tensorflow-summary.csv tables/pa-tab.tex tables/pa-summary.csv FIGURES = figures/roc.png figures/tsne-mp2d.png figures/tsne-padel.png all: $(TABLES) $(FIGURES) $(CV_SUMMARY) mutagenicity.pdf @@ -67,6 +67,10 @@ figures/roc.csv: $(CV_SUMMARY) # tables +# The generator script is listed as a prerequisite (after the data, so $< is unchanged) to rebuild the table whenever the script changes. +tables/pa-summary.csv: $(PA_SUMMARY) scripts/pa-summary-table.rb + scripts/pa-summary-table.rb $< > $@ + tables/pa-tab.tex: tables/pa-table.csv scripts/pa-tex-table.rb $< > $@ diff --git a/mutagenicity.md b/mutagenicity.md index c278142..d05cbc7 100644 --- a/mutagenicity.md +++ b/mutagenicity.md @@ -478,7 +478,9 @@ Results 10-fold crossvalidations ------------------------ -Crossvalidation results are summarized in the following tables: @tbl:lazar shows `lazar` results with MolPrint2D and PaDEL descriptors, @tbl:R R results and @tbl:tensorflow Tensorflow results. +Crossvalidation results are summarized in the following tables: @tbl:lazar +shows `lazar` results with MolPrint2D and PaDEL descriptors, @tbl:R R results +and @tbl:tensorflow Tensorflow results. 
```{#tbl:lazar .table file="tables/lazar-summary.csv" caption="Summary of lazar crossvalidation results (all/high confidence predictions)"} @@ -494,25 +496,41 @@ Crossvalidation results are summarized in the following tables: @tbl:lazar shows ![ROC plot of crossvalidation results.](figures/roc.png){#fig:roc} -Confusion matrices for all models are available from the git repository http://git.in-silico.ch/mutagenicity-paper/10-fold-crossvalidations/confusion-matrices/, individual predictions can be found in -http://git.in-silico.ch/mutagenicity-paper/10-fold-crossvalidations/predictions/. +Confusion matrices for all models are available from the git repository +https://git.in-silico.ch/mutagenicity-paper/10-fold-crossvalidations/confusion-matrices/, +individual predictions can be found in +https://git.in-silico.ch/mutagenicity-paper/10-fold-crossvalidations/predictions/. -The most accurate crossvalidation predictions have been obtained with standard `lazar` models using MolPrint2D descriptors ({{cv.lazar-high-confidence.acc}} for predictions with high confidence, {{cv.lazar-all.acc}} for all predictions). Models utilizing PaDEL descriptors have generally lower accuracies ranging from {{cv.R-DL.acc}} (R deep learning) to {{cv.R-RF.acc}} (R/Tensorflow random forests). Sensitivity and specificity is generally well balanced with the exception of `lazar`-PaDEL (low sensitivity) and R deep learning (low specificity) models. +The most accurate crossvalidation predictions have been obtained with standard +`lazar` models using MolPrint2D descriptors ({{cv.lazar-high-confidence.acc}} +for predictions with high confidence, {{cv.lazar-all.acc}} for all +predictions). Models utilizing PaDEL descriptors have generally lower +accuracies ranging from {{cv.R-DL.acc}} (R deep learning) to {{cv.R-RF.acc}} +(R/Tensorflow random forests). 
Sensitivity and specificity are generally well +balanced with the exception of `lazar`-PaDEL (low sensitivity) and R deep +learning (low specificity) models. Pyrrolizidine alkaloid mutagenicity predictions ----------------------------------------------- -Mutagenicity predictions from all investigated models for 602 pyrrolizidine alkaloids are summarized in Table 4. +Mutagenicity predictions from all investigated models for 602 pyrrolizidine +alkaloids (PAs) are summarized in Table 4. A CSV table with all predictions can be +downloaded from https://git.in-silico.ch/mutagenicity-paper/tables/pa-table.csv. **TODO** **Verena und Philipp** Koennt Ihr bitte stichprobenweise die Tabelle ueberpruefen, mir verrutscht bei der Auswertung immer gerne etwas. \input{tables/pa-tab.tex} -Training data and -pyrrolizidine alkaloids were visualised with t-distributed stochastic neighbor embedding (t-SNE, @Maaten2008) -for MolPrint2D and PaDEL descriptors. t-SNA maps each high-dimensional object -(chemical) to a two-dimensional point. Similar objects are represented by -nearby points and dissimilar objects are represented by distant points. +```{#tbl:pa-summary .table file="tables/pa-summary.csv" caption="Summary of pyrrolizidine alkaloid mutagenicity predictions"} +``` + +For the visualisation of the position of pyrrolizidine alkaloids with respect to +the training data set we have applied t-distributed stochastic neighbor +embedding (t-SNE, @Maaten2008) for MolPrint2D and PaDEL descriptors. t-SNE +maps each high-dimensional object (chemical) to a two-dimensional point, +maintaining the high-dimensional distances of the objects. Similar objects are +represented by nearby points and dissimilar objects are represented by distant +points. @fig:tsne-mp2d shows the t-SNE of pyrrolizidine alkaloids (PA) and the mutagenicity training data in MP2D space (Tanimoto/Jaccard similarity). 
diff --git a/pyrrolizidine-alkaloids/summary.yaml b/pyrrolizidine-alkaloids/summary.yaml
index 66c5030..9c3a39f 100644
--- a/pyrrolizidine-alkaloids/summary.yaml
+++ b/pyrrolizidine-alkaloids/summary.yaml
@@ -8,7 +8,7 @@
         :mut: 111
         :non_mut: 449
         :n_perc: 93
-        :mut_perc: 19
+        :mut_perc: 20
         :non_mut_perc: 80
       :high_confidence:
         :n: 301
@@ -16,41 +16,43 @@
         :non_mut: 225
         :n_perc: 50
         :mut_perc: 25
-        :non_mut_perc: 74
+        :non_mut_perc: 75
     :padel:
       :all:
         :n: 600
         :mut: 83
         :non_mut: 517
-        :n_perc: 99
-        :mut_perc: 13
+        :n_perc: 100
+        :mut_perc: 14
         :non_mut_perc: 86
       :high_confidence:
         :n: 0
         :mut: 0
         :non_mut: 0
         :n_perc: 0
+        :mut_perc: 0
+        :non_mut_perc: 0
   :r:
     :rf:
       :n: 602
       :mut: 18
       :non_mut: 584
       :n_perc: 100
-      :mut_perc: 2
+      :mut_perc: 3
       :non_mut_perc: 97
     :svm:
       :n: 602
       :mut: 11
       :non_mut: 591
       :n_perc: 100
-      :mut_perc: 1
+      :mut_perc: 2
       :non_mut_perc: 98
     :dl:
       :n: 602
       :mut: 521
       :non_mut: 81
       :n_perc: 100
-      :mut_perc: 86
+      :mut_perc: 87
       :non_mut_perc: 13
   :tf:
     :rf:
@@ -58,21 +60,21 @@
       :mut: 186
       :non_mut: 416
       :n_perc: 100
-      :mut_perc: 30
+      :mut_perc: 31
       :non_mut_perc: 69
     :lr_sgd:
       :n: 602
       :mut: 286
       :non_mut: 316
       :n_perc: 100
-      :mut_perc: 47
+      :mut_perc: 48
       :non_mut_perc: 52
     :lr_scikit:
       :n: 602
       :mut: 395
       :non_mut: 207
       :n_perc: 100
-      :mut_perc: 65
+      :mut_perc: 66
       :non_mut_perc: 34
     :nn:
       :n: 602
@@ -80,4 +82,4 @@
       :non_mut: 307
       :n_perc: 100
       :mut_perc: 49
-      :non_mut_perc: 50
+      :non_mut_perc: 51
diff --git a/scripts/pa-summary-table.rb b/scripts/pa-summary-table.rb
new file mode 100755
index 0000000..48546bd
--- /dev/null
+++ b/scripts/pa-summary-table.rb
@@ -0,0 +1,29 @@
+#!/usr/bin/env ruby
+# Print the summary table of PA mutagenicity predictions as CSV.
+# Usage: pa-summary-table.rb summary.yaml > pa-summary.csv
+# Expects the structure written by scripts/pa-summary.rb (Symbol keys below :pa).
+require 'yaml'
+
+# Table rows in output order: row label => key path below data[:pa]
+ROWS = {
+  "lazar-MP2D (all)" => [:lazar, :mp2d, :all],
+  "lazar-MP2D (high-confidence)" => [:lazar, :mp2d, :high_confidence],
+  "lazar-PaDEL (all)" => [:lazar, :padel, :all],
+  "lazar-PaDEL (high-confidence)" => [:lazar, :padel, :high_confidence],
+  "R-RF" => [:r, :rf],
+  "R-SVM" => [:r, :svm],
+  "R-DL" => [:r, :dl],
+  "Tensorflow-RF" => [:tf, :rf],
+  "Tensorflow-LR-sgd" => [:tf, :lr_sgd],
+  "Tensorflow-LR-scikit" => [:tf, :lr_scikit],
+  "Tensorflow-NN" => [:tf, :nn],
+}
+
+abort "Usage: #{$0} summary.yaml" unless ARGV[0]
+data = YAML.load_file(ARGV[0])
+
+puts "Model,Nr.predictions,mutagenic,non-mutagenic"
+ROWS.each do |label, path|
+  s = data[:pa].dig(*path)
+  puts "#{label},#{s[:n]} (#{s[:n_perc]} %),#{s[:mut]} (#{s[:mut_perc]} %),#{s[:non_mut]} (#{s[:non_mut_perc]} %)"
+end
diff --git a/scripts/pa-summary.rb b/scripts/pa-summary.rb
new file mode 100755
index 0000000..0715a6c
--- /dev/null
+++ b/scripts/pa-summary.rb
@@ -0,0 +1,75 @@
+#!/usr/bin/env ruby
+# Summarise the PA mutagenicity predictions of all models as YAML.
+# Usage: pa-summary.rb predictions.csv > summary.yaml
+# Input columns: id,cid,smi,cansmi,exp followed by one column per model
+# ("1" = mutagenic, "0" = non-mutagenic, anything else = no prediction) and,
+# for lazar, a high-confidence flag ("T").
+require 'yaml'
+
+# Fresh set of counters for one model.
+def counts
+  { :n => 0, :mut => 0, :non_mut => 0 }
+end
+
+# Rounded percentage of part in total. Returns 0 for an empty total, because
+# 100.0*0/0 is NaN and NaN.round raises FloatDomainError.
+def percent(part, total)
+  total == 0 ? 0 : (100.0 * part / total).round
+end
+
+# Count a single prediction in counter set c.
+# Returns true if the value was a prediction ("1" or "0"), false otherwise.
+def tally(c, prediction)
+  case prediction
+  when "1" then c[:mut] += 1
+  when "0" then c[:non_mut] += 1
+  else return false
+  end
+  c[:n] += 1
+  true
+end
+
+# Add :n_perc (relative to all compounds) and :mut_perc/:non_mut_perc
+# (relative to the predictions of this model) to counter set c.
+def add_percentages(c, total)
+  c[:n_perc] = percent(c[:n], total)
+  c[:mut_perc] = percent(c[:mut], c[:n])
+  c[:non_mut_perc] = percent(c[:non_mut], c[:n])
+end
+
+# Key order determines the layout of the YAML output.
+summary = {
+  :n => 0,
+  :lazar => {
+    :mp2d => { :all => counts, :high_confidence => counts },
+    :padel => { :all => counts, :high_confidence => counts },
+  },
+  :r => { :rf => counts, :svm => counts, :dl => counts },
+  :tf => { :rf => counts, :lr_sgd => counts, :lr_scikit => counts, :nn => counts },
+}
+
+abort "Usage: #{$0} predictions.csv" unless ARGV[0]
+
+File.foreach(ARGV[0]) do |l|
+  next if l.match("SMILES") # skip the header (any line containing SMILES)
+  _id,_cid,_smi,_cansmi,_exp,lazar_MP2D,lazar_MP2D_high_confidence,lazar_PaDEL,lazar_PaDEL_high_confidence,r_DL,r_RF,r_SVM,tf_lr_sgd,tf_lr_scikit,tf_NN,tf_RF = l.chomp.split(",")
+
+  # lazar: high-confidence predictions are counted in addition to :all
+  { :mp2d => [lazar_MP2D, lazar_MP2D_high_confidence],
+    :padel => [lazar_PaDEL, lazar_PaDEL_high_confidence] }.each do |descriptors, (prediction, high_confidence)|
+    counted = tally(summary[:lazar][descriptors][:all], prediction)
+    tally(summary[:lazar][descriptors][:high_confidence], prediction) if counted && high_confidence == "T"
+  end
+  { :dl => r_DL, :rf => r_RF, :svm => r_SVM }.each do |model, prediction|
+    tally(summary[:r][model], prediction)
+  end
+  { :lr_sgd => tf_lr_sgd, :lr_scikit => tf_lr_scikit, :rf => tf_RF, :nn => tf_NN }.each do |model, prediction|
+    tally(summary[:tf][model], prediction)
+  end
+  summary[:n] += 1
+end
+
+total = summary[:n]
+summary[:lazar].each_value { |by_confidence| by_confidence.each_value { |c| add_percentages(c, total) } }
+(summary[:r].values + summary[:tf].values).each { |c| add_percentages(c, total) }
+puts({ :pa => summary }.to_yaml)
diff --git a/tables/pa-summary.csv b/tables/pa-summary.csv
new file mode 100644
index 0000000..0bc0e97
--- /dev/null
+++ b/tables/pa-summary.csv
@@ -0,0 +1,12 @@
+Model,Nr.predictions,mutagenic,non-mutagenic
+lazar-MP2D (all),560 (93 %),111 (20 %),449 (80 %)
+lazar-MP2D (high-confidence),301 (50 %),76 (25 %),225 (75 %)
+lazar-PaDEL (all),600 (100 %),83 (14 %),517 (86 %)
+lazar-PaDEL (high-confidence),0 (0 %),0 (0 %),0 (0 %)
+R-RF,602 (100 %),18 (3 %),584 (97 %)
+R-SVM,602 (100 %),11 (2 %),591 (98 %)
+R-DL,602 (100 %),521 (87 %),81 (13 %)
+Tensorflow-RF,602 (100 %),186 (31 %),416 (69 %)
+Tensorflow-LR-sgd,602 (100 %),286 (48 %),316 (52 %)
+Tensorflow-LR-scikit,602 (100 %),395 (66 %),207 (34 %)
+Tensorflow-NN,602 (100 %),295 (49 %),307 (51 %)
-- 
cgit v1.2.3