From e12b47c62046d099c6ac90a9a9e01942ba2c0a98 Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Tue, 6 Oct 2020 23:35:58 +0200
Subject: methods and results updated

---
 Gemfile          |   2 +-
 Gemfile.lock     |   2 -
 figures/roc.png  | Bin 76960 -> 75787 bytes
 gemset.nix       |  12 +--
 mutagenicity.md  | 272 +++++++++++++++++++++++++++++++++----------------------
 mutagenicity.pdf | Bin 0 -> 1739347 bytes
 6 files changed, 167 insertions(+), 121 deletions(-)
 create mode 100644 mutagenicity.pdf

diff --git a/Gemfile b/Gemfile
index bf0d961..7659d00 100644
--- a/Gemfile
+++ b/Gemfile
@@ -1,4 +1,4 @@
 source 'https://rubygems.org' do
       gem 'mustache'
-      gem 'openbabel'
+      #gem 'openbabel'
 end
diff --git a/Gemfile.lock b/Gemfile.lock
index e82d9b2..298fd25 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -2,14 +2,12 @@ GEM
   remote: https://rubygems.org/
   specs:
     mustache (1.1.1)
-    openbabel (2.4.90.0)
 
 PLATFORMS
   ruby
 
 DEPENDENCIES
   mustache!
-  openbabel!
 
 BUNDLED WITH
    2.1.4
diff --git a/figures/roc.png b/figures/roc.png
index 4e8f388..a7cb04f 100644
Binary files a/figures/roc.png and b/figures/roc.png differ
diff --git a/gemset.nix b/gemset.nix
index 18adbb1..a1bca28 100644
--- a/gemset.nix
+++ b/gemset.nix
@@ -9,14 +9,4 @@
     };
     version = "1.1.1";
   };
-  openbabel = {
-    groups = ["default"];
-    platforms = [];
-    source = {
-      remotes = ["https://rubygems.org"];
-      sha256 = "0fancy2yh4y52ywkn0j2v9sd9n854l2gir4a4xl7s07wvg8l52s6";
-      type = "gem";
-    };
-    version = "2.4.90.0";
-  };
-}
\ No newline at end of file
+}
diff --git a/mutagenicity.md b/mutagenicity.md
index 321a460..418c2d1 100644
--- a/mutagenicity.md
+++ b/mutagenicity.md
@@ -1,5 +1,8 @@
 ---
-title: A comparison of random forest, support vector machine, linear regression, deep learning and lazar algorithms for predicting the mutagenic potential of different pyrrolizidine alkaloids 
+title: A comparison of twelve machine learning models based on an expanded mutagenicity dataset and their application for predicting pyrrolizidine alkaloid mutagenicity
+# TODO check # algorithms
+
+#title: A comparison of random forest, support vector machine, linear regression, deep learning and lazar algorithms for predicting the mutagenic potential of different pyrrolizidine alkaloids 
 #subtitle: Performance comparison with a new expanded dataset
 author:
   - Christoph Helma:
@@ -38,17 +41,20 @@ header-includes:
 Abstract
 ========
 
-Random forest, support vector machine, deep learning and k-nearest neighbor
+<!---
+Random forest, support vector machine, linear regression, deep learning and k-nearest neighbor
 (`lazar`) algorithms, were applied to new *Salmonella* mutagenicity dataset
 with 8309 unique chemical structures. The best prediction accuracies in
 10-fold-crossvalidation were obtained with `lazar` models, that gave accuracies
 similar to the interlaboratory variability of the Ames test.
+--->
 
 Introduction
 ============
 
 TODO
 
+<!---
 Pyrrolizidine alkaloids (PAs) are secondary plant ingredients found in
 many plant species as protection against predators [Hartmann & Witte
 1995](#_ENREF_59)[Langel et al. 2011](#_ENREF_76)(; ). PAs are ester
@@ -102,13 +108,23 @@ these PAs. This is also true for *in vitro* and *in vivo* tests on
 mutagenicity and genotoxicity. To gain a wider perspective, in this
 study over 600 different PAs were assessed on their mutagenic potential
 using four different machine learning techniques.
+--->
+
+<!---
 
+Mutagenicity datasets
+Algorithms
+descriptors
+define abbreviations
+pyrrolizidine 
+--->
 
 The main objectives of this study were
 
   - to generate a new training dataset, by combining the most comprehensive public mutagenicity datasets
-  - to compare the performance of global models (RF, SVM, Neural Nets) with local models (`lazar`)
-  - to apply these models for the prediction of the mutagenicity of pyrrolizidine alkaloids
+  - to compare the performance of global models (RF, SVM, LR, NN) with local models (`lazar`)
+  - to compare the performance of MolPrint2D fingerprints with PaDEL descriptors
+  - to apply these models for the prediction of pyrrolizidine alkaloid mutagenicity
 
 Materials and Methods
 =====================
@@ -145,11 +161,16 @@ available from the git repository <https://git.in-silico.ch/mutagenicity-paper>
 under a GPL3 License. The new combined dataset can be found at
 <https://git.in-silico.ch/mutagenicity-paper/data/mutagenicity.csv>.
 
-### Pyrrolizidine dataset
+### Pyrrolizidine alkaloid (PA) dataset
 
 The testing dataset consisted of 602 different PAs. The compilation of
 the PA dataset is described in detail in [Schöning et al.
-(2017)](#_ENREF_119). The PAs were assigned to groups according to
+(2017)](#_ENREF_119).
+
+TODO: **Verena** Quellen und Auswahlkriterien
+
+<!---
+The PAs were assigned to groups according to
 structural features of the necine base and necic acid.
 
 For the necine base, following groups were assigned:
@@ -176,7 +197,62 @@ For the necic acid, following groups were assigned:
 -   Open-ring diester-type
 
 -   Macrocyclic diester-type
+--->
+
+Descriptors
+-----------
+
+### MolPrint2D fingerprints (*MP2D*)
+
+MolPrint2D fingerprints (@Bender2004) use atom environments as molecular
+representation.  They determine for each atom in a molecule, the atom types of
+its connected atoms to represent their chemical environment.  This resembles
+basically the chemical concept of functional groups.
+
+In contrast to predefined lists of fragments (e.g. FP3, FP4 or MACCS
+fingerprints) or descriptors (e.g. PaDEL) they are generated dynamically from
+chemical structures. This has the advantage that they can capture substructures
+of toxicological relevance that are not included in other descriptors. 
+
+Chemical similarities (e.g. Tanimoto indices) can be calculated very
+efficiently with MolPrint2D fingerprints. Using them as descriptors for global
+models leads however to huge, sparsely populated matrices that cannot be
+handled with traditional machine learning algorithms. In our experiments none
+of the R and Tensorflow algorithms was capable of using them as descriptors.
+
+MolPrint2D fingerprints were calculated with the OpenBabel cheminformatics
+library (@OBoyle2011a).
 
+### PaDEL descriptors
+
+For R and Tensorflow models, molecular 1D and 2D descriptors were calculated
+with the PaDEL-Descriptors program (<http://www.yapcwsoft.com> version 2.21, @Yap2011). 
+
+As the training dataset contained over 8309 instances, it was decided to
+delete instances with missing values during data pre-processing.
+Furthermore, substances with equivocal outcome were removed. The final
+training dataset contained 8080 instances with known mutagenic
+potential.
+
+During feature
+selection, descriptors with near zero variance were removed using the
+'*nearZeroVar*'-function (package 'caret'). If the percentage of the
+most common value was more than 90% or when the frequency ratio of the
+most common value to the second most common value was greater than 95:5
+(e.g. 95 instances of the most common value and only 5 or fewer instances
+of the second most common value), a descriptor was classified as having
+a near zero variance. After that, highly correlated descriptors were
+removed using the '*findCorrelation*'-function (package 'caret') with a
+cut-off of 0.9. This resulted in a training dataset with 516
+descriptors. These descriptors were scaled to be in the range between 0
+and 1 using the '*preProcess*'-function (package 'caret'). The scaling
+routine was saved in order to apply the same scaling on the testing
+dataset. As these three steps did not consider the outcome, it was
+decided that they do not need to be included in the cross-validation of
+the model. To further reduce the number of features, a LASSO (*least
+absolute shrinkage and selection operator*) regression was performed
+using the '*glmnet*'-function (package '*glmnet*'). The reduced dataset
+was used for the generation of the pre-trained models.
 
 Algorithms
 ----------
@@ -207,30 +283,22 @@ sections.
 
 #### Neighbour identification
 
-Similarity calculations were based on MolPrint2D fingerprints (*MP2D*,
-@Bender2004) from the OpenBabel cheminformatics library (@OBoyle2011a). The
-MolPrint2D fingerprint uses atom environments as molecular representation,
-which resembles basically the chemical concept of functional groups. For each
-atom in a molecule, it represents the chemical environment using the atom types
-of connected atoms.
+Utilizing this modularity, similarity calculations were based both on
+MolPrint2D fingerprints and on PaDEL descriptors.
 
-MolPrint2D fingerprints are generated dynamically from chemical
-structures and do not rely on predefined lists of fragments (such as
-OpenBabel FP3, FP4 or MACCs fingerprints or lists of
-toxicophores/toxicophobes). This has the advantage that they may capture
-substructures of toxicological relevance that are not included in other
-fingerprints.
+For MolPrint2D fingerprints chemical similarity between two compounds $a$ and
+$b$ is expressed as the proportion between atom environments common in both
+structures $A \cap B$ and the total number of atom environments $A \cup B$
+(Jaccard/Tanimoto index).
 
-From MolPrint2D fingerprints a feature vector with all atom environments
-of a compound can be constructed that can be used to calculate chemical
-similarities.
+$$sim = \frac{\lvert A\  \cap B \rvert}{\lvert A\  \cup B \rvert}$$
 
-The chemical similarity between two compounds $a$ and $b$ is expressed as
-the proportion between atom environments common in both structures $A \cap B$
-and the total number of atom environments $A \cup B$ (Jaccard/Tanimoto
-index).
+For PaDEL descriptors chemical similarity between two compounds $a$ and $b$ is
+expressed as the cosine similarity between the descriptor vectors $A$ for $a$
+and $B$ for $b$.
+
+$$sim = \frac{A \cdot B}{\lvert A \rvert \lvert B \rvert}$$
 
-$$sim = \frac{\left| A\  \cap B \right|}{\left| A\  \cup B \right|}$$
 
 Threshold selection is a trade-off between prediction accuracy (high
 threshold) and the number of predictable compounds (low threshold). As
@@ -302,44 +370,10 @@ of individual neighbours.
 
 ### R Random Forest, Support Vector Machines, and Deep Learning
 
-#### PaDEL descriptors
-
-For Random Forest (RF), Support Vector Machines (SVM), and Deep
-Learning (DL) models, molecular descriptors were calculated
-with the PaDEL-Descriptors program (<http://www.yapcwsoft.com> version 2.21, @Yap2011). The same descriptors were used for TensorFlow models.
-
-TODO: **Verena** kannst Du bitte die PaDEL Deskriptoren etwas ausfuehrlicher beschreiben (welche Typen, Anzahl, Bedeutung etc)
-
-For the generation of these models, molecular 1D and 2D descriptors of
-the training dataset were calculated using PaDEL-Descriptors (<http://www.yapcwsoft.com> version
-2.21, @Yap2011).
-
-As the training dataset contained over 8309 instances, it was decided to
-delete instances with missing values during data pre-processing.
-Furthermore, substances with equivocal outcome were removed. The final
-training dataset contained 8080 instances with known mutagenic
-potential. The RF, SVM, and DL models were generated using the R
+The RF, SVM, and DL models were generated using the R
 software (R-project for Statistical Computing,
 <https://www.r-project.org/>*;* version 3.3.1), specific R packages used
-are identified for each step in the description below. During feature
-selection, descriptor with near zero variance were removed using
-'*NearZeroVar*'-function (package 'caret'). If the percentage of the
-most common value was more than 90% or when the frequency ratio of the
-most common value to the second most common value was greater than 95:5
-(e.g. 95 instances of the most common value and only 5 or less instances
-of the second most common value), a descriptor was classified as having
-a near zero variance. After that, highly correlated descriptors were
-removed using the '*findCorrelation*'-function (package 'caret') with a
-cut-off of 0.9. This resulted in a training dataset with 516
-descriptors. These descriptors were scaled to be in the range between 0
-and 1 using the '*preProcess*'-function (package 'caret'). The scaling
-routine was saved in order to apply the same scaling on the testing
-dataset. As these three steps did not consider the outcome, it was
-decided that they do not need to be included in the cross-validation of
-the model. To further reduce the number of features, a LASSO (*least
-absolute shrinkage and selection operator*) regression was performed
-using the '*glmnet*'-function (package '*glmnet*'). The reduced dataset
-was used for the generation of the pre-trained models.
+are identified for each step in the description below. 
 
 #### Random Forest
 
@@ -378,6 +412,8 @@ validation data. This step was repeated 10 times.
 
 #### Applicability domain
 
+TODO: **Verena**: Mit welchen Deskriptoren hast Du den Jaccard index berechnet?  Fuer den Jaccard index braucht man binaere Deskriptoren (zB MP2D), mit PaDEL Deskriptoren koennte man zB eine euklidische oder cosinus Distanz berechnen.
+
 The AD of the training dataset and the PA dataset was evaluated using
 the Jaccard distance. A Jaccard distance of '0' indicates that the
 substances are similar, whereas a value of '1' shows that the substances
@@ -386,7 +422,13 @@ to the training dataset. Therefore, PA dataset is within the AD of the
 training dataset and the models can be used to predict the genotoxic
 potential of the PA dataset.
 
-### TensorFlow models
+#### Availability
+
+R scripts for these experiments can be found in <https://git.in-silico.ch/mutagenicity-paper/scripts/R>.
+
+### Tensorflow models
+
+TODO: **Philipp** bitte ergaenzen
 
 #### Logistic regression (SGD)
 
@@ -396,11 +438,11 @@ potential of the PA dataset.
 
 #### Deep Learning
 
-Alternatively, a DL model was established with Python-based TensorFlow
+Alternatively, a DL model was established with Python-based Tensorflow
 program (<https://www.tensorflow.org/>) using the high-level API Keras
 (<https://www.tensorflow.org/guide/keras>) to build the models. 
 
-TensorFlow models used the same PaDEL descriptors as the R models.
+Tensorflow models used the same PaDEL descriptors as the R models.
 
 Data pre-processing was done by rank transformation using the
 '*QuantileTransformer*' procedure. A sequential model has been used.
@@ -412,8 +454,7 @@ a L^2^-penalty of 0.001 was used for the input layer. For training of
 the model, the ADAM algorithm was used to minimise the cross-entropy
 loss using the default parameters of Keras. Training was performed for
 100 epochs with a batch size of 64. The model was implemented with
-Python 3.6 and Keras. For training of the model, a 10-fold
-cross-validation was used. 
+Python 3.6 and Keras. 
 
 TODO: **Philipp** kannst Du bitte ueberpruefen ob die Beschreibung noch stimmt
 und ob der Ablauf von Verena (Figure 1) auch fuer Deine Modelle gilt
@@ -421,11 +462,41 @@ und ob der Ablauf von Verena (Figure 1) auch fuer Deine Modelle gilt
 Validation
 ----------
 
+10-fold cross-validation was used for all Tensorflow models.
+
+#### Availability
+
+Jupyter notebooks for these experiments can be found in <https://git.in-silico.ch/mutagenicity-paper/scripts/tensorflow>.
+
 Results
 =======
 
-TODO: **Verena** und **Philipp**: koennt Ihr bitte gegenchecken, ob ich keine Zahlendreher in den Ergebnissen habe
+10-fold crossvalidations
+------------------------
+
+Crossvalidation results are summarized in the following tables: @tbl:lazar shows `lazar` results with MolPrint2D and PaDEL descriptors, @tbl:R summarizes R results and @tbl:tensorflow Tensorflow results.
+
+@fig:roc depicts the position of all crossvalidation results in receiver operating characteristic (ROC) space.
 
+Confusion matrices for all models are available from the git repository <http://git.in-silico.ch/mutagenicity-paper/10-fold-crossvalidations/confusion-matrices/>; individual predictions can be found in
+<http://git.in-silico.ch/mutagenicity-paper/10-fold-crossvalidations/predictions/>.
+
+The most accurate crossvalidation predictions have been obtained with `lazar` models with MolPrint2D descriptors ({{lazar-high-confidence.acc}} for predictions with high confidence, {{lazar-all.acc}} for all predictions). Models utilizing PaDEL descriptors have generally lower accuracies ranging from TODO to TODO. Sensitivity and specificity are generally well balanced with the exception of `lazar`-PaDEL (low sensitivity) and R deep learning (low specificity) models.
+
+| |R-RF | R-SVM | R-DL | TF | TF-FS | L | L-HC | L-P | L-P-HC|
+|-|-----|-------|------|----|-------|---|------|------|--------|
+|Accuracy|{{R-RF.acc}}|{{R-SVM.acc}}|{{R-DL.acc}}|{{tensorflow-all.acc}}|{{tensorflow-selected.acc}}|{{lazar-all.acc}}|{{lazar-high-confidence.acc}}|{{lazar-padel-all.acc}}|{{lazar-padel-high-confidence.acc}}|
+|Sensitivity|{{R-RF.tpr}}|{{R-SVM.tpr}}|{{R-DL.tpr}}|{{tensorflow-all.tpr}}|{{tensorflow-selected.tpr}}|{{lazar-all.tpr}}|{{lazar-high-confidence.tpr}}|{{lazar-padel-all.tpr}}|{{lazar-padel-high-confidence.tpr}}|
+|Specificity|{{R-RF.tnr}}|{{R-SVM.tnr}}|{{R-DL.tnr}}|{{tensorflow-all.tnr}}|{{tensorflow-selected.tnr}}|{{lazar-all.tnr}}|{{lazar-high-confidence.tnr}}|{{lazar-padel-all.tnr}}|{{lazar-padel-high-confidence.tnr}}|
+|PPV|{{R-RF.ppv}}|{{R-SVM.ppv}}|{{R-DL.ppv}}|{{tensorflow-all.ppv}}|{{tensorflow-selected.ppv}}|{{lazar-all.ppv}}|{{lazar-high-confidence.ppv}}|{{lazar-padel-all.ppv}}|{{lazar-padel-high-confidence.ppv}}|
+|NPV|{{R-RF.npv}}|{{R-SVM.npv}}|{{R-DL.npv}}|{{tensorflow-all.npv}}|{{tensorflow-selected.npv}}|{{lazar-all.npv}}|{{lazar-high-confidence.npv}}|{{lazar-padel-all.npv}}|{{lazar-padel-high-confidence.npv}}|
+|Nr. predictions|{{R-RF.n}}|{{R-SVM.n}}|{{R-DL.n}}|{{tensorflow-all.n}}|{{tensorflow-selected.n}}|{{lazar-all.n}}|{{lazar-high-confidence.n}}|{{lazar-padel-all.n}}|{{lazar-padel-high-confidence.n}}|
+
+: Summary of crossvalidation results. *R-RF*: R Random Forests, *R-SVM*: R Support Vector Machines, *R-DL*: R Deep Learning, *TF*: Tensorflow without feature selection, *TF-FS*: Tensorflow with feature selection, *L*: lazar, *L-HC*: lazar high confidence predictions, *L-P*: lazar with PaDEL descriptors, *L-P-HC*: lazar PaDEL high confidence predictions, *PPV*: Positive predictive value (Precision), *NPV*: Negative predictive value {#tbl:summary}
+
+![ROC plot of crossvalidation results. *R-RF*: R Random Forests, *R-SVM*: R Support Vector Machines, *R-DL*: R Deep Learning, *TF*: Tensorflow without feature selection, *TF-FS*: Tensorflow with feature selection, *L*: lazar, *L-HC*: lazar high confidence predictions, *L-P*: lazar with PaDEL descriptors, *L-P-HC*: lazar PaDEL high confidence predictions (overlaps with L-P)](figures/roc.png){#fig:roc}
+
+<!--
 R Models
 --------
 
@@ -459,12 +530,12 @@ predictions is provided in @tbl:R-DL.
 ```{#tbl:R-DL .table file="tables/R-DL.csv" caption="Confusion matrix for R Deep Learning predictions"}
 ```
 
-TensorFlow Models
+Tensorflow Models
 -----------------
 
 ### Without feature selection
 
-10-fold crossvalidation of the TensorFlow DL model gave an accuracy of
+10-fold crossvalidation of the Tensorflow DL model gave an accuracy of
 {{tensorflow-all.acc_perc}}%, a sensitivity of {{tensorflow-all.tpr_perc}}% and a specificity of
 {{tensorflow-all.tnr_perc}}%.  The confusion matrix for {{tensorflow-all.n}}
 predictions is provided in @tbl:tensorflow-all.
@@ -474,7 +545,7 @@ predictions is provided in @tbl:tensorflow-all.
 
 ### With feature selection
 
-10-fold crossvalidation of the TensorFlow model with feature selection gave an accuracy of
+10-fold crossvalidation of the Tensorflow model with feature selection gave an accuracy of
 {{tensorflow-selected.acc_perc}}%, a sensitivity of {{tensorflow-selected.tpr_perc}}% and a specificity of
 {{tensorflow-selected.tnr_perc}}%.  The confusion matrix for {{tensorflow-selected.n}}
 predictions is provided in @tbl:tensorflow-selected.
@@ -525,29 +596,20 @@ predictions is provided in @tbl:lazar-padel-high-confidence.
 
 ```{#tbl:lazar-padel-high-confidence .table file="tables/lazar-padel-high-confidence.csv" caption="Confusion matrix for high confidence lazar predictions with PaDEL descriptors"}
 ```
+-->
 
-Summary
--------
+Pyrrolizidine alkaloid mutagenicity predictions 
+-----------------------------------------------
 
-The results of all crossvalidation experiments are summarized in @tbl:summary.
+Pyrrolizidine alkaloid mutagenicity predictions are summarized in @tbl:pa.
 
-| |R-RF | R-SVM | R-DL | TF | TF-FS | L | L-HC | L-P | L-P-HC|
-|-|-----|-------|------|----|-------|---|------|------|--------|
-|Accuracy|{{R-RF.acc}}|{{R-SVM.acc}}|{{R-DL.acc}}|{{tensorflow-all.acc}}|{{tensorflow-selected.acc}}|{{lazar-all.acc}}|{{lazar-high-confidence.acc}}|{{lazar-padel-all.acc}}|{{lazar-padel-high-confidence.acc}}|
-|Sensitivity|{{R-RF.tpr}}|{{R-SVM.tpr}}|{{R-DL.tpr}}|{{tensorflow-all.tpr}}|{{tensorflow-selected.tpr}}|{{lazar-all.tpr}}|{{lazar-high-confidence.tpr}}|{{lazar-padel-all.tpr}}|{{lazar-padel-high-confidence.tpr}}|
-|Specificity|{{R-RF.tnr}}|{{R-SVM.tnr}}|{{R-DL.tnr}}|{{tensorflow-all.tnr}}|{{tensorflow-selected.tnr}}|{{lazar-all.tnr}}|{{lazar-high-confidence.tnr}}|{{lazar-padel-all.tnr}}|{{lazar-padel-high-confidence.tnr}}|
-|PPV|{{R-RF.ppv}}|{{R-SVM.ppv}}|{{R-DL.ppv}}|{{tensorflow-all.ppv}}|{{tensorflow-selected.ppv}}|{{lazar-all.ppv}}|{{lazar-high-confidence.ppv}}|{{lazar-padel-all.ppv}}|{{lazar-padel-high-confidence.ppv}}|
-|NPV|{{R-RF.npv}}|{{R-SVM.npv}}|{{R-DL.npv}}|{{tensorflow-all.npv}}|{{tensorflow-selected.npv}}|{{lazar-all.npv}}|{{lazar-high-confidence.npv}}|{{lazar-padel-all.npv}}|{{lazar-padel-high-confidence.npv}}|
-|Nr. predictions|{{R-RF.n}}|{{R-SVM.n}}|{{R-DL.n}}|{{tensorflow-all.n}}|{{tensorflow-selected.n}}|{{lazar-all.n}}|{{lazar-high-confidence.n}}|{{lazar-padel-all.n}}|{{lazar-padel-high-confidence.n}}|
-
-: Summary of crossvalidation results. *R-RF*: R Random Forests, *R-SVM*: R Support Vector Machines, *R-DL*: R Deep Learning, *TF*: TensorFlow without feature selection, *TF-FS*: TensorFlow with feature selection, *L*: lazar, *L-HC*: lazar high confidence predictions, *L-P*: lazar with PaDEL descriptors, *L-P-HC*: lazar PaDEL high confidence predictions, *PPV*: Positive predictive value (Precision), *NPV*: Negative predictive value {#tbl:summary}
+@fig:tsne-mp2d shows the position of pyrrolizidine alkaloids (PA) in the mutagenicity training dataset in MP2D space.
 
-@fig:roc shows the position of crossvalidation results in receiver operating characteristic (ROC) space.
+![t-SNE visualisation of mutagenicity training data and pyrrolizidine alkaloids (PA) in MP2D space](figures/tsne-mp2d.png){#fig:tsne-mp2d}
 
-![ROC plot of crossvalidation results. *R-RF*: R Random Forests, *R-SVM*: R Support Vector Machines, *R-DL*: R Deep Learning, *TF*: TensorFlow without feature selection, *TF-FS*: TensorFlow with feature selection, *L*: lazar, *L-HC*: lazar high confidence predictions, *L-P*: lazar with PaDEL descriptors, *L-P-HC*: lazar PaDEL high confidence predictions (overlaps with L-P)](figures/roc.png){#fig:roc}
+@fig:tsne-padel shows the position of pyrrolizidine alkaloids (PA) in the mutagenicity training dataset in PaDEL space.
 
-Predictions for pyrrolizidine alkaloid  mutagenicity
-----------------------------------------------------
+![t-SNE visualisation of mutagenicity training data and pyrrolizidine alkaloids (PA) in PaDEL space](figures/tsne-padel.png){#fig:tsne-padel}
 
 Discussion
 ==========
@@ -567,7 +629,7 @@ Model performance
 
 @tbl:summary and @fig:roc show that the standard `lazar` algorithm (with MP2D
 fingerprints) give the most accurate crossvalidation results. R Random Forests,
-Support Vector Machines and TensorFlow models have similar accuracies with
+Support Vector Machines and Tensorflow models have similar accuracies with
 balanced sensitivity (true position rate) and specificity (true negative rate).
 `lazar` models with PaDEL descriptors have low sensitivity and R Deep Learning
 models have low specificity.
@@ -583,14 +645,14 @@ similar to the experimental variability (@Helma2018).
 
 The lowest number of predictions ({{lazar-padel-high-confidence.n}}) has been
 obtained from `lazar`/PaDEL high confidence predictions, the largest number of
-predictions comes from TensorFlow models ({{tensorflow-all.n}}). Standard
+predictions comes from Tensorflow models ({{tensorflow-all.n}}). Standard
 `lazar` give a slightly lower number of predictions ({{lazar-all.n}}) than R
-and TensorFlow models. This is not necessarily a disadvantage, because `lazar`
+and Tensorflow models. This is not necessarily a disadvantage, because `lazar`
 abstains from predictions, if the query compound is very dissimilar from the
 compounds in the training set and thus avoids to make predictions for compounds
 that do not fall into its applicability domain. 
 
-There are two major differences between `lazar` and R/TensorFlow models, which
+There are two major differences between `lazar` and R/Tensorflow models, which
 might explain the different prediction accuracies:
 
 - `lazar` uses MolPrint2D fingerprints, while all other models use PaDEL descriptors
@@ -614,7 +676,7 @@ PaDEL calculates topological and physical-chemical descriptors.
 
 TODO: **Verena** kannst Du bitte die Deskriptoren nochmals kurz beschreiben
 
-PaDEL descriptors were used for the R and TensorFlow models. In addition we
+PaDEL descriptors were used for the R and Tensorflow models. In addition we
 have used PaDEL descriptors to calculate cosine similarities for the `lazar`
 algorithm and compared the results with standard MP2D similarities, which led
 to a significant decrease of `lazar` prediction accuracies. Based on this
@@ -622,7 +684,7 @@ result we can conclude, that PaDEL descriptors are less suited for similarity
 calculations than MP2D descriptors.
 
 In order to investigate, if MP2D fingerprints are also a better option for
-global models we have tried to build R and TensorFlow models both with and
+global models we have tried to build R and Tensorflow models both with and
 without unsupervised feature selection. Unfortunately none of the algorithms
 was capable to deal with the large and sparsely populated descriptor matrix.
 Based on this result we can conclude, that MP2D descriptors are at the moment
@@ -642,7 +704,7 @@ Algorithms
 structures for a given compound and calculates the prediction based on the
 experimental data for these structures. The QSAR literature calls such models
 frequently *local models*, because models are generated specifically for each
-query compound. R and TensorFlow models are in contrast *global models*, i.e. a
+query compound. R and Tensorflow models are in contrast *global models*, i.e. a
 single model is used to make predictions for all compounds. It has been
 postulated in the past, that local models are more accurate, because they can
 account better for mechanisms, that affect only a subset of the training data.
@@ -656,10 +718,6 @@ modelling algorithms that are capable to handle large, sparse binary matrices.
 Mutagenicity of PAs
 -------------------
 
-@fig:tsne-mp2d shows the position of pyrrolizidine alkaloids (PA) in the mutagenicity training dataset
-
-![t-sne visualisation of mutagenicty training data and pyrrolizidine alkaloids (PA)](figures/tsne-mp2d.png){#fig:tsne-mp2d}
-
 Due to the low to moderate predictivity of all models, quantitative
 statement on the genotoxicity of single PAs cannot be made with
 sufficient confidence.
@@ -670,7 +728,7 @@ literature, and are therefore not further considered in the discussion.
 Necic acid
 
 The rank order of the necic acid is comparable in the four models
-considered (LAZAR, RF and DL (R-project and TensorFlow). PAs from the
+considered (LAZAR, RF and DL (R-project and Tensorflow). PAs from the
 monoester type had the lowest genotoxic potential, followed by PAs from
 the open-ring diester type. PAs with macrocyclic diesters had the
 highest genotoxic potential. The result fit well with current state of
@@ -684,7 +742,7 @@ Necine base
 The rank order of necine base is comparable in LAZAR, RF, and DL
 (R-project) models: with platynecine being less or as genotoxic as
 retronecine, and otonecine being the most genotoxic. In the
-TensorFlow-generate DL model, platynecine also has the lowest genotoxic
+Tensorflow-generate DL model, platynecine also has the lowest genotoxic
 probability, but are then followed by the otonecines and last by
 retronecine. These results partly correspond to earlier published
 studies. Saturated PAs of the platynecine-type are generally accepted to
@@ -697,7 +755,7 @@ than those of the retronecine-type [Li et al. 2013](#_ENREF_80)().
 
 Modifications of necine base
 
-The group-specific results of the TensorFlow-generated DL model appear
+The group-specific results of the Tensorflow-generated DL model appear
 to reflect the expected relationship between the groups: the low
 genotoxic potential of *N*-oxides and the highest potential of
 dehydropyrrolizidines [Chen et al. 2010](#_ENREF_26)().
@@ -736,7 +794,7 @@ corresponding tertiary PAs. However, in the groups of modification of
 the necine base, dehydropyrrolizidine, the toxic principle of PAs,
 should have had the highest genotoxic potential. Taken together, the
 predictions of the modifications of the necine base from the LAZAR, RF
-and R-generated DL model cannot -- in contrast to the TensorFlow DL
+and R-generated DL model cannot -- in contrast to the Tensorflow DL
 model - be considered as reliable.
 
 Overall, when comparing the prediction results of the PAs to current
@@ -748,7 +806,7 @@ issues:
     the extended AD, 92.3% of the PAs could be included in the
     prediction. Even though the Jaccard distance between the training
     dataset and the PA dataset for the RF, SVM, and DL (R-project and
-    TensorFlow) models was small, suggesting a high similarity, the
+    Tensorflow) models was small, suggesting a high similarity, the
     LAZAR indicated that PAs have only few local neighbours, which might
     adversely affect the prediction of the mutagenic potential of PAs.
 
@@ -774,7 +832,7 @@ Conclusions
 ===========
 
 A new public *Salmonella* mutagenicity training dataset with 8309 compounds was
-created and used it to train `lazar`, R and TensorFlow models. The best
+created and used it to train `lazar`, R and Tensorflow models. The best
 performance was obtained with `lazar` models using MolPrint2D descriptors, with
 prediction accuracies comparable to the interlaboratory variability of the Ames
 test. Differences between algorithms (local vs. global models) and/or
@@ -783,9 +841,9 @@ prediction accuracies.
 
 In this study, an attempt was made to predict the genotoxic potential of
 PAs using five different machine learning techniques (LAZAR, RF, SVM, DL
-(R-project and TensorFlow). The results of all models fitted only partly
+(R-project and Tensorflow). The results of all models fitted only partly
 to the findings in literature, with best results obtained with the
-TensorFlow DL model. Therefore, modelling allows statements on the
+Tensorflow DL model. Therefore, modelling allows statements on the
 relative risks of genotoxicity of the different PA groups. Individual
 predictions for selective PAs appear, however, not reliable on the
 current basis of the used training dataset.
diff --git a/mutagenicity.pdf b/mutagenicity.pdf
new file mode 100644
index 0000000..b7cd456
Binary files /dev/null and b/mutagenicity.pdf differ
-- 
cgit v1.2.3