summaryrefslogtreecommitdiff
path: root/paper/outfile.latex
diff options
context:
space:
mode:
Diffstat (limited to 'paper/outfile.latex')
-rw-r--r--paper/outfile.latex779
1 files changed, 779 insertions, 0 deletions
diff --git a/paper/outfile.latex b/paper/outfile.latex
new file mode 100644
index 0000000..9af84b1
--- /dev/null
+++ b/paper/outfile.latex
@@ -0,0 +1,779 @@
+\documentclass[]{scrartcl}
+\usepackage{lmodern}
+\usepackage{amssymb,amsmath}
+\usepackage{ifxetex,ifluatex}
+\usepackage{fixltx2e} % provides \textsubscript
+\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex
+ \usepackage[T1]{fontenc}
+ \usepackage[utf8]{inputenc}
+\else % if luatex or xelatex
+ \ifxetex
+ \usepackage{mathspec}
+ \else
+ \usepackage{fontspec}
+ \fi
+ \defaultfontfeatures{Ligatures=TeX,Scale=MatchLowercase}
+\fi
+% use upquote if available, for straight quotes in verbatim environments
+\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
+% use microtype if available
+\IfFileExists{microtype.sty}{%
+\usepackage{microtype}
+\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
+}{}
+\usepackage[unicode=true]{hyperref}
+\hypersetup{
+ pdftitle={A comparison of random forest, support vector machine, deep learning and lazar algorithms for predicting mutagenicity},
+ pdfkeywords={mutagenicity, (Q)SAR, lazar, random forest, support vector machine, deep
+learning},
+ pdfborder={0 0 0},
+ breaklinks=true}
+\urlstyle{same} % don't use monospace font for urls
+\usepackage{longtable,booktabs}
+\usepackage{graphicx,grffile}
+\makeatletter
+\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
+\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
+\makeatother
+% Scale images if necessary, so that they will not overflow the page
+% margins by default, and it is still possible to overwrite the defaults
+% using explicit options in \includegraphics[width, height, ...]{}
+\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
+\IfFileExists{parskip.sty}{%
+\usepackage{parskip}
+}{% else
+\setlength{\parindent}{0pt}
+\setlength{\parskip}{6pt plus 2pt minus 1pt}
+}
+\setlength{\emergencystretch}{3em} % prevent overfull lines
+\providecommand{\tightlist}{%
+ \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
+\setcounter{secnumdepth}{0}
+% Redefines (sub)paragraphs to behave more like sections
+\ifx\paragraph\undefined\else
+\let\oldparagraph\paragraph
+\renewcommand{\paragraph}[1]{\oldparagraph{#1}\mbox{}}
+\fi
+\ifx\subparagraph\undefined\else
+\let\oldsubparagraph\subparagraph
+\renewcommand{\subparagraph}[1]{\oldsubparagraph{#1}\mbox{}}
+\fi
+
+\title{A comparison of random forest, support vector machine, deep learning and
+lazar algorithms for predicting mutagenicity}
+\usepackage{authblk}
+\author[%
+ 1%
+ ]{%
+ Christoph Helma%
+ %
+ \textsuperscript{*\,}%
+ %%
+ %
+}
+\author[%
+ 2%
+ ]{%
+ Verena Schöning%
+ %
+ %
+}
+\author[%
+ 2%
+ ]{%
+ Philipp Boss%
+ %
+ %
+}
+\author[%
+ 2%
+ ]{%
+ Jürgen Drewe%
+ %
+ %
+}
+\affil[1]{\normalsize in silico toxicology gmbh, \footnotesize Rastatterstrasse 41, 4057 Basel, Switzerland}
+\affil[2]{\normalsize Zeller AG, \footnotesize Seeblickstrasse 4, 8590 Romanshorn, Switzerland}
+\date{}
+
+\makeatletter
+\def\@maketitle{%
+ \newpage \null \vskip 2em
+ \begin {center}%
+ \let \footnote \thanks
+ {\LARGE \@title \par}%
+ \vskip 1.5em%
+ {\large \lineskip .5em%
+ \begin {tabular}[t]{c}%
+ \@author
+ \end {tabular}\par}%
+ \vskip 0.2em{\textsuperscript{*}\,Correspondence:
+ Christoph Helma <helma@in-silico.ch>\\
+ }%
+ % \vskip 1em{\large \@date}%
+ \end {center}%
+ \par
+ \vskip 1.5em}
+\makeatother
+
+\begin{document}
+
+\maketitle
+
+\begin{abstract}
+k-nearest neighbor (\texttt{lazar}), random forest, support vector
+machine and deep learning algorithms were applied to a new
+\emph{Salmonella} mutagenicity dataset with 8281 unique chemical
+structures. Algorithm performance was evaluated using 5-fold
+crossvalidation. TODO - results - conclusion
+\end{abstract}
+
+\hypertarget{introduction}{%
+\section{Introduction}\label{introduction}}
+
+TODO: algo history
+
+TODO: dataset history
+
+TODO: open problems
+
+\hypertarget{materials-and-methods}{%
+\section{Materials and Methods}\label{materials-and-methods}}
+
+\hypertarget{mutagenicity-data}{%
+\subsection{Mutagenicity data}\label{mutagenicity-data}}
+
+For all methods, the same training dataset was used. The training
+dataset was compiled from the following sources:
+
+\begin{itemize}
+\item
+ Kazius/Bursi Dataset (4337 compounds, Kazius, McGuire, and Bursi
+ (2005)): \url{http://cheminformatics.org/datasets/bursi/cas_4337.zip}
+\item
+ Hansen Dataset (6513 compounds, Hansen et al. (2009)):
+ \url{http://doc.ml.tu-berlin.de/toxbenchmark/Mutagenicity_N6512.csv}
+\item
+ EFSA Dataset (695 compounds):
+ \url{https://data.europa.eu/euodp/data/storage/f/2017-0719T142131/GENOTOX\%20data\%20and\%20dictionary.xls}
+\end{itemize}
+
+Mutagenicity classifications from Kazius and Hansen datasets were used
+without further processing. To achieve consistency between these
+datasets, EFSA compounds were classified as mutagenic if at least one
+positive result was found for TA98 or TA100 \emph{Salmonella} strains.
+
+Dataset merges were based on unique SMILES (\emph{Simplified Molecular
+Input Line Entry Specification}) strings of the compound structures.
+Duplicated experimental data with the same outcome was merged into a
+single value, because it is likely that it originated from the same
+experiment. Contradictory results were kept as multiple measurements in
+the database. The combined training dataset contains 8281 unique
+structures.
+
+Source code for all data download, extraction and merge operations is
+publicly available from the git repository
+\url{https://git.in-silico.ch/pyrrolizidine} under a GPL3 License.
+
+TODO: check/fix git repo
+
+For the Random Forest (RF), Support Vector Machines (SVM), and Deep
+Learning (DL) models, molecular descriptors were calculated with the
+PaDEL-Descriptors program (\url{http://www.yapcwsoft.com} version 2.21,
+Yap (2011)).
+
+TODO: sentence ??
+
+From these, those descriptors were chosen which were actually used for
+the generation of the DL model.
+
+\hypertarget{algorithms}{%
+\subsection{Algorithms}\label{algorithms}}
+
+\hypertarget{lazar}{%
+\subsubsection{\texorpdfstring{\texttt{lazar}}{lazar}}\label{lazar}}
+
+\texttt{lazar} (\emph{lazy structure activity relationships}) is a
+modular framework for read-across model development and validation. It
+follows the following basic workflow: For a given chemical structure
+\texttt{lazar}:
+
+\begin{itemize}
+\item
+ searches in a database for similar structures (neighbours) with
+ experimental data,
+\item
+ builds a local QSAR model with these neighbours and
+\item
+ uses this model to predict the unknown activity of the query compound.
+\end{itemize}
+
+This procedure resembles an automated version of read across predictions
+in toxicology, in machine learning terms it would be classified as a
+k-nearest-neighbour algorithm.
+
+Apart from this basic workflow, \texttt{lazar} is completely modular and
+allows the researcher to use any algorithm for similarity searches and
+local QSAR (\emph{Quantitative structure--activity relationship})
+modelling. Algorithms used within this study are described in the
+following sections.
+
+\hypertarget{neighbour-identification}{%
+\paragraph{Neighbour identification}\label{neighbour-identification}}
+
+Similarity calculations were based on MolPrint2D fingerprints (Bender et
+al. (2004)) from the OpenBabel cheminformatics library (O'Boyle et al.
+(2011)). The MolPrint2D fingerprint uses atom environments as molecular
+representation, which resembles basically the chemical concept of
+functional groups. For each atom in a molecule, it represents the
+chemical environment using the atom types of connected atoms.
+
+MolPrint2D fingerprints are generated dynamically from chemical
+structures and do not rely on predefined lists of fragments (such as
+OpenBabel FP3, FP4 or MACCs fingerprints or lists of
+toxicophores/toxicophobes). This has the advantage that they may capture
+substructures of toxicological relevance that are not included in other
+fingerprints.
+
+From MolPrint2D fingerprints a feature vector with all atom environments
+of a compound can be constructed that can be used to calculate chemical
+similarities.
+
+The chemical similarity between two compounds \(a\) and \(b\) is
+expressed as the proportion between atom environments common to both
+structures \(A \cap B\) and the total number of atom environments
+\(A \cup B\) (Jaccard/Tanimoto index).
+
+\[sim = \frac{\left| A \cap B \right|}{\left| A \cup B \right|}\]
+
+Threshold selection is a trade-off between prediction accuracy (high
+threshold) and the number of predictable compounds (low threshold). As
+it is in many practical cases desirable to make predictions even in the
+absence of closely related neighbours, we follow a tiered approach:
+
+\begin{itemize}
+\item
+ First a similarity threshold of 0.5 is used to collect neighbours, to
+ create a local QSAR model and to make a prediction for the query
+ compound.
+\item
+ If any of these steps fails, the procedure is repeated with a
+ similarity threshold of 0.2 and the prediction is flagged with a
+ warning that it might be out of the applicability domain of the
+ training data.
+\item
+  Similarity thresholds of 0.5 and 0.2 are the default values chosen
+  by the software developers and remained unchanged during the
+  course of these experiments.
+\end{itemize}
+
+Compounds with the same structure as the query structure are
+automatically eliminated from neighbours to obtain unbiased predictions
+in the presence of duplicates.
+
+\hypertarget{local-qsar-models-and-predictions}{%
+\paragraph{Local QSAR models and
+predictions}\label{local-qsar-models-and-predictions}}
+
+Only similar compounds (neighbours) above the threshold are used for
+local QSAR models. In this investigation, we are using a weighted
+majority vote from the neighbour's experimental data for mutagenicity
+classifications. Probabilities for both classes
+(mutagenic/non-mutagenic) are calculated according to the following
+formula and the class with the higher probability is used as prediction
+outcome.
+
+\[p_{c} = \frac{\sum \text{sim}_{n,c}}{\sum \text{sim}_{n}}\]
+
+\(p_{c}\) Probability of class c (e.g.~mutagenic or non-mutagenic)\\
+\(\sum \text{sim}_{n,c}\) Sum of similarities of neighbours with
+class c\\
+\(\sum \text{sim}_{n}\) Sum of similarities of all neighbours
+
+\hypertarget{applicability-domain}{%
+\paragraph{Applicability domain}\label{applicability-domain}}
+
+The applicability domain (AD) of \texttt{lazar} models is determined by
+the structural diversity of the training data. If no similar compounds
+are found in the training data no predictions will be generated.
+Warnings are issued if the similarity threshold had to be lowered from
+0.5 to 0.2 in order to enable predictions. Predictions without warnings
+can be considered as close to the applicability domain and predictions
+with warnings as more distant from the applicability domain.
+Quantitative applicability domain information can be obtained from the
+similarities of individual neighbours.
+
+\hypertarget{availability}{%
+\paragraph{Availability}\label{availability}}
+
+\begin{itemize}
+\item
+ \texttt{lazar} experiments for this manuscript:
+ \url{https://git.in-silico.ch/pyrrolizidine} (source code, GPL3)
+\item
+ \texttt{lazar} framework: \url{https://git.in-silico.ch/lazar} (source
+ code, GPL3)
+\item
+ \texttt{lazar} GUI: \url{https://git.in-silico.ch/lazar-gui} (source
+ code, GPL3)
+\item
+ Public web interface: \url{https://lazar.in-silico.ch}
+\end{itemize}
+
+\hypertarget{random-forest-support-vector-machines-and-deep-learning-in-r-project}{%
+\subsubsection{Random Forest, Support Vector Machines, and Deep Learning
+in
+R-project}\label{random-forest-support-vector-machines-and-deep-learning-in-r-project}}
+
+In comparison to \texttt{lazar}, three other models (Random Forest (RF),
+Support Vector Machines (SVM), and Deep Learning (DL)) were evaluated.
+
+For the generation of these models, molecular 1D and 2D descriptors of
+the training dataset were calculated using PaDEL-Descriptors
+(\url{http://www.yapcwsoft.com} version 2.21, Yap (2011)).
+
+As the training dataset contained over 8280 instances, it was decided to
+delete instances with missing values during data pre-processing.
+Furthermore, substances with equivocal outcome were removed. The final
+training dataset contained 8080 instances with known mutagenic
+potential. The RF, SVM, and DL models were generated using the R
+software (R-project for Statistical Computing,
+\url{https://www.r-project.org/}; version 3.3.1), specific R
+packages used are identified for each step in the description below.
+During feature selection, descriptors with near zero variance were
+removed using the `\emph{nearZeroVar}'-function (package `caret'). If the
+percentage of the most common value was more than 90\% or when the
+frequency ratio of the most common value to the second most common value
+was greater than 95:5 (e.g.~95 instances of the most common value and
+only 5 or less instances of the second most common value), a descriptor
+was classified as having a near zero variance. After that, highly
+correlated descriptors were removed using the
+`\emph{findCorrelation}'-function (package `caret') with a cut-off of
+0.9. This resulted in a training dataset with 516 descriptors. These
+descriptors were scaled to be in the range between 0 and 1 using the
+`\emph{preProcess}'-function (package `caret'). The scaling routine was
+saved in order to apply the same scaling on the testing dataset. As
+these three steps did not consider the outcome, it was decided that they
+do not need to be included in the cross-validation of the model. To
+further reduce the number of features, a LASSO (\emph{least absolute
+shrinkage and selection operator}) regression was performed using the
+`\emph{glmnet}'-function (package `\emph{glmnet}'). The reduced dataset
+was used for the generation of the pre-trained models.
+
+For the RF model, the `\emph{randomForest}'-function (package
+`\emph{randomForest}') was used. A forest with 1000 trees with maximal
+terminal nodes of 200 was grown for the prediction.
+
+The `\emph{svm}'-function (package `e1071') with a \emph{radial basis
+function kernel} was used for the SVM model.
+
+The DL model was generated using the `\emph{h2o.deeplearning}'-function
+(package `\emph{h2o}'). The DL contained four hidden layers with 70, 50,
+50, and 10 neurons, respectively. Other hyperparameters were set as
+follows: l1=1.0E-7, l2=1.0E-11, epsilon = 1.0E-10, rho = 0.8, and
+quantile\_alpha = 0.5. For all other hyperparameters, the default values
+were used. Weights and biases were in a first step determined with an
+unsupervised DL model. These values were then used for the actual,
+supervised DL model.
+
+To validate these models, an internal cross-validation approach was
+chosen. The training dataset was randomly split in training data, which
+contained 95\% of the data, and validation data, which contained 5\% of
+the data. A feature selection with LASSO on the training data was
+performed, reducing the number of descriptors to approximately 100. This
+step was repeated five times. Based on each of the five different
+training data, the predictive models were trained and the performance
+tested with the validation data. This step was repeated 10 times.
+Furthermore, a y-randomisation using the RF model was performed. During
+y-randomisation, the outcome (y-variable) is randomly permuted. The
+theory is that after randomisation of the outcome, the model should not
+be able to correlate the outcome to the properties (descriptor values)
+of the substances. The performance of the model should therefore
+indicate a by-chance prediction with an accuracy of about 50\%. If this
+is true, it can be concluded that correlation between actual outcome and
+properties of the substances is real and not by chance (Rücker, Rücker,
+and Meringer (2007)).
+
+\includegraphics[width=6.26875in,height=5.48611in]{media/image1.png}
+
+Figure 1: Flowchart of the generation and validation of the models
+generated in R-project
+
+\hypertarget{applicability-domain-1}{%
+\paragraph{Applicability domain}\label{applicability-domain-1}}
+
+The AD of the training dataset and the PA dataset was evaluated using
+the Jaccard distance. A Jaccard distance of `0' indicates that the
+substances are similar, whereas a value of `1' shows that the substances
+are different. The Jaccard distance was below 0.2 for all PAs relative
+to the training dataset. Therefore, the PA dataset is within the AD of the
+training dataset and the models can be used to predict the genotoxic
+potential of the PA dataset.
+
+\hypertarget{y-randomisation}{%
+\paragraph{y-randomisation}\label{y-randomisation}}
+
+After y-randomisation of the outcome, the accuracy and CCR are around
+50\%, indicating a chance distribution of the results. This
+shows that the outcome is actually related to the predictors and not by
+chance.
+
+\hypertarget{deep-learning-in-tensorflow}{%
+\subsubsection{Deep Learning in
+TensorFlow}\label{deep-learning-in-tensorflow}}
+
+Alternatively, a DL model was established with Python-based TensorFlow
+program (\url{https://www.tensorflow.org/}) using the high-level API
+Keras (\url{https://www.tensorflow.org/guide/keras}) to build the
+models.
+
+Data pre-processing was done by rank transformation using the
+`\emph{QuantileTransformer}' procedure. A sequential model has been
+used. Four layers have been used: an input layer with 12 nodes, two
+hidden layers (with 8 and 8 nodes, respectively) and one output layer.
+For the output
+layer, a sigmoidal activation function and for all other layers the ReLU
+(`\emph{Rectified Linear Unit}') activation function was used.
+Additionally, a L\textsuperscript{2}-penalty of 0.001 was used for the
+input layer. For training of the model, the ADAM algorithm was used to
+minimise the cross-entropy loss using the default parameters of Keras.
+Training was performed for 100 epochs with a batch size of 64. The model
+was implemented with Python 3.6 and Keras. For training of the model, a
+6-fold cross-validation was used. Accuracy was estimated by ROC-AUC and
+confusion matrix.
+
+\hypertarget{validation}{%
+\subsection{Validation}\label{validation}}
+
+\hypertarget{results}{%
+\section{Results}\label{results}}
+
+\hypertarget{lazar-1}{%
+\subsection{\texorpdfstring{\texttt{lazar}}{lazar}}\label{lazar-1}}
+
+\hypertarget{random-forest}{%
+\subsection{Random Forest}\label{random-forest}}
+
+The validation showed that the RF model has an accuracy of 64\%, a
+sensitivity of 66\% and a specificity of 63\%. The confusion matrix of
+the model, calculated for 8080 instances, is provided in Table 1.
+
+Table 1: Confusion matrix of the RF model
+
+\begin{longtable}[]{@{}lllll@{}}
+\toprule
+& Predicted genotoxicity & & &\tabularnewline
+\midrule
+\endhead
+Measured genotoxicity & & \textbf{\emph{PP}} & \textbf{\emph{PN}} &
+\textbf{\emph{Total}}\tabularnewline
+& \textbf{\emph{TP}} & 2274 & 1163 & 3437\tabularnewline
+& \textbf{\emph{TN}} & 1736 & 2907 & 4643\tabularnewline
+& \textbf{\emph{Total}} & 4010 & 4070 & 8080\tabularnewline
+\bottomrule
+\end{longtable}
+
+PP: Predicted positive; PN: Predicted negative, TP: True positive, TN:
+True negative
+
+\hypertarget{support-vector-machines}{%
+\subsection{Support Vector Machines}\label{support-vector-machines}}
+
+The validation showed that the SVM model has an accuracy of 62\%, a
+sensitivity of 65\% and a specificity of 60\%. The confusion matrix of
+SVM model, calculated for 8080 instances, is provided in Table 2.
+
+Table 2: Confusion matrix of the SVM model
+
+\begin{longtable}[]{@{}lllll@{}}
+\toprule
+& Predicted genotoxicity & & &\tabularnewline
+\midrule
+\endhead
+Measured genotoxicity & & \textbf{\emph{PP}} & \textbf{\emph{PN}} &
+\textbf{\emph{Total}}\tabularnewline
+& \textbf{\emph{TP}} & 2057 & 1107 & 3164\tabularnewline
+& \textbf{\emph{TN}} & 1953 & 2963 & 4916\tabularnewline
+& \textbf{\emph{Total}} & 4010 & 4070 & 8080\tabularnewline
+\bottomrule
+\end{longtable}
+
+PP: Predicted positive; PN: Predicted negative, TP: True positive, TN:
+True negative
+
+\hypertarget{deep-learning-r-project}{%
+\subsection{Deep Learning (R-project)}\label{deep-learning-r-project}}
+
+The validation showed that the DL model generated in R has an accuracy
+of 59\%, a sensitivity of 89\% and a specificity of 30\%. The confusion
+matrix of the model, normalised to 8080 instances, is provided in Table
+3.
+
+Table 3: Confusion matrix of the DL model (R-project)
+
+\begin{longtable}[]{@{}lllll@{}}
+\toprule
+& Predicted genotoxicity & & &\tabularnewline
+\midrule
+\endhead
+Measured genotoxicity & & \textbf{\emph{PP}} & \textbf{\emph{PN}} &
+\textbf{\emph{Total}}\tabularnewline
+& \textbf{\emph{TP}} & 3575 & 435 & 4010\tabularnewline
+& \textbf{\emph{TN}} & 2853 & 1217 & 4070\tabularnewline
+& \textbf{\emph{Total}} & 6428 & 1652 & 8080\tabularnewline
+\bottomrule
+\end{longtable}
+
+PP: Predicted positive; PN: Predicted negative, TP: True positive, TN:
+True negative
+
+\hypertarget{dl-model-tensorflow}{%
+\subsection{DL model (TensorFlow)}\label{dl-model-tensorflow}}
+
+The validation showed that the DL model generated in TensorFlow has an
+accuracy of 68\%, a sensitivity of 70\% and a specificity of 46\%. The
+confusion matrix of the model, normalised to 8080 instances, is provided
+in Table 4.
+
+Table 4: Confusion matrix of the DL model (TensorFlow)
+
+\begin{longtable}[]{@{}lllll@{}}
+\toprule
+& Predicted genotoxicity & & &\tabularnewline
+\midrule
+\endhead
+Measured genotoxicity & & \textbf{\emph{PP}} & \textbf{\emph{PN}} &
+\textbf{\emph{Total}}\tabularnewline
+& \textbf{\emph{TP}} & 2851 & 1227 & 4078\tabularnewline
+& \textbf{\emph{TN}} & 1825 & 2177 & 4002\tabularnewline
+& \textbf{\emph{Total}} & 4676 & 3404 & 8080\tabularnewline
+\bottomrule
+\end{longtable}
+
+PP: Predicted positive; PN: Predicted negative, TP: True positive, TN:
+True negative
+
+The ROC curves from the 6-fold validation are shown in Figure 7.
+
+\includegraphics[width=3.825in,height=2.7327in]{media/image7.png}
+
+Figure 7: Six-fold cross-validation of TensorFlow DL model show an
+average area under the ROC-curve (ROC-AUC; measure of accuracy) of 68\%.
+
+In summary, the validation results of the four methods are presented in
+the following table.
+
+Table 5 Results of the cross-validation of the four models and after
+y-randomisation
+
+\begin{longtable}[]{@{}lllll@{}}
+\toprule
+\begin{minipage}[b]{0.28\columnwidth}\raggedright
+\strut
+\end{minipage} & \begin{minipage}[b]{0.13\columnwidth}\raggedright
+Accuracy\strut
+\end{minipage} & \begin{minipage}[b]{0.09\columnwidth}\raggedright
+CCR\strut
+\end{minipage} & \begin{minipage}[b]{0.16\columnwidth}\raggedright
+Sensitivity\strut
+\end{minipage} & \begin{minipage}[b]{0.16\columnwidth}\raggedright
+Specificity\strut
+\end{minipage}\tabularnewline
+\midrule
+\endhead
+\begin{minipage}[t]{0.28\columnwidth}\raggedright
+RF model\strut
+\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright
+64.1\%\strut
+\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\raggedright
+64.4\%\strut
+\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
+66.2\%\strut
+\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
+62.6\%\strut
+\end{minipage}\tabularnewline
+\begin{minipage}[t]{0.28\columnwidth}\raggedright
+SVM model\strut
+\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright
+62.1\%\strut
+\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\raggedright
+62.6\%\strut
+\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
+65.0\%\strut
+\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
+60.3\%\strut
+\end{minipage}\tabularnewline
+\begin{minipage}[t]{0.28\columnwidth}\raggedright
+DL model\\
+(R-project)\strut
+\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright
+59.3\%\strut
+\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\raggedright
+59.5\%\strut
+\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
+89.2\%\strut
+\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
+29.9\%\strut
+\end{minipage}\tabularnewline
+\begin{minipage}[t]{0.28\columnwidth}\raggedright
+DL model (TensorFlow)\strut
+\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright
+68\%\strut
+\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\raggedright
+62.2\%\strut
+\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
+69.9\%\strut
+\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
+45.6\%\strut
+\end{minipage}\tabularnewline
+\begin{minipage}[t]{0.28\columnwidth}\raggedright
+y-randomisation\strut
+\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright
+50.5\%\strut
+\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\raggedright
+50.4\%\strut
+\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
+50.3\%\strut
+\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
+50.6\%\strut
+\end{minipage}\tabularnewline
+\bottomrule
+\end{longtable}
+
+CCR (correct classification rate)
+
+\hypertarget{discussion}{%
+\section{Discussion}\label{discussion}}
+
+General model performance
+
+Based on the results of the cross-validation for all models,
+\texttt{lazar}, RF, SVM, DL (R-project) and DL (TensorFlow) it can be
+stated that the prediction results are not optimal due to different
+reasons. The accuracy as measured during cross-validation of the four
+models (RF, SVM, DL (R-project and TensorFlow)) was partly low with
+accuracy values between 59.3\% and 68\%, with the R-generated DL model and the
+TensorFlow-generated DL model showing the worst and the best
+performance, respectively. The validation of the R-generated DL model
+revealed a high sensitivity (89.2\%) but an unacceptably low specificity
+of 29.9\% indicating a high number of false positive estimates. The
+TensorFlow-generated DL model, however, showed an acceptable but not
+optimal accuracy of 68\%, a sensitivity of 69.9\% and a specificity of
+45.6\%. The low specificity indicates that both DL models tend to
+predict too many instances as positive (genotoxic), and therefore have a
+high false positive rate. This allows at least with the TensorFlow
+generated DL model to make group statements, but the confidence for
+estimations of single PAs appears to be insufficient.
+
+Several factors have likely contributed to the low to moderate
+performance of the used methods as shown during the cross-validation:
+
+\begin{enumerate}
+\def\labelenumi{\arabic{enumi}.}
+\tightlist
+\item
+  The outcome in the training dataset was based on the results of Ames
+  tests for genotoxicity (\protect\hyperlink{_ENREF_63}{ICH 2011}), an
+ \emph{in vitro} test in different strains of the bacteria
+ \emph{Salmonella typhimurium}. In this test, mutagenicity is evaluated
+ with and without prior metabolic activation of the test substance.
+ Metabolic activation could result in the formation of genotoxic
+ metabolites from non-genotoxic parent compounds. However, no
+ distinction was made in the training dataset between substances that
+ needed metabolic activation before being mutagenic and those that were
+ mutagenic without metabolic activation. \texttt{lazar} is able to
+ handle this `inaccuracy' in the training dataset well due to the way
+ the algorithm works: \texttt{lazar} predicts the genotoxic potential
+ based on the neighbours of substances with comparable structural
+ features, considering mutagenic and not mutagenic neighbours. Based on
+ the structural similarity, a probability for mutagenicity and no
+ mutagenicity is calculated independently from each other (meaning that
+  the sum of probabilities does not necessarily add up to 100\%). The
+ class with the higher outcome is then the overall outcome for the
+ substance.
+\end{enumerate}
+
+\begin{quote}
+In contrast, the other models need to be trained first to recognise the
+structural features that are responsible for genotoxicity. Therefore,
+the mixture of substances being mutagenic with and without metabolic
+activation in the training dataset may have adversely affected the
+ability to separate the dataset in two distinct classes and thus
+explains the relatively low performance of these models.
+\end{quote}
+
+\begin{enumerate}
+\def\labelenumi{\arabic{enumi}.}
+\setcounter{enumi}{1}
+\tightlist
+\item
+ Machine learning algorithms try to find an optimized solution in a
+ high-dimensional (one dimension per each predictor) space. Sometimes
+ these methods do not find the global optimum of estimates but only
+ local (not optimal) solutions. Strategies to find the global solutions
+ are systematic variation (grid search) of the hyperparameters of the
+ methods, which may be very time consuming in particular in large
+ datasets.
+\end{enumerate}
+
+\hypertarget{conclusions}{%
+\section{Conclusions}\label{conclusions}}
+
+In this study, an attempt was made to predict the genotoxic potential of
+PAs using five different machine learning techniques (\texttt{lazar},
+RF, SVM, DL (R-project and TensorFlow)). The results of all models fitted
+only partly to the findings in literature, with best results obtained
+with the TensorFlow DL model. Therefore, modelling allows statements on
+the relative risks of genotoxicity of the different PA groups.
+Individual predictions for selective PAs appear, however, not reliable
+on the current basis of the used training dataset.
+
+This study emphasises the importance of critical assessment of
+predictions by QSAR models. This includes not only extensive literature
+research to assess the plausibility of the predictions, but also a good
+knowledge of the metabolism of the test substances and understanding for
+possible mechanisms of toxicity.
+
+In further studies, additional machine learning techniques or a modified
+(extended) training dataset should be used for an additional attempt to
+predict the genotoxic potential of PAs.
+
+\hypertarget{references}{%
+\section*{References}\label{references}}
+\addcontentsline{toc}{section}{References}
+
+\hypertarget{refs}{}
+\leavevmode\hypertarget{ref-Bender2004}{}%
+Bender, Andreas, Hamse Y. Mussa, Robert C. Glen, and Stephan Reiling.
+2004. ``Molecular Similarity Searching Using Atom Environments,
+Information-Based Feature Selection, and a Naïve Bayesian Classifier.''
+\emph{Journal of Chemical Information and Computer Sciences} 44 (1):
+170--78. \url{https://doi.org/10.1021/ci034207y}.
+
+\leavevmode\hypertarget{ref-Hansen2009}{}%
+Hansen, Katja, Sebastian Mika, Timon Schroeter, Andreas Sutter, Antonius
+ter Laak, Thomas Steger-Hartmann, Nikolaus Heinrich, and Klaus-Robert
+Müller. 2009. ``Benchmark Data Set for in Silico Prediction of Ames
+Mutagenicity.'' \emph{Journal of Chemical Information and Modeling} 49
+(9): 2077--81. \url{https://doi.org/10.1021/ci900161g}.
+
+\leavevmode\hypertarget{ref-Kazius2005}{}%
+Kazius, J., R. McGuire, and R. Bursi. 2005. ``Derivation and Validation
+of Toxicophores for Mutagenicity Prediction.'' \emph{J Med Chem}, no.
+48: 312--20.
+
+\leavevmode\hypertarget{ref-OBoyle2011a}{}%
+O'Boyle, Noel, Michael Banck, Craig James, Chris Morley, Tim
+Vandermeersch, and Geoffrey Hutchison. 2011. ``Open Babel: An open
+chemical toolbox.'' \emph{J. Cheminf.} 3 (1): 33.
+\url{https://doi.org/doi:10.1186/1758-2946-3-33}.
+
+\leavevmode\hypertarget{ref-Ruxfccker2007}{}%
+Rücker, C, G Rücker, and M. Meringer. 2007. ``Y-Randomization and Its
+Variants in Qspr/Qsar.'' \emph{J. Chem. Inf. Model.}, no. 47: 2345--57.
+
+\leavevmode\hypertarget{ref-Yap2011}{}%
+Yap, CW. 2011. ``PaDEL-Descriptor: An Open Source Software to Calculate
+Molecular Descriptors and Fingerprints.'' \emph{Journal of Computational
+Chemistry}, no. 32: 1466--74.
+
+\end{document}