1 files changed, 0 insertions, 779 deletions
diff --git a/paper/outfile.latex b/paper/outfile.latex
deleted file mode 100644
index 9af84b1..0000000
--- a/paper/outfile.latex
+++ /dev/null
@@ -1,779 +0,0 @@
-\documentclass[]{scrartcl}
-\usepackage{lmodern}
-\usepackage{amssymb,amsmath}
-\usepackage{ifxetex,ifluatex}
-\usepackage{fixltx2e} % provides \textsubscript
-\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex
-  \usepackage[T1]{fontenc}
-  \usepackage[utf8]{inputenc}
-\else % if luatex or xelatex
-  \ifxetex
-    \usepackage{mathspec}
-  \else
-    \usepackage{fontspec}
-  \fi
-  \defaultfontfeatures{Ligatures=TeX,Scale=MatchLowercase}
-\fi
-% use upquote if available, for straight quotes in verbatim environments
-\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
-% use microtype if available
-\IfFileExists{microtype.sty}{%
-\usepackage{microtype}
-\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
-}{}
-\usepackage[unicode=true]{hyperref}
-\hypersetup{
-            pdftitle={A comparison of random forest, support vector machine, deep learning and lazar algorithms for predicting mutagenicity},
-            pdfkeywords={mutagenicity, (Q)SAR, lazar, random forest, support vector machine, deep
-learning},
-            pdfborder={0 0 0},
-            breaklinks=true}
-\urlstyle{same}  % don't use monospace font for urls
-\usepackage{longtable,booktabs}
-\usepackage{graphicx,grffile}
-\makeatletter
-\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
-\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
-\makeatother
-% Scale images if necessary, so that they will not overflow the page
-% margins by default, and it is still possible to overwrite the defaults
-% using explicit options in \includegraphics[width, height, ...]{}
-\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
-\IfFileExists{parskip.sty}{%
-\usepackage{parskip}
-}{% else
-\setlength{\parindent}{0pt}
-\setlength{\parskip}{6pt plus 2pt minus 1pt}
-}
-\setlength{\emergencystretch}{3em}  % prevent overfull lines
-\providecommand{\tightlist}{%
-  \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
-\setcounter{secnumdepth}{0}
-% Redefines (sub)paragraphs to behave more like sections
-\ifx\paragraph\undefined\else
-\let\oldparagraph\paragraph
-\renewcommand{\paragraph}[1]{\oldparagraph{#1}\mbox{}}
-\fi
-\ifx\subparagraph\undefined\else
-\let\oldsubparagraph\subparagraph
-\renewcommand{\subparagraph}[1]{\oldsubparagraph{#1}\mbox{}}
-\fi
-
-\title{A comparison of random forest, support vector machine, deep learning and
-lazar algorithms for predicting mutagenicity}
-\usepackage{authblk}
-\author[%
-  1%
-  ]{%
-  Christoph Helma%
-  %
-  \textsuperscript{*\,}%
-  %%
-  %
-}
-\author[%
-  2%
-  ]{%
-  Verena Schöning%
-  %
-  %
-}
-\author[%
-  2%
-  ]{%
-  Philipp Boss%
-  %
-  %
-}
-\author[%
-  2%
-  ]{%
-  Jürgen Drewe%
-  %
-  %
-}
-\affil[1]{\normalsize in silico toxicology gmbh, \footnotesize Rastatterstrasse 41, 4057 Basel, Switzerland}
-\affil[2]{\normalsize Zeller AG, \footnotesize Seeblickstrasse 4, 8590 Romanshorn, Switzerland}
-\date{}
-
-\makeatletter
-\def\@maketitle{%
-  \newpage \null \vskip 2em
-  \begin {center}%
-    \let \footnote \thanks
-         {\LARGE \@title \par}%
-         \vskip 1.5em%
-                {\large \lineskip .5em%
-                  \begin {tabular}[t]{c}%
-                    \@author
-                  \end {tabular}\par}%
-                                                \vskip 0.2em{\textsuperscript{*}\,Correspondence:
-                                    Christoph Helma <helma@in-silico.ch>\\
-                  }%
-                %                \vskip 1em{\large \@date}%
-  \end {center}%
-  \par
-  \vskip 1.5em}
-\makeatother
-
-\begin{document}
-
-\maketitle
-
-\begin{abstract}
-k-nearest neighbor (\texttt{lazar}), random forest, support vector
-machine and deep learning algorithms were applied to a new
-\emph{Salmonella} mutagenicity dataset with 8281 unique chemical
-structures. Algorithm performance was evaluated using 5-fold
-cross-validation. TODO - results - conclusion
-\end{abstract}
-
-\hypertarget{introduction}{%
-\section{Introduction}\label{introduction}}
-
-TODO: algo history
-
-TODO: dataset history
-
-TODO: open problems
-
-\hypertarget{materials-and-methods}{%
-\section{Materials and Methods}\label{materials-and-methods}}
-
-\hypertarget{mutagenicity-data}{%
-\subsection{Mutagenicity data}\label{mutagenicity-data}}
-
-For all methods, the same training dataset was used. The training
-dataset was compiled from the following sources:
-
-\begin{itemize}
-\item
-  Kazius/Bursi Dataset (4337 compounds, Kazius, McGuire, and Bursi
-  (2005)): \url{http://cheminformatics.org/datasets/bursi/cas_4337.zip}
-\item
-  Hansen Dataset (6513 compounds, Hansen et al. (2009)):
-  \url{http://doc.ml.tu-berlin.de/toxbenchmark/Mutagenicity_N6512.csv}
-\item
-  EFSA Dataset (695 compounds):
-  \url{https://data.europa.eu/euodp/data/storage/f/2017-0719T142131/GENOTOX\%20data\%20and\%20dictionary.xls}
-\end{itemize}
-
-Mutagenicity classifications from Kazius and Hansen datasets were used
-without further processing. To achieve consistency between these
-datasets, EFSA compounds were classified as mutagenic if at least one
-positive result was found for TA98 or TA100 \emph{Salmonella} strains.
-
-Dataset merges were based on unique SMILES (\emph{Simplified Molecular
-Input Line Entry Specification}) strings of the compound structures.
-Duplicated experimental data with the same outcome was merged into a
-single value, because it is likely that it originated from the same
-experiment. Contradictory results were kept as multiple measurements in
-the database. The combined training dataset contains 8281 unique
-structures.
-
-Source code for all data download, extraction and merge operations is
-publicly available from the git repository
-\url{https://git.in-silico.ch/pyrrolizidine} under a GPL3 License.
-
-TODO: check/fix git repo
-
-For the Random Forest (RF), Support Vector Machines (SVM), and Deep
-Learning (DL) models, molecular descriptors were calculated with the
-PaDEL-Descriptors program (\url{http://www.yapcwsoft.com} version 2.21,
-Yap (2011)).
-
-TODO: sentence ??
-
-From these descriptors, a subset was selected that was actually used for
-the generation of the DL model.
-
-\hypertarget{algorithms}{%
-\subsection{Algorithms}\label{algorithms}}
-
-\hypertarget{lazar}{%
-\subsubsection{\texorpdfstring{\texttt{lazar}}{lazar}}\label{lazar}}
-
-\texttt{lazar} (\emph{lazy structure activity relationships}) is a
-modular framework for read-across model development and validation. It
-follows the following basic workflow: For a given chemical structure
-\texttt{lazar}:
-
-\begin{itemize}
-\item
-  searches in a database for similar structures (neighbours) with
-  experimental data,
-\item
-  builds a local QSAR model with these neighbours and
-\item
-  uses this model to predict the unknown activity of the query compound.
-\end{itemize}
-
-This procedure resembles an automated version of read-across predictions
-in toxicology; in machine learning terms it would be classified as a
-k-nearest-neighbour algorithm.
-
-Apart from this basic workflow, \texttt{lazar} is completely modular and
-allows the researcher to use any algorithm for similarity searches and
-local QSAR (\emph{Quantitative structure--activity relationship})
-modelling. Algorithms used within this study are described in the
-following sections.
-
-\hypertarget{neighbour-identification}{%
-\paragraph{Neighbour identification}\label{neighbour-identification}}
-
-Similarity calculations were based on MolPrint2D fingerprints (Bender et
-al. (2004)) from the OpenBabel cheminformatics library (O'Boyle et al.
-(2011)). The MolPrint2D fingerprint uses atom environments as molecular
-representation, which resembles basically the chemical concept of
-functional groups. For each atom in a molecule, it represents the
-chemical environment using the atom types of connected atoms.
-
-MolPrint2D fingerprints are generated dynamically from chemical
-structures and do not rely on predefined lists of fragments (such as
-OpenBabel FP3, FP4 or MACCs fingerprints or lists of
-toxicophores/toxicophobes). This has the advantage that they may capture
-substructures of toxicological relevance that are not included in other
-fingerprints.
-
-From MolPrint2D fingerprints a feature vector with all atom environments
-of a compound can be constructed that can be used to calculate chemical
-similarities.
-
-The chemical similarity between two compounds \(a\) and \(b\) is expressed as
-the ratio between the number of atom environments common to both structures
-\(\lvert A \cap B \rvert\) and the total number of atom environments
-\(\lvert A \cup B \rvert\) (Jaccard/Tanimoto index).
-
-\[\text{sim} = \frac{\lvert A \cap B \rvert}{\lvert A \cup B \rvert}\]
-
-Threshold selection is a trade-off between prediction accuracy (high
-threshold) and the number of predictable compounds (low threshold). As
-it is in many practical cases desirable to make predictions even in the
-absence of closely related neighbours, we follow a tiered approach:
-
-\begin{itemize}
-\item
-  First a similarity threshold of 0.5 is used to collect neighbours, to
-  create a local QSAR model and to make a prediction for the query
-  compound.
-\item
-  If any of these steps fails, the procedure is repeated with a
-  similarity threshold of 0.2 and the prediction is flagged with a
-  warning that it might be out of the applicability domain of the
-  training data.
-\item
-  Similarity thresholds of 0.5 and 0.2 are the default values chosen
-  by the software developers and remained unchanged
-  during the course of these experiments.
-\end{itemize}
-
-Compounds with the same structure as the query structure are
-automatically eliminated from neighbours to obtain unbiased predictions
-in the presence of duplicates.
-
-\hypertarget{local-qsar-models-and-predictions}{%
-\paragraph{Local QSAR models and
-predictions}\label{local-qsar-models-and-predictions}}
-
-Only similar compounds (neighbours) above the threshold are used for
-local QSAR models. In this investigation, we are using a weighted
-majority vote from the neighbours' experimental data for mutagenicity
-classifications. Probabilities for both classes
-(mutagenic/non-mutagenic) are calculated according to the following
-formula and the class with the higher probability is used as prediction
-outcome.
-
-\[p_{c} = \frac{\sum \text{sim}_{n,c}}{\sum \text{sim}_{n}}\]
-
-\(p_{c}\) Probability of class c (e.g.~mutagenic or non-mutagenic)\\
-\(\sum \text{sim}_{n,c}\) Sum of similarities of neighbours with
-class c\\
-\(\sum \text{sim}_{n}\) Sum of similarities of all neighbours
-
-\hypertarget{applicability-domain}{%
-\paragraph{Applicability domain}\label{applicability-domain}}
-
-The applicability domain (AD) of \texttt{lazar} models is determined by
-the structural diversity of the training data. If no similar compounds
-are found in the training data no predictions will be generated.
-Warnings are issued if the similarity threshold had to be lowered from
-0.5 to 0.2 in order to enable predictions. Predictions without warnings
-can be considered as close to the applicability domain and predictions
-with warnings as more distant from the applicability domain.
-Quantitative applicability domain information can be obtained from the
-similarities of individual neighbours.
-
-\hypertarget{availability}{%
-\paragraph{Availability}\label{availability}}
-
-\begin{itemize}
-\item
-  \texttt{lazar} experiments for this manuscript:
-  \url{https://git.in-silico.ch/pyrrolizidine} (source code, GPL3)
-\item
-  \texttt{lazar} framework: \url{https://git.in-silico.ch/lazar} (source
-  code, GPL3)
-\item
-  \texttt{lazar} GUI: \url{https://git.in-silico.ch/lazar-gui} (source
-  code, GPL3)
-\item
-  Public web interface: \url{https://lazar.in-silico.ch}
-\end{itemize}
-
-\hypertarget{random-forest-support-vector-machines-and-deep-learning-in-r-project}{%
-\subsubsection{Random Forest, Support Vector Machines, and Deep Learning
-in
-R-project}\label{random-forest-support-vector-machines-and-deep-learning-in-r-project}}
-
-In comparison to \texttt{lazar}, three other models (Random Forest (RF),
-Support Vector Machines (SVM), and Deep Learning (DL)) were evaluated.
-
-For the generation of these models, molecular 1D and 2D descriptors of
-the training dataset were calculated using PaDEL-Descriptors
-(\url{http://www.yapcwsoft.com} version 2.21, Yap (2011)).
-
-As the training dataset contained over 8280 instances, it was decided to
-delete instances with missing values during data pre-processing.
-Furthermore, substances with equivocal outcome were removed. The final
-training dataset contained 8080 instances with known mutagenic
-potential. The RF, SVM, and DL models were generated using the R
-software (R-project for Statistical Computing,
-\url{https://www.r-project.org/}; version 3.3.1); specific R
-packages used are identified for each step in the description below.
-During feature selection, descriptors with near-zero variance were
-removed using the `\emph{nearZeroVar}' function (package `caret'). If the
-percentage of the most common value was more than 90\% or when the
-frequency ratio of the most common value to the second most common value
-was greater than 95:5 (e.g.~95 instances of the most common value and
-only 5 or fewer instances of the second most common value), a descriptor
-was classified as having a near zero variance. After that, highly
-correlated descriptors were removed using the
-`\emph{findCorrelation}'-function (package `caret') with a cut-off of
-0.9. This resulted in a training dataset with 516 descriptors. These
-descriptors were scaled to be in the range between 0 and 1 using the
-`\emph{preProcess}'-function (package `caret'). The scaling routine was
-saved in order to apply the same scaling on the testing dataset. As
-these three steps did not consider the outcome, it was decided that they
-do not need to be included in the cross-validation of the model. To
-further reduce the number of features, a LASSO (\emph{least absolute
-shrinkage and selection operator}) regression was performed using the
-`\emph{glmnet}'-function (package `\emph{glmnet}'). The reduced dataset
-was used for the generation of the pre-trained models.
-
-For the RF model, the `\emph{randomForest}'-function (package
-`\emph{randomForest}') was used. A forest with 1000 trees with maximal
-terminal nodes of 200 was grown for the prediction.
-
-The `\emph{svm}'-function (package `e1071') with a \emph{radial basis
-function kernel} was used for the SVM model.
-
-The DL model was generated using the `\emph{h2o.deeplearning}'-function
-(package `\emph{h2o}'). The DL model contained four hidden layers with 70, 50,
-50, and 10 neurons, respectively. Other hyperparameters were set as
-follows: l1=1.0E-7, l2=1.0E-11, epsilon = 1.0E-10, rho = 0.8, and
-quantile\_alpha = 0.5. For all other hyperparameters, the default values
-were used. Weights and biases were in a first step determined with an
-unsupervised DL model. These values were then used for the actual,
-supervised DL model.
-
-To validate these models, an internal cross-validation approach was
-chosen. The training dataset was randomly split into training data, which
-contained 95\% of the data, and validation data, which contained 5\% of
-the data. A feature selection with LASSO on the training data was
-performed, reducing the number of descriptors to approximately 100. This
-step was repeated five times. Based on each of the five different
-training data, the predictive models were trained and the performance
-tested with the validation data. This step was repeated 10 times.
-Furthermore, a y-randomisation using the RF model was performed. During
-y-randomisation, the outcome (y-variable) is randomly permuted. The
-theory is that after randomisation of the outcome, the model should not
-be able to correlate the outcome to the properties (descriptor values)
-of the substances. The performance of the model should therefore
-indicate a by-chance prediction with an accuracy of about 50\%. If this
-is true, it can be concluded that correlation between actual outcome and
-properties of the substances is real and not by chance (Rücker, Rücker,
-and Meringer (2007)).
-
-\includegraphics[width=6.26875in,height=5.48611in]{media/image1.png}
-
-Figure 1: Flowchart of the generation and validation of the models
-generated in R-project
-
-\hypertarget{applicability-domain-1}{%
-\paragraph{Applicability domain}\label{applicability-domain-1}}
-
-The AD of the training dataset and the pyrrolizidine alkaloid (PA) dataset
-was evaluated using the Jaccard distance. A Jaccard distance of `0' indicates
-that the substances are similar, whereas a value of `1' shows that the
-substances are different. The Jaccard distance was below 0.2 for all PAs
-relative to the training dataset. Therefore, the PA dataset is within the AD
-of the training dataset and the models can be used to predict the genotoxic
-potential of the PA dataset.
-
-\hypertarget{y-randomisation}{%
-\paragraph{y-randomisation}\label{y-randomisation}}
-
-After y-randomisation of the outcome, the accuracy and CCR are around
-50\%, indicating a chance distribution of the results. This
-shows that the relationship between outcome and predictors in the
-original data is real and not due to chance.
-
-\hypertarget{deep-learning-in-tensorflow}{%
-\subsubsection{Deep Learning in
-TensorFlow}\label{deep-learning-in-tensorflow}}
-
-Alternatively, a DL model was established with Python-based TensorFlow
-program (\url{https://www.tensorflow.org/}) using the high-level API
-Keras (\url{https://www.tensorflow.org/guide/keras}) to build the
-models.
-
-Data pre-processing was done by rank transformation using the
-`\emph{QuantileTransformer}' procedure. A sequential model has been
-used. Four layers have been used: an input layer and two hidden layers (with
-12, 8 and 8 nodes, respectively) and one output layer. For the output
-layer, a sigmoidal activation function and for all other layers the ReLU
-(`\emph{Rectified Linear Unit}') activation function was used.
-Additionally, an L\textsuperscript{2}-penalty of 0.001 was used for the
-input layer. For training of the model, the ADAM algorithm was used to
-minimise the cross-entropy loss using the default parameters of Keras.
-Training was performed for 100 epochs with a batch size of 64. The model
-was implemented with Python 3.6 and Keras. For training of the model, a
-6-fold cross-validation was used. Accuracy was estimated by ROC-AUC and
-confusion matrix.
-
-\hypertarget{validation}{%
-\subsection{Validation}\label{validation}}
-
-\hypertarget{results}{%
-\section{Results}\label{results}}
-
-\hypertarget{lazar-1}{%
-\subsection{\texorpdfstring{\texttt{lazar}}{lazar}}\label{lazar-1}}
-
-\hypertarget{random-forest}{%
-\subsection{Random Forest}\label{random-forest}}
-
-The validation showed that the RF model has an accuracy of 64\%, a
-sensitivity of 66\% and a specificity of 63\%. The confusion matrix of
-the model, calculated for 8080 instances, is provided in Table 1.
-
-Table 1: Confusion matrix of the RF model
-
-\begin{longtable}[]{@{}lllll@{}}
-\toprule
-& Predicted genotoxicity & & &\tabularnewline
-\midrule
-\endhead
-Measured genotoxicity & & \textbf{\emph{PP}} & \textbf{\emph{PN}} &
-\textbf{\emph{Total}}\tabularnewline
-& \textbf{\emph{TP}} & 2274 & 1163 & 3437\tabularnewline
-& \textbf{\emph{TN}} & 1736 & 2907 & 4643\tabularnewline
-& \textbf{\emph{Total}} & 4010 & 4070 & 8080\tabularnewline
-\bottomrule
-\end{longtable}
-
-PP: Predicted positive; PN: Predicted negative, TP: True positive, TN:
-True negative
-
-\hypertarget{support-vector-machines}{%
-\subsection{Support Vector Machines}\label{support-vector-machines}}
-
-The validation showed that the SVM model has an accuracy of 62\%, a
-sensitivity of 65\% and a specificity of 60\%. The confusion matrix of
-the SVM model, calculated for 8080 instances, is provided in Table 2.
-
-Table 2: Confusion matrix of the SVM model
-
-\begin{longtable}[]{@{}lllll@{}}
-\toprule
-& Predicted genotoxicity & & &\tabularnewline
-\midrule
-\endhead
-Measured genotoxicity & & \textbf{\emph{PP}} & \textbf{\emph{PN}} &
-\textbf{\emph{Total}}\tabularnewline
-& \textbf{\emph{TP}} & 2057 & 1107 & 3164\tabularnewline
-& \textbf{\emph{TN}} & 1953 & 2963 & 4916\tabularnewline
-& \textbf{\emph{Total}} & 4010 & 4070 & 8080\tabularnewline
-\bottomrule
-\end{longtable}
-
-PP: Predicted positive; PN: Predicted negative, TP: True positive, TN:
-True negative
-
-\hypertarget{deep-learning-r-project}{%
-\subsection{Deep Learning (R-project)}\label{deep-learning-r-project}}
-
-The validation showed that the DL model generated in R has an accuracy
-of 59\%, a sensitivity of 89\% and a specificity of 30\%. The confusion
-matrix of the model, normalised to 8080 instances, is provided in Table
-3.
-
-Table 3: Confusion matrix of the DL model (R-project)
-
-\begin{longtable}[]{@{}lllll@{}}
-\toprule
-& Predicted genotoxicity & & &\tabularnewline
-\midrule
-\endhead
-Measured genotoxicity & & \textbf{\emph{PP}} & \textbf{\emph{PN}} &
-\textbf{\emph{Total}}\tabularnewline
-& \textbf{\emph{TP}} & 3575 & 435 & 4010\tabularnewline
-& \textbf{\emph{TN}} & 2853 & 1217 & 4070\tabularnewline
-& \textbf{\emph{Total}} & 6428 & 1652 & 8080\tabularnewline
-\bottomrule
-\end{longtable}
-
-PP: Predicted positive; PN: Predicted negative, TP: True positive, TN:
-True negative
-
-\hypertarget{dl-model-tensorflow}{%
-\subsection{DL model (TensorFlow)}\label{dl-model-tensorflow}}
-
-The validation showed that the DL model generated in TensorFlow has an
-accuracy of 68\%, a sensitivity of 70\% and a specificity of 46\%. The
-confusion matrix of the model, normalised to 8080 instances, is provided
-in Table 4.
-
-Table 4: Confusion matrix of the DL model (TensorFlow)
-
-\begin{longtable}[]{@{}lllll@{}}
-\toprule
-& Predicted genotoxicity & & &\tabularnewline
-\midrule
-\endhead
-Measured genotoxicity & & \textbf{\emph{PP}} & \textbf{\emph{PN}} &
-\textbf{\emph{Total}}\tabularnewline
-& \textbf{\emph{TP}} & 2851 & 1227 & 4078\tabularnewline
-& \textbf{\emph{TN}} & 1825 & 2177 & 4002\tabularnewline
-& \textbf{\emph{Total}} & 4676 & 3404 & 8080\tabularnewline
-\bottomrule
-\end{longtable}
-
-PP: Predicted positive; PN: Predicted negative, TP: True positive, TN:
-True negative
-
-The ROC curves from the 6-fold validation are shown in Figure 7.
-
-\includegraphics[width=3.825in,height=2.7327in]{media/image7.png}
-
-Figure 7: Six-fold cross-validation of the TensorFlow DL model shows an
-average area under the ROC-curve (ROC-AUC; measure of accuracy) of 68\%.
-
-In summary, the validation results of the four methods are presented in
-the following table.
-
-Table 5: Results of the cross-validation of the four models and after
-y-randomisation
-
-\begin{longtable}[]{@{}lllll@{}}
-\toprule
-\begin{minipage}[b]{0.28\columnwidth}\raggedright
-\strut
-\end{minipage} & \begin{minipage}[b]{0.13\columnwidth}\raggedright
-Accuracy\strut
-\end{minipage} & \begin{minipage}[b]{0.09\columnwidth}\raggedright
-CCR\strut
-\end{minipage} & \begin{minipage}[b]{0.16\columnwidth}\raggedright
-Sensitivity\strut
-\end{minipage} & \begin{minipage}[b]{0.16\columnwidth}\raggedright
-Specificity\strut
-\end{minipage}\tabularnewline
-\midrule
-\endhead
-\begin{minipage}[t]{0.28\columnwidth}\raggedright
-RF model\strut
-\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright
-64.1\%\strut
-\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\raggedright
-64.4\%\strut
-\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
-66.2\%\strut
-\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
-62.6\%\strut
-\end{minipage}\tabularnewline
-\begin{minipage}[t]{0.28\columnwidth}\raggedright
-SVM model\strut
-\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright
-62.1\%\strut
-\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\raggedright
-62.6\%\strut
-\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
-65.0\%\strut
-\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
-60.3\%\strut
-\end{minipage}\tabularnewline
-\begin{minipage}[t]{0.28\columnwidth}\raggedright
-DL model\\
-(R-project)\strut
-\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright
-59.3\%\strut
-\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\raggedright
-59.5\%\strut
-\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
-89.2\%\strut
-\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
-29.9\%\strut
-\end{minipage}\tabularnewline
-\begin{minipage}[t]{0.28\columnwidth}\raggedright
-DL model (TensorFlow)\strut
-\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright
-68\%\strut
-\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\raggedright
-62.2\%\strut
-\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
-69.9\%\strut
-\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
-45.6\%\strut
-\end{minipage}\tabularnewline
-\begin{minipage}[t]{0.28\columnwidth}\raggedright
-y-randomisation\strut
-\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright
-50.5\%\strut
-\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\raggedright
-50.4\%\strut
-\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
-50.3\%\strut
-\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
-50.6\%\strut
-\end{minipage}\tabularnewline
-\bottomrule
-\end{longtable}
-
-CCR (correct classification rate)
-
-\hypertarget{discussion}{%
-\section{Discussion}\label{discussion}}
-
-General model performance
-
-Based on the results of the cross-validation for all models
-(\texttt{lazar}, RF, SVM, DL (R-project) and DL (TensorFlow)), it can be
-stated that the prediction results are not optimal due to different
-reasons. The accuracy as measured during cross-validation of the four
-models (RF, SVM, DL (R-project and TensorFlow)) was partly low, with
-values between 59.3 and 68\%, with the R-generated DL model and the
-TensorFlow-generated DL model showing the worst and the best
-performance, respectively. The validation of the R-generated DL model
-revealed a high sensitivity (89.2\%) but an unacceptably low specificity
-of 29.9\% indicating a high number of false positive estimates. The
-TensorFlow-generated DL model, however, showed an acceptable but not
-optimal accuracy of 68\%, a sensitivity of 69.9\% and a specificity of
-45.6\%. The low specificity indicates that both DL models tend to
-predict too many instances as positive (genotoxic), and therefore have a
-high false positive rate. At least the TensorFlow-generated
-DL model allows group statements to be made, but the confidence for
-estimations of single PAs appears to be insufficient.
-
-Several factors have likely contributed to the low to moderate
-performance of the used methods as shown during the cross-validation:
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi}.}
-\tightlist
-\item
-  The outcome in the training dataset was based on the results of Ames
-  tests for genotoxicity (ICH 2011), an
-  \emph{in vitro} test in different strains of the bacteria
-  \emph{Salmonella typhimurium}. In this test, mutagenicity is evaluated
-  with and without prior metabolic activation of the test substance.
-  Metabolic activation could result in the formation of genotoxic
-  metabolites from non-genotoxic parent compounds. However, no
-  distinction was made in the training dataset between substances that
-  needed metabolic activation before being mutagenic and those that were
-  mutagenic without metabolic activation. \texttt{lazar} is able to
-  handle this `inaccuracy' in the training dataset well due to the way
-  the algorithm works: \texttt{lazar} predicts the genotoxic potential
-  based on the neighbours of substances with comparable structural
-  features, considering mutagenic and not mutagenic neighbours. Based on
-  the structural similarity, a probability for mutagenicity and no
-  mutagenicity is calculated independently from each other (meaning that
-  the sum of probabilities does not necessarily add up to 100\%). The
-  class with the higher outcome is then the overall outcome for the
-  substance.
-\end{enumerate}
-
-\begin{quote}
-In contrast, the other models need to be trained first to recognise the
-structural features that are responsible for genotoxicity. Therefore,
-the mixture of substances being mutagenic with and without metabolic
-activation in the training dataset may have adversely affected the
-ability to separate the dataset in two distinct classes and thus
-explains the relatively low performance of these models.
-\end{quote}
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi}.}
-\setcounter{enumi}{1}
-\tightlist
-\item
-  Machine learning algorithms try to find an optimized solution in a
-  high-dimensional (one dimension per predictor) space. Sometimes
-  these methods do not find the global optimum of estimates but only
-  local (not optimal) solutions. Strategies to find the global solutions
-  are systematic variation (grid search) of the hyperparameters of the
-  methods, which may be very time consuming in particular in large
-  datasets.
-\end{enumerate}
-
-\hypertarget{conclusions}{%
-\section{Conclusions}\label{conclusions}}
-
-In this study, an attempt was made to predict the genotoxic potential of
-PAs using five different machine learning techniques (\texttt{lazar},
-RF, SVM, DL (R-project and TensorFlow)). The results of all models fitted
-only partly to the findings in literature, with best results obtained
-with the TensorFlow DL model. Therefore, modelling allows statements on
-the relative risks of genotoxicity of the different PA groups.
-Individual predictions for selective PAs appear, however, not reliable
-on the current basis of the used training dataset.
-
-This study emphasises the importance of critical assessment of
-predictions by QSAR models. This includes not only extensive literature
-research to assess the plausibility of the predictions, but also a good
-knowledge of the metabolism of the test substances and understanding for
-possible mechanisms of toxicity.
-
-In further studies, additional machine learning techniques or a modified
-(extended) training dataset should be used for an additional attempt to
-predict the genotoxic potential of PAs.
-
-\hypertarget{references}{%
-\section*{References}\label{references}}
-\addcontentsline{toc}{section}{References}
-
-\hypertarget{refs}{}
-\leavevmode\hypertarget{ref-Bender2004}{}%
-Bender, Andreas, Hamse Y. Mussa, Robert C. Glen, and Stephan Reiling.
-2004. ``Molecular Similarity Searching Using Atom Environments,
-Information-Based Feature Selection, and a Naïve Bayesian Classifier.''
-\emph{Journal of Chemical Information and Computer Sciences} 44 (1):
-170--78. \url{https://doi.org/10.1021/ci034207y}.
-
-\leavevmode\hypertarget{ref-Hansen2009}{}%
-Hansen, Katja, Sebastian Mika, Timon Schroeter, Andreas Sutter, Antonius
-ter Laak, Thomas Steger-Hartmann, Nikolaus Heinrich, and Klaus-Robert
-Müller. 2009. ``Benchmark Data Set for in Silico Prediction of Ames
-Mutagenicity.'' \emph{Journal of Chemical Information and Modeling} 49
-(9): 2077--81. \url{https://doi.org/10.1021/ci900161g}.
-
-\leavevmode\hypertarget{ref-Kazius2005}{}%
-Kazius, J., R. McGuire, and R. Bursi. 2005. ``Derivation and Validation
-of Toxicophores for Mutagenicity Prediction.'' \emph{J Med Chem}, no.
-48: 312--20.
-
-\leavevmode\hypertarget{ref-OBoyle2011a}{}%
-O'Boyle, Noel, Michael Banck, Craig James, Chris Morley, Tim
-Vandermeersch, and Geoffrey Hutchison. 2011. ``Open Babel: An open
-chemical toolbox.'' \emph{J. Cheminf.} 3 (1): 33.
-\url{https://doi.org/10.1186/1758-2946-3-33}.
-
-\leavevmode\hypertarget{ref-Ruxfccker2007}{}%
-Rücker, C, G Rücker, and M. Meringer. 2007. ``Y-Randomization and Its
-Variants in QSPR/QSAR.'' \emph{J. Chem. Inf. Model.}, no. 47: 2345--57.
-
-\leavevmode\hypertarget{ref-Yap2011}{}%
-Yap, CW. 2011. ``PaDEL-Descriptor: An Open Source Software to Calculate
-Molecular Descriptors and Fingerprints.'' \emph{Journal of Computational
-Chemistry}, no. 32: 1466--74.
-
-\end{document}