diff options
Diffstat (limited to 'paper/outfile.latex')
-rw-r--r-- | paper/outfile.latex | 779 |
1 files changed, 779 insertions, 0 deletions
diff --git a/paper/outfile.latex b/paper/outfile.latex new file mode 100644 index 0000000..9af84b1 --- /dev/null +++ b/paper/outfile.latex @@ -0,0 +1,779 @@ +\documentclass[]{scrartcl} +\usepackage{lmodern} +\usepackage{amssymb,amsmath} +\usepackage{ifxetex,ifluatex} +\usepackage{fixltx2e} % provides \textsubscript +\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex + \usepackage[T1]{fontenc} + \usepackage[utf8]{inputenc} +\else % if luatex or xelatex + \ifxetex + \usepackage{mathspec} + \else + \usepackage{fontspec} + \fi + \defaultfontfeatures{Ligatures=TeX,Scale=MatchLowercase} +\fi +% use upquote if available, for straight quotes in verbatim environments +\IfFileExists{upquote.sty}{\usepackage{upquote}}{} +% use microtype if available +\IfFileExists{microtype.sty}{% +\usepackage{microtype} +\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts +}{} +\usepackage[unicode=true]{hyperref} +\hypersetup{ + pdftitle={A comparison of random forest, support vector machine, deep learning and lazar algorithms for predicting mutagenicity}, + pdfkeywords={mutagenicity, (Q)SAR, lazar, random forest, support vector machine, deep +learning}, + pdfborder={0 0 0}, + breaklinks=true} +\urlstyle{same} % don't use monospace font for urls +\usepackage{longtable,booktabs} +\usepackage{graphicx,grffile} +\makeatletter +\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} +\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} +\makeatother +% Scale images if necessary, so that they will not overflow the page +% margins by default, and it is still possible to overwrite the defaults +% using explicit options in \includegraphics[width, height, ...]{} +\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} +\IfFileExists{parskip.sty}{% +\usepackage{parskip} +}{% else +\setlength{\parindent}{0pt} +\setlength{\parskip}{6pt plus 2pt minus 1pt} +} +\setlength{\emergencystretch}{3em} % prevent overfull lines 
+\providecommand{\tightlist}{% + \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} +\setcounter{secnumdepth}{0} +% Redefines (sub)paragraphs to behave more like sections +\ifx\paragraph\undefined\else +\let\oldparagraph\paragraph +\renewcommand{\paragraph}[1]{\oldparagraph{#1}\mbox{}} +\fi +\ifx\subparagraph\undefined\else +\let\oldsubparagraph\subparagraph +\renewcommand{\subparagraph}[1]{\oldsubparagraph{#1}\mbox{}} +\fi + +\title{A comparison of random forest, support vector machine, deep learning and +lazar algorithms for predicting mutagenicity} +\usepackage{authblk} +\author[% + 1% + ]{% + Christoph Helma% + % + \textsuperscript{*\,}% + %% + % +} +\author[% + 2% + ]{% + Verena Schöning% + % + % +} +\author[% + 2% + ]{% + Philipp Boss% + % + % +} +\author[% + 2% + ]{% + Jürgen Drewe% + % + % +} +\affil[1]{\normalsize in silico toxicology gmbh, \footnotesize Rastatterstrasse 41, 4057 Basel, Switzerland} +\affil[2]{\normalsize Zeller AG, \footnotesize Seeblickstrasse 4, 8590 Romanshorn, Switzerland} +\date{} + +\makeatletter +\def\@maketitle{% + \newpage \null \vskip 2em + \begin {center}% + \let \footnote \thanks + {\LARGE \@title \par}% + \vskip 1.5em% + {\large \lineskip .5em% + \begin {tabular}[t]{c}% + \@author + \end {tabular}\par}% + \vskip 0.2em{\textsuperscript{*}\,Correspondence: + Christoph Helma <helma@in-silico.ch>\\ + }% + % \vskip 1em{\large \@date}% + \end {center}% + \par + \vskip 1.5em} +\makeatother + +\begin{document} + +\maketitle + +\begin{abstract} +k-nearest neighbor (\texttt{lazar}), random forest, support vector +machine and deep learning algorithms were applied to a new +\emph{Salmonella} mutagenicity dataset with 8281 unique chemical +structures. Algorithm performance was evaluated using 5-fold +crossvalidation. 
TODO - results - conclusion +\end{abstract} + +\hypertarget{introduction}{% +\section{Introduction}\label{introduction}} + +TODO: algo history + +TODO: dataset history + +TODO: open problems + +\hypertarget{materials-and-methods}{% +\section{Materials and Methods}\label{materials-and-methods}} + +\hypertarget{mutagenicity-data}{% +\subsection{Mutagenicity data}\label{mutagenicity-data}} + +For all methods, the same training dataset was used. The training +dataset was compiled from the following sources: + +\begin{itemize} +\item + Kazius/Bursi Dataset (4337 compounds, Kazius, McGuire, and Bursi + (2005)): \url{http://cheminformatics.org/datasets/bursi/cas_4337.zip} +\item + Hansen Dataset (6513 compounds, Hansen et al. (2009)): + \url{http://doc.ml.tu-berlin.de/toxbenchmark/Mutagenicity_N6512.csv} +\item + EFSA Dataset (695 compounds): + \url{https://data.europa.eu/euodp/data/storage/f/2017-0719T142131/GENOTOX\%20data\%20and\%20dictionary.xls} +\end{itemize} + +Mutagenicity classifications from Kazius and Hansen datasets were used +without further processing. To achieve consistency between these +datasets, EFSA compounds were classified as mutagenic, if at least one +positive result was found for TA98 or TA100 Salmonella strains. + +Dataset merges were based on unique SMILES (\emph{Simplified Molecular +Input Line Entry Specification}) strings of the compound structures. +Duplicated experimental data with the same outcome was merged into a +single value, because it is likely that it originated from the same +experiment. Contradictory results were kept as multiple measurements in +the database. The combined training dataset contains 8281 unique +structures. + +Source code for all data download, extraction and merge operations is +publicly available from the git repository +\url{https://git.in-silico.ch/pyrrolizidine} under a GPL3 License. 
+ +TODO: check/fix git repo + +For the Random Forest (RF), Support Vector Machines (SVM), and Deep +Learning (DL) models, molecular descriptors were calculated with the +PaDEL-Descriptors program (\url{http://www.yapcwsoft.com} version 2.21, +Yap (2011)). + +TODO: sentence ?? + +From these descriptors were chosen, which were actually used for the +generation of the DL model. + +\hypertarget{algorithms}{% +\subsection{Algorithms}\label{algorithms}} + +\hypertarget{lazar}{% +\subsubsection{\texorpdfstring{\texttt{lazar}}{lazar}}\label{lazar}} + +\texttt{lazar} (\emph{lazy structure activity relationships}) is a +modular framework for read-across model development and validation. It +follows the following basic workflow: For a given chemical structure +\texttt{lazar}: + +\begin{itemize} +\item + searches in a database for similar structures (neighbours) with + experimental data, +\item + builds a local QSAR model with these neighbours and +\item + uses this model to predict the unknown activity of the query compound. +\end{itemize} + +This procedure resembles an automated version of read across predictions +in toxicology, in machine learning terms it would be classified as a +k-nearest-neighbour algorithm. + +Apart from this basic workflow, \texttt{lazar} is completely modular and +allows the researcher to use any algorithm for similarity searches and +local QSAR (\emph{Quantitative structure--activity relationship}) +modelling. Algorithms used within this study are described in the +following sections. + +\hypertarget{neighbour-identification}{% +\paragraph{Neighbour identification}\label{neighbour-identification}} + +Similarity calculations were based on MolPrint2D fingerprints (Bender et +al. (2004)) from the OpenBabel cheminformatics library (O'Boyle et al. +(2011)). The MolPrint2D fingerprint uses atom environments as molecular +representation, which resembles basically the chemical concept of +functional groups. 
For each atom in a molecule, it represents the +chemical environment using the atom types of connected atoms. + +MolPrint2D fingerprints are generated dynamically from chemical +structures and do not rely on predefined lists of fragments (such as +OpenBabel FP3, FP4 or MACCS fingerprints or lists of +toxicophores/toxicophobes). This has the advantage that they may capture +substructures of toxicological relevance that are not included in other +fingerprints. + +From MolPrint2D fingerprints a feature vector with all atom environments +of a compound can be constructed that can be used to calculate chemical +similarities. + +The chemical similarity between two compounds \(a\) and \(b\) is expressed as +the proportion between atom environments common in both structures \(A \cap B\) +and the total number of atom environments \(A \cup B\) (Jaccard/Tanimoto +index). + +\[sim = \frac{\left| A\ \cap B \right|}{\left| A\ \cup B \right|}\] + +Threshold selection is a trade-off between prediction accuracy (high +threshold) and the number of predictable compounds (low threshold). As +it is in many practical cases desirable to make predictions even in the +absence of closely related neighbours, we follow a tiered approach: + +\begin{itemize} +\item + First a similarity threshold of 0.5 is used to collect neighbours, to + create a local QSAR model and to make a prediction for the query + compound. +\item + If any of these steps fails, the procedure is repeated with a + similarity threshold of 0.2 and the prediction is flagged with a + warning that it might be out of the applicability domain of the + training data. +\item + Similarity thresholds of 0.5 and 0.2 are the default values chosen + by the software developers and remained unchanged + during the course of these experiments. +\end{itemize} + +Compounds with the same structure as the query structure are +automatically eliminated from neighbours to obtain unbiased predictions +in the presence of duplicates. 
+ +\hypertarget{local-qsar-models-and-predictions}{% +\paragraph{Local QSAR models and +predictions}\label{local-qsar-models-and-predictions}} + +Only similar compounds (neighbours) above the threshold are used for +local QSAR models. In this investigation, we are using a weighted +majority vote from the neighbour's experimental data for mutagenicity +classifications. Probabilities for both classes +(mutagenic/non-mutagenic) are calculated according to the following +formula and the class with the higher probability is used as prediction +outcome. + +\[p_{c} = \ \frac{\sum_{}^{}\text{sim}_{n,c}}{\sum_{}^{}\text{sim}_{n}}\] + +\(p_{c}\) Probability of class c (e.g.~mutagenic or non-mutagenic)\\ +\(\sum_{}^{}\text{sim}_{n,c}\) Sum of similarities of neighbours with +class c\\ +\(\sum_{}^{}\text{sim}_{n}\) Sum of all neighbours + +\hypertarget{applicability-domain}{% +\paragraph{Applicability domain}\label{applicability-domain}} + +The applicability domain (AD) of \texttt{lazar} models is determined by +the structural diversity of the training data. If no similar compounds +are found in the training data no predictions will be generated. +Warnings are issued if the similarity threshold had to be lowered from +0.5 to 0.2 in order to enable predictions. Predictions without warnings +can be considered as close to the applicability domain and predictions +with warnings as more distant from the applicability domain. +Quantitative applicability domain information can be obtained from the +similarities of individual neighbours. 
+ +\hypertarget{availability}{% +\paragraph{Availability}\label{availability}} + +\begin{itemize} +\item + \texttt{lazar} experiments for this manuscript: + \url{https://git.in-silico.ch/pyrrolizidine} (source code, GPL3) +\item + \texttt{lazar} framework: \url{https://git.in-silico.ch/lazar} (source + code, GPL3) +\item + \texttt{lazar} GUI: \url{https://git.in-silico.ch/lazar-gui} (source + code, GPL3) +\item + Public web interface: \url{https://lazar.in-silico.ch} +\end{itemize} + +\hypertarget{random-forest-support-vector-machines-and-deep-learning-in-r-project}{% +\subsubsection{Random Forest, Support Vector Machines, and Deep Learning +in +R-project}\label{random-forest-support-vector-machines-and-deep-learning-in-r-project}} + +In comparison to \texttt{lazar}, three other models (Random Forest (RF), +Support Vector Machines (SVM), and Deep Learning (DL)) were evaluated. + +For the generation of these models, molecular 1D and 2D descriptors of +the training dataset were calculated using PaDEL-Descriptors +(\url{http://www.yapcwsoft.com} version 2.21, Yap (2011)). + +As the training dataset contained over 8280 instances, it was decided to +delete instances with missing values during data pre-processing. +Furthermore, substances with equivocal outcome were removed. The final +training dataset contained 8080 instances with known mutagenic +potential. The RF, SVM, and DL models were generated using the R +software (R-project for Statistical Computing, +\url{https://www.r-project.org/}\emph{;} version 3.3.1), specific R +packages used are identified for each step in the description below. +During feature selection, descriptor with near zero variance were +removed using `\emph{NearZeroVar}'-function (package `caret'). 
If the +percentage of the most common value was more than 90\% or when the +frequency ratio of the most common value to the second most common value +was greater than 95:5 (e.g.~95 instances of the most common value and +only 5 or fewer instances of the second most common value), a descriptor +was classified as having a near zero variance. After that, highly +correlated descriptors were removed using the +`\emph{findCorrelation}'-function (package `caret') with a cut-off of +0.9. This resulted in a training dataset with 516 descriptors. These +descriptors were scaled to be in the range between 0 and 1 using the +`\emph{preProcess}'-function (package `caret'). The scaling routine was +saved in order to apply the same scaling on the testing dataset. As +these three steps did not consider the outcome, it was decided that they +do not need to be included in the cross-validation of the model. To +further reduce the number of features, a LASSO (\emph{least absolute +shrinkage and selection operator}) regression was performed using the +`\emph{glmnet}'-function (package `\emph{glmnet}'). The reduced dataset +was used for the generation of the pre-trained models. + +For the RF model, the `\emph{randomForest}'-function (package +`\emph{randomForest}') was used. A forest with 1000 trees with maximal +terminal nodes of 200 was grown for the prediction. + +The `\emph{svm}'-function (package `e1071') with a \emph{radial basis +function kernel} was used for the SVM model. + +The DL model was generated using the `\emph{h2o.deeplearning}'-function +(package `\emph{h2o}'). The DL contained four hidden layers with 70, 50, +50, and 10 neurons, respectively. Other hyperparameters were set as +follows: l1=1.0E-7, l2=1.0E-11, epsilon = 1.0E-10, rho = 0.8, and +quantile\_alpha = 0.5. For all other hyperparameters, the default values +were used. Weights and biases were in a first step determined with an +unsupervised DL model. These values were then used for the actual, +supervised DL model. 
+ +To validate these models, an internal cross-validation approach was +chosen. The training dataset was randomly split into training data, which +contained 95\% of the data, and validation data, which contained 5\% of +the data. A feature selection with LASSO on the training data was +performed, reducing the number of descriptors to approximately 100. This +step was repeated five times. Based on each of the five different +training data, the predictive models were trained and the performance +tested with the validation data. This step was repeated 10 times. +Furthermore, a y-randomisation using the RF model was performed. During +y-randomisation, the outcome (y-variable) is randomly permuted. The +theory is that after randomisation of the outcome, the model should not +be able to correlate the outcome to the properties (descriptor values) +of the substances. The performance of the model should therefore +indicate a by-chance prediction with an accuracy of about 50\%. If this +is true, it can be concluded that correlation between actual outcome and +properties of the substances is real and not by chance (Rücker, Rücker, +and Meringer (2007)). + +\includegraphics[width=6.26875in,height=5.48611in]{media/image1.png} + +Figure 1: Flowchart of the generation and validation of the models +generated in R-project + +\hypertarget{applicability-domain-1}{% +\paragraph{Applicability domain}\label{applicability-domain-1}} + +The AD of the training dataset and the PA dataset was evaluated using +the Jaccard distance. A Jaccard distance of `0' indicates that the +substances are similar, whereas a value of `1' shows that the substances +are different. The Jaccard distance was below 0.2 for all PAs relative +to the training dataset. Therefore, the PA dataset is within the AD of the +training dataset and the models can be used to predict the genotoxic +potential of the PA dataset. 
+ +\hypertarget{y-randomisation}{% +\paragraph{y-randomisation}\label{y-randomisation}} + +After y-randomisation of the outcome, the accuracy and CCR are around +50\%, indicating a chance distribution of the results. This +shows that the outcome is actually related to the predictors and not by +chance. + +\hypertarget{deep-learning-in-tensorflow}{% +\subsubsection{Deep Learning in +TensorFlow}\label{deep-learning-in-tensorflow}} + +Alternatively, a DL model was established with the Python-based TensorFlow +program (\url{https://www.tensorflow.org/}) using the high-level API +Keras (\url{https://www.tensorflow.org/guide/keras}) to build the +models. + +Data pre-processing was done by rank transformation using the +`\emph{QuantileTransformer}' procedure. A sequential model was +used. Four layers were used: input layer, two hidden layers (with +12, 8 and 8 nodes, respectively) and one output layer. For the output +layer, a sigmoidal activation function and for all other layers the ReLU +(`\emph{Rectified Linear Unit}') activation function was used. +Additionally, an L\textsuperscript{2}-penalty of 0.001 was used for the +input layer. For training of the model, the ADAM algorithm was used to +minimise the cross-entropy loss using the default parameters of Keras. +Training was performed for 100 epochs with a batch size of 64. The model +was implemented with Python 3.6 and Keras. For training of the model, a +6-fold cross-validation was used. Accuracy was estimated by ROC-AUC and +confusion matrix. + +\hypertarget{validation}{% +\subsection{Validation}\label{validation}} + +\hypertarget{results}{% +\section{Results}\label{results}} + +\hypertarget{lazar-1}{% +\subsection{\texorpdfstring{\texttt{lazar}}{lazar}}\label{lazar-1}} + +\hypertarget{random-forest}{% +\subsection{Random Forest}\label{random-forest}} + +The validation showed that the RF model has an accuracy of 64\%, a +sensitivity of 66\% and a specificity of 63\%. 
The confusion matrix of +the model, calculated for 8080 instances, is provided in Table 1. + +Table 1: Confusion matrix of the RF model + +\begin{longtable}[]{@{}lllll@{}} +\toprule +& Predicted genotoxicity & & &\tabularnewline +\midrule +\endhead +Measured genotoxicity & & \textbf{\emph{PP}} & \textbf{\emph{PN}} & +\textbf{\emph{Total}}\tabularnewline +& \textbf{\emph{TP}} & 2274 & 1163 & 3437\tabularnewline +& \textbf{\emph{TN}} & 1736 & 2907 & 4643\tabularnewline +& \textbf{\emph{Total}} & 4010 & 4070 & 8080\tabularnewline +\bottomrule +\end{longtable} + +PP: Predicted positive; PN: Predicted negative, TP: True positive, TN: +True negative + +\hypertarget{support-vector-machines}{% +\subsection{Support Vector Machines}\label{support-vector-machines}} + +The validation showed that the SVM model has an accuracy of 62\%, a +sensitivity of 65\% and a specificity of 60\%. The confusion matrix of +SVM model, calculated for 8080 instances, is provided in Table 2. + +Table 2: Confusion matrix of the SVM model + +\begin{longtable}[]{@{}lllll@{}} +\toprule +& Predicted genotoxicity & & &\tabularnewline +\midrule +\endhead +Measured genotoxicity & & \textbf{\emph{PP}} & \textbf{\emph{PN}} & +\textbf{\emph{Total}}\tabularnewline +& \textbf{\emph{TP}} & 2057 & 1107 & 3164\tabularnewline +& \textbf{\emph{TN}} & 1953 & 2963 & 4916\tabularnewline +& \textbf{\emph{Total}} & 4010 & 4070 & 8080\tabularnewline +\bottomrule +\end{longtable} + +PP: Predicted positive; PN: Predicted negative, TP: True positive, TN: +True negative + +\hypertarget{deep-learning-r-project}{% +\subsection{Deep Learning (R-project)}\label{deep-learning-r-project}} + +The validation showed that the DL model generated in R has an accuracy +of 59\%, a sensitivity of 89\% and a specificity of 30\%. The confusion +matrix of the model, normalised to 8080 instances, is provided in Table +3. 
+ +Table 3: Confusion matrix of the DL model (R-project) + +\begin{longtable}[]{@{}lllll@{}} +\toprule +& Predicted genotoxicity & & &\tabularnewline +\midrule +\endhead +Measured genotoxicity & & \textbf{\emph{PP}} & \textbf{\emph{PN}} & +\textbf{\emph{Total}}\tabularnewline +& \textbf{\emph{TP}} & 3575 & 435 & 4010\tabularnewline +& \textbf{\emph{TN}} & 2853 & 1217 & 4070\tabularnewline +& \textbf{\emph{Total}} & 6428 & 1652 & 8080\tabularnewline +\bottomrule +\end{longtable} + +PP: Predicted positive; PN: Predicted negative, TP: True positive, TN: +True negative + +\hypertarget{dl-model-tensorflow}{% +\subsection{DL model (TensorFlow)}\label{dl-model-tensorflow}} + +The validation showed that the DL model generated in TensorFlow has an +accuracy of 68\%, a sensitivity of 70\% and a specificity of 46\%. The +confusion matrix of the model, normalised to 8080 instances, is provided +in Table 4. + +Table 4: Confusion matrix of the DL model (TensorFlow) + +\begin{longtable}[]{@{}lllll@{}} +\toprule +& Predicted genotoxicity & & &\tabularnewline +\midrule +\endhead +Measured genotoxicity & & \textbf{\emph{PP}} & \textbf{\emph{PN}} & +\textbf{\emph{Total}}\tabularnewline +& \textbf{\emph{TP}} & 2851 & 1227 & 4078\tabularnewline +& \textbf{\emph{TN}} & 1825 & 2177 & 4002\tabularnewline +& \textbf{\emph{Total}} & 4676 & 3404 & 8080\tabularnewline +\bottomrule +\end{longtable} + +PP: Predicted positive; PN: Predicted negative, TP: True positive, TN: +True negative + +The ROC curves from the 6-fold validation are shown in Figure 7. + +\includegraphics[width=3.825in,height=2.7327in]{media/image7.png} + +Figure 7: Six-fold cross-validation of TensorFlow DL model show an +average area under the ROC-curve (ROC-AUC; measure of accuracy) of 68\%. + +In summary, the validation results of the four methods are presented in +the following table. 
+ +Table 5 Results of the cross-validation of the four models and after +y-randomisation + +\begin{longtable}[]{@{}lllll@{}} +\toprule +\begin{minipage}[b]{0.28\columnwidth}\raggedright +\strut +\end{minipage} & \begin{minipage}[b]{0.13\columnwidth}\raggedright +Accuracy\strut +\end{minipage} & \begin{minipage}[b]{0.09\columnwidth}\raggedright +CCR\strut +\end{minipage} & \begin{minipage}[b]{0.16\columnwidth}\raggedright +Sensitivity\strut +\end{minipage} & \begin{minipage}[b]{0.16\columnwidth}\raggedright +Specificity\strut +\end{minipage}\tabularnewline +\midrule +\endhead +\begin{minipage}[t]{0.28\columnwidth}\raggedright +RF model\strut +\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright +64.1\%\strut +\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\raggedright +64.4\%\strut +\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright +66.2\%\strut +\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright +62.6\%\strut +\end{minipage}\tabularnewline +\begin{minipage}[t]{0.28\columnwidth}\raggedright +SVM model\strut +\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright +62.1\%\strut +\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\raggedright +62.6\%\strut +\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright +65.0\%\strut +\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright +60.3\%\strut +\end{minipage}\tabularnewline +\begin{minipage}[t]{0.28\columnwidth}\raggedright +DL model\\ +(R-project)\strut +\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright +59.3\%\strut +\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\raggedright +59.5\%\strut +\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright +89.2\%\strut +\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright +29.9\%\strut +\end{minipage}\tabularnewline +\begin{minipage}[t]{0.28\columnwidth}\raggedright +DL model (TensorFlow)\strut +\end{minipage} & 
\begin{minipage}[t]{0.13\columnwidth}\raggedright +68\%\strut +\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\raggedright +62.2\%\strut +\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright +69.9\%\strut +\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright +45.6\%\strut +\end{minipage}\tabularnewline +\begin{minipage}[t]{0.28\columnwidth}\raggedright +y-randomisation\strut +\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright +50.5\%\strut +\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\raggedright +50.4\%\strut +\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright +50.3\%\strut +\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright +50.6\%\strut +\end{minipage}\tabularnewline +\bottomrule +\end{longtable} + +CCR (correct classification rate) + +\hypertarget{discussion}{% +\section{Discussion}\label{discussion}} + +General model performance + +Based on the results of the cross-validation for all models, +\texttt{lazar}, RF, SVM, DL (R-project) and DL (TensorFlow), it can be +stated that the prediction results are not optimal due to different +reasons. The accuracy as measured during cross-validation of the four +models (RF, SVM, DL (R-project and TensorFlow)) was partly low with CCR +values between 59.3 and 68\%, with the R-generated DL model and the +TensorFlow-generated DL model showing the worst and the best +performance, respectively. The validation of the R-generated DL model +revealed a high sensitivity (89.2\%) but an unacceptably low specificity +of 29.9\% indicating a high number of false positive estimates. The +TensorFlow-generated DL model, however, showed an acceptable but not +optimal accuracy of 68\%, a sensitivity of 69.9\% and a specificity of +45.6\%. The low specificity indicates that both DL models tend to +predict too many instances as positive (genotoxic), and therefore have a +high false positive rate. 
This allows, at least with the TensorFlow-generated +DL model, to make group statements, but the confidence for +estimations of single PAs appears to be insufficient. + +Several factors have likely contributed to the low to moderate +performance of the used methods as shown during the cross-validation: + +\begin{enumerate} +\def\labelenumi{\arabic{enumi}.} +\tightlist +\item + The outcome in the training dataset was based on the results of Ames + tests for genotoxicity (\protect\hyperlink{_ENREF_63}{ICH 2011}), an + \emph{in vitro} test in different strains of the bacteria + \emph{Salmonella typhimurium}. In this test, mutagenicity is evaluated + with and without prior metabolic activation of the test substance. + Metabolic activation could result in the formation of genotoxic + metabolites from non-genotoxic parent compounds. However, no + distinction was made in the training dataset between substances that + needed metabolic activation before being mutagenic and those that were + mutagenic without metabolic activation. \texttt{lazar} is able to + handle this `inaccuracy' in the training dataset well due to the way + the algorithm works: \texttt{lazar} predicts the genotoxic potential + based on the neighbours of substances with comparable structural + features, considering mutagenic and not mutagenic neighbours. Based on + the structural similarity, a probability for mutagenicity and + non-mutagenicity is calculated independently from each other (meaning that + the sum of probabilities does not necessarily add up to 100\%). The + class with the higher outcome is then the overall outcome for the + substance. +\end{enumerate} + +\begin{quote} +In contrast, the other models need to be trained first to recognise the +structural features that are responsible for genotoxicity. 
Therefore, +the mixture of substances being mutagenic with and without metabolic +activation in the training dataset may have adversely affected the +ability to separate the dataset into two distinct classes and thus +explains the relatively low performance of these models. +\end{quote} + +\begin{enumerate} +\def\labelenumi{\arabic{enumi}.} +\setcounter{enumi}{1} +\tightlist +\item + Machine learning algorithms try to find an optimized solution in a + high-dimensional (one dimension per predictor) space. Sometimes + these methods do not find the global optimum of estimates but only + local (not optimal) solutions. Strategies to find the global solutions + are systematic variation (grid search) of the hyperparameters of the + methods, which may be very time consuming in particular in large + datasets. +\end{enumerate} + +\hypertarget{conclusions}{% +\section{Conclusions}\label{conclusions}} + +In this study, an attempt was made to predict the genotoxic potential of +PAs using five different machine learning techniques (\texttt{lazar}, +RF, SVM, DL (R-project and TensorFlow)). The results of all models fitted +only partly to the findings in literature, with best results obtained +with the TensorFlow DL model. Therefore, modelling allows statements on +the relative risks of genotoxicity of the different PA groups. +Individual predictions for selected PAs appear, however, not reliable +on the current basis of the used training dataset. + +This study emphasises the importance of critical assessment of +predictions by QSAR models. This includes not only extensive literature +research to assess the plausibility of the predictions, but also a good +knowledge of the metabolism of the test substances and understanding of +possible mechanisms of toxicity. + +In further studies, additional machine learning techniques or a modified +(extended) training dataset should be used for an additional attempt to +predict the genotoxic potential of PAs. 
+ +\hypertarget{references}{% +\section*{References}\label{references}} +\addcontentsline{toc}{section}{References} + +\hypertarget{refs}{} +\leavevmode\hypertarget{ref-Bender2004}{}% +Bender, Andreas, Hamse Y. Mussa, Robert C. Glen, and Stephan Reiling. +2004. ``Molecular Similarity Searching Using Atom Environments, +Information-Based Feature Selection, and a Naïve Bayesian Classifier.'' +\emph{Journal of Chemical Information and Computer Sciences} 44 (1): +170--78. \url{https://doi.org/10.1021/ci034207y}. + +\leavevmode\hypertarget{ref-Hansen2009}{}% +Hansen, Katja, Sebastian Mika, Timon Schroeter, Andreas Sutter, Antonius +ter Laak, Thomas Steger-Hartmann, Nikolaus Heinrich, and Klaus-Robert +Müller. 2009. ``Benchmark Data Set for in Silico Prediction of Ames +Mutagenicity.'' \emph{Journal of Chemical Information and Modeling} 49 +(9): 2077--81. \url{https://doi.org/10.1021/ci900161g}. + +\leavevmode\hypertarget{ref-Kazius2005}{}% +Kazius, J., R. McGuire, and R. Bursi. 2005. ``Derivation and Validation +of Toxicophores for Mutagenicity Prediction.'' \emph{J Med Chem}, no. +48: 312--20. + +\leavevmode\hypertarget{ref-OBoyle2011a}{}% +O'Boyle, Noel, Michael Banck, Craig James, Chris Morley, Tim +Vandermeersch, and Geoffrey Hutchison. 2011. ``Open Babel: An open +chemical toolbox.'' \emph{J. Cheminf.} 3 (1): 33. +\url{https://doi.org/doi:10.1186/1758-2946-3-33}. + +\leavevmode\hypertarget{ref-Ruxfccker2007}{}% +Rücker, C, G Rücker, and M. Meringer. 2007. ``Y-Randomization and Its +Variants in Qspr/Qsar.'' \emph{J. Chem. Inf. Model.}, no. 47: 2345--57. + +\leavevmode\hypertarget{ref-Yap2011}{}% +Yap, CW. 2011. ``PaDEL-Descriptor: An Open Source Software to Calculate +Molecular Descriptors and Fingerprints.'' \emph{Journal of Computational +Chemistry}, no. 32: 1466--74. + +\end{document} |