\documentclass[]{scrartcl}
\usepackage{lmodern}
\usepackage{amssymb,amsmath}
\usepackage{ifxetex,ifluatex}
\usepackage{fixltx2e} % provides \textsubscript
\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex
  \usepackage[T1]{fontenc}
  \usepackage[utf8]{inputenc}
\else % if luatex or xelatex
  \ifxetex
    \usepackage{mathspec}
  \else
    \usepackage{fontspec}
  \fi
  \defaultfontfeatures{Ligatures=TeX,Scale=MatchLowercase}
\fi
% use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
% use microtype if available
\IfFileExists{microtype.sty}{%
\usepackage{microtype}
\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
}{}
\usepackage[unicode=true]{hyperref}
\hypersetup{
            pdftitle={A comparison of random forest, support vector machine, deep learning and lazar algorithms for predicting mutagenicity},
            pdfkeywords={mutagenicity, (Q)SAR, lazar, random forest, support vector machine, deep
learning},
            pdfborder={0 0 0},
            breaklinks=true}
\urlstyle{same}  % don't use monospace font for urls
\usepackage{longtable,booktabs}
\usepackage{graphicx,grffile}
\makeatletter
\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
\makeatother
% Scale images if necessary, so that they will not overflow the page
% margins by default, and it is still possible to overwrite the defaults
% using explicit options in \includegraphics[width, height, ...]{}
\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
\IfFileExists{parskip.sty}{%
\usepackage{parskip}
}{% else
\setlength{\parindent}{0pt}
\setlength{\parskip}{6pt plus 2pt minus 1pt}
}
\setlength{\emergencystretch}{3em}  % prevent overfull lines
\providecommand{\tightlist}{%
  \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
\setcounter{secnumdepth}{0}
% Redefines (sub)paragraphs to behave more like sections
\ifx\paragraph\undefined\else
\let\oldparagraph\paragraph
\renewcommand{\paragraph}[1]{\oldparagraph{#1}\mbox{}}
\fi
\ifx\subparagraph\undefined\else
\let\oldsubparagraph\subparagraph
\renewcommand{\subparagraph}[1]{\oldsubparagraph{#1}\mbox{}}
\fi

\title{A comparison of random forest, support vector machine, deep learning and
lazar algorithms for predicting mutagenicity}
\usepackage{authblk}
\author[%
  1%
  ]{%
  Christoph Helma%
  %
  \textsuperscript{*\,}%
  %%
  %
}
\author[%
  2%
  ]{%
  Verena Schöning%
  %
  %
}
\author[%
  2%
  ]{%
  Philipp Boss%
  %
  %
}
\author[%
  2%
  ]{%
  Jürgen Drewe%
  %
  %
}
\affil[1]{\normalsize in silico toxicology gmbh, \footnotesize Rastatterstrasse 41, 4057 Basel, Switzerland}
\affil[2]{\normalsize Zeller AG, \footnotesize Seeblickstrasse 4, 8590 Romanshorn, Switzerland}
\date{}

\makeatletter
\def\@maketitle{%
  \newpage \null \vskip 2em
  \begin {center}%
    \let \footnote \thanks
         {\LARGE \@title \par}%
         \vskip 1.5em%
                {\large \lineskip .5em%
                  \begin {tabular}[t]{c}%
                    \@author
                  \end {tabular}\par}%
                                                \vskip 0.2em{\textsuperscript{*}\,Correspondence:
                                    Christoph Helma <helma@in-silico.ch>\\
                  }%
                %                \vskip 1em{\large \@date}%
  \end {center}%
  \par
  \vskip 1.5em}
\makeatother

\begin{document}

\maketitle

\begin{abstract}
k-nearest neighbor (\texttt{lazar}), random forest, support vector
machine and deep learning algorithms were applied to a new
\emph{Salmonella} mutagenicity dataset with 8281 unique chemical
structures. Algorithm performance was evaluated using 5-fold
cross-validation. TODO - results - conclusion
\end{abstract}

\hypertarget{introduction}{%
\section{Introduction}\label{introduction}}

TODO: algo history

TODO: dataset history

TODO: open problems

\hypertarget{materials-and-methods}{%
\section{Materials and Methods}\label{materials-and-methods}}

\hypertarget{mutagenicity-data}{%
\subsection{Mutagenicity data}\label{mutagenicity-data}}

For all methods, the same training dataset was used. The training
dataset was compiled from the following sources:

\begin{itemize}
\item
  Kazius/Bursi Dataset (4337 compounds, Kazius, McGuire, and Bursi
  (2005)): \url{http://cheminformatics.org/datasets/bursi/cas_4337.zip}
\item
  Hansen Dataset (6513 compounds, Hansen et al. (2009)):
  \url{http://doc.ml.tu-berlin.de/toxbenchmark/Mutagenicity_N6512.csv}
\item
  EFSA Dataset (695 compounds):
  \url{https://data.europa.eu/euodp/data/storage/f/2017-0719T142131/GENOTOX\%20data\%20and\%20dictionary.xls}
\end{itemize}

Mutagenicity classifications from the Kazius and Hansen datasets were
used without further processing. To achieve consistency with these
datasets, EFSA compounds were classified as mutagenic if at least one
positive result was found for the TA98 or TA100 \emph{Salmonella} strains.

Dataset merges were based on unique SMILES (\emph{Simplified Molecular
Input Line Entry Specification}) strings of the compound structures.
Duplicated experimental data with the same outcome was merged into a
single value, because it is likely that it originated from the same
experiment. Contradictory results were kept as multiple measurements in
the database. The combined training dataset contains 8281 unique
structures.
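
The following pandas sketch illustrates this merging step; the column
names and toy records are placeholders, not the actual data schema:

\begin{verbatim}
import pandas as pd

# One row per assay result: structure (SMILES) and mutagenicity outcome.
records = pd.DataFrame({
    "smiles":    ["c1ccccc1N", "c1ccccc1N", "CCO", "CCO"],
    "mutagenic": [True,        True,        False, True],
})

# Identical structure + identical outcome: assumed to stem from the same
# experiment, collapsed to one entry.  Contradictory outcomes for the
# same structure are kept as separate measurements.
merged = records.drop_duplicates(subset=["smiles", "mutagenic"])
print(merged)
print("unique structures:", merged["smiles"].nunique())
\end{verbatim}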

Source code for all data download, extraction and merge operations is
publicly available from the git repository
\url{https://git.in-silico.ch/pyrrolizidine} under a GPL3 License.

TODO: check/fix git repo

For the Random Forest (RF), Support Vector Machines (SVM), and Deep
Learning (DL) models, molecular descriptors were calculated with the
PaDEL-Descriptors program (\url{http://www.yapcwsoft.com} version 2.21,
Yap (2011)).

TODO: sentence ??

From these descriptors, a subset was chosen that was actually used for
the generation of the DL model.

\hypertarget{algorithms}{%
\subsection{Algorithms}\label{algorithms}}

\hypertarget{lazar}{%
\subsubsection{\texorpdfstring{\texttt{lazar}}{lazar}}\label{lazar}}

\texttt{lazar} (\emph{lazy structure activity relationships}) is a
modular framework for read-across model development and validation. It
follows a simple basic workflow: for a given chemical structure
\texttt{lazar}:

\begin{itemize}
\item
  searches in a database for similar structures (neighbours) with
  experimental data,
\item
  builds a local QSAR model with these neighbours and
\item
  uses this model to predict the unknown activity of the query compound.
\end{itemize}

This procedure resembles an automated version of read-across predictions
in toxicology; in machine learning terms it would be classified as a
k-nearest-neighbour algorithm.

Apart from this basic workflow, \texttt{lazar} is completely modular and
allows the researcher to use any algorithm for similarity searches and
local QSAR (\emph{Quantitative structure--activity relationship})
modelling. Algorithms used within this study are described in the
following sections.

\hypertarget{neighbour-identification}{%
\paragraph{Neighbour identification}\label{neighbour-identification}}

Similarity calculations were based on MolPrint2D fingerprints (Bender et
al. (2004)) from the OpenBabel cheminformatics library (O'Boyle et al.
(2011)). The MolPrint2D fingerprint uses atom environments as molecular
representation, which essentially resembles the chemical concept of
functional groups. For each atom in a molecule, it represents the
chemical environment using the atom types of connected atoms.

MolPrint2D fingerprints are generated dynamically from chemical
structures and do not rely on predefined lists of fragments (such as
OpenBabel FP3, FP4 or MACCS fingerprints or lists of
toxicophores/toxicophobes). This has the advantage that they may capture
substructures of toxicological relevance that are not included in other
fingerprints.

From MolPrint2D fingerprints a feature vector with all atom environments
of a compound can be constructed that can be used to calculate chemical
similarities.

The chemical similarity between two compounds A and B is expressed as
the proportion of the atom environments common to both structures,
\(A \cap B\), relative to the total number of atom environments,
\(A \cup B\) (Jaccard/Tanimoto index).

\[sim = \frac{\left| A\  \cap B \right|}{\left| A\  \cup B \right|}\]
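
For illustration, a minimal Python sketch of this similarity calculation
on two toy sets of atom-environment identifiers (the identifiers below
are placeholders; real MolPrint2D features are generated by OpenBabel):

\begin{verbatim}
def tanimoto(a, b):
    """Jaccard/Tanimoto index between two sets of atom environments."""
    a, b = set(a), set(b)
    return len(a & b) / len(a | b) if (a or b) else 0.0

env_a = {"env-1", "env-2", "env-3"}
env_b = {"env-1", "env-2", "env-4"}
print(tanimoto(env_a, env_b))   # 2 common / 4 total = 0.5
\end{verbatim}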

Threshold selection is a trade-off between prediction accuracy (high
threshold) and the number of predictable compounds (low threshold). As
it is in many practical cases desirable to make predictions even in the
absence of closely related neighbours, we follow a tiered approach:

\begin{itemize}
\item
  First a similarity threshold of 0.5 is used to collect neighbours, to
  create a local QSAR model and to make a prediction for the query
  compound.
\item
  If any of these steps fails, the procedure is repeated with a
  similarity threshold of 0.2 and the prediction is flagged with a
  warning that it might be out of the applicability domain of the
  training data.
\item
  Similarity thresholds of 0.5 and 0.2 are the default values chosen by
  the software developers and remained unchanged during the course of
  these experiments.
\end{itemize}

Compounds with the same structure as the query structure are
automatically eliminated from neighbours to obtain unbiased predictions
in the presence of duplicates.
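
A minimal sketch of the tiered neighbour search and of the removal of
query-identical structures described above (set-based fingerprints as in
the previous sketch; names are illustrative and not part of the
\texttt{lazar} API):

\begin{verbatim}
def neighbours(query_fp, training_set, threshold):
    """Training compounds above a similarity threshold, skipping
    entries with the same structure as the query."""
    hits = []
    for fingerprint, activity in training_set:
        if fingerprint == query_fp:          # duplicate of the query
            continue
        sim = len(query_fp & fingerprint) / len(query_fp | fingerprint)
        if sim >= threshold:
            hits.append((sim, activity))
    return hits

def tiered_neighbours(query_fp, training_set):
    """Try the 0.5 threshold first; fall back to 0.2 with a warning."""
    hits = neighbours(query_fp, training_set, 0.5)
    if hits:
        return hits, None
    hits = neighbours(query_fp, training_set, 0.2)
    return hits, "prediction may be outside the applicability domain"
\end{verbatim}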

\hypertarget{local-qsar-models-and-predictions}{%
\paragraph{Local QSAR models and
predictions}\label{local-qsar-models-and-predictions}}

Only similar compounds (neighbours) above the threshold are used for
local QSAR models. In this investigation, we used a weighted majority
vote from the neighbours' experimental data for mutagenicity
classifications. Probabilities for both classes
(mutagenic/non-mutagenic) are calculated according to the following
formula and the class with the higher probability is used as prediction
outcome.

\[p_{c} = \frac{\sum \text{sim}_{n,c}}{\sum \text{sim}_{n}}\]

\(p_{c}\): probability of class c (e.g.~mutagenic or non-mutagenic)\\
\(\sum \text{sim}_{n,c}\): sum of similarities of neighbours with class c\\
\(\sum \text{sim}_{n}\): sum of similarities of all neighbours
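
A minimal sketch of this weighted majority vote (neighbours are
(similarity, activity) pairs, e.g.~as returned by the tiered search
sketched above):

\begin{verbatim}
def class_probabilities(neighbours):
    """Weighted class probabilities from (similarity, activity) pairs."""
    total = sum(sim for sim, _ in neighbours)
    classes = {activity for _, activity in neighbours}
    return {c: sum(sim for sim, a in neighbours if a == c) / total
            for c in classes}

example = [(0.8, "mutagenic"), (0.6, "mutagenic"), (0.4, "non-mutagenic")]
probs = class_probabilities(example)
print(probs)                      # mutagenic ~0.78, non-mutagenic ~0.22
print(max(probs, key=probs.get))  # predicted class: mutagenic
\end{verbatim}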

\hypertarget{applicability-domain}{%
\paragraph{Applicability domain}\label{applicability-domain}}

The applicability domain (AD) of \texttt{lazar} models is determined by
the structural diversity of the training data. If no similar compounds
are found in the training data, no predictions will be generated.
Warnings are issued if the similarity threshold had to be lowered from
0.5 to 0.2 in order to enable predictions. Predictions without warnings
can be considered as close to the applicability domain and predictions
with warnings as more distant from the applicability domain.
Quantitative applicability domain information can be obtained from the
similarities of individual neighbours.

\hypertarget{availability}{%
\paragraph{Availability}\label{availability}}

\begin{itemize}
\item
  \texttt{lazar} experiments for this manuscript:
  \url{https://git.in-silico.ch/pyrrolizidine} (source code, GPL3)
\item
  \texttt{lazar} framework: \url{https://git.in-silico.ch/lazar} (source
  code, GPL3)
\item
  \texttt{lazar} GUI: \url{https://git.in-silico.ch/lazar-gui} (source
  code, GPL3)
\item
  Public web interface: \url{https://lazar.in-silico.ch}
\end{itemize}

\hypertarget{random-forest-support-vector-machines-and-deep-learning-in-r-project}{%
\subsubsection{Random Forest, Support Vector Machines, and Deep Learning
in
R-project}\label{random-forest-support-vector-machines-and-deep-learning-in-r-project}}

In comparison to \texttt{lazar}, three other models (Random Forest (RF),
Support Vector Machines (SVM), and Deep Learning (DL)) were evaluated.

For the generation of these models, molecular 1D and 2D descriptors of
the training dataset were calculated using PaDEL-Descriptors
(\url{http://www.yapcwsoft.com} version 2.21, Yap (2011)).

As the training dataset contained over 8280 instances, it was decided to
delete instances with missing values during data pre-processing.
Furthermore, substances with equivocal outcome were removed. The final
training dataset contained 8080 instances with known mutagenic
potential. The RF, SVM, and DL models were generated using the R
software (R-project for Statistical Computing,
\url{https://www.r-project.org/}\emph{;} version 3.3.1), specific R
packages used are identified for each step in the description below.
During feature selection, descriptors with near zero variance were
removed using the `\emph{nearZeroVar}'-function (package `caret'). A
descriptor was classified as having near zero variance if the percentage
of the most common value was more than 90\% or if the frequency ratio of
the most common value to the second most common value was greater than
95:5 (e.g.~95 instances of the most common value and only 5 or fewer
instances of the second most common value). After that, highly
correlated descriptors were removed using the
`\emph{findCorrelation}'-function (package `caret') with a cut-off of
0.9. This resulted in a training dataset with 516 descriptors. These
descriptors were scaled to be in the range between 0 and 1 using the
`\emph{preProcess}'-function (package `caret'). The scaling routine was
saved in order to apply the same scaling on the testing dataset. As
these three steps did not consider the outcome, it was decided that they
do not need to be included in the cross-validation of the model. To
further reduce the number of features, a LASSO (\emph{least absolute
shrinkage and selection operator}) regression was performed using the
`\emph{glmnet}'-function (package `\emph{glmnet}'). The reduced dataset
was used for the generation of the pre-trained models.
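
The following Python/scikit-learn sketch illustrates the same sequence
of pre-processing steps (near-zero-variance filter, correlation filter,
scaling to [0, 1], LASSO); it is an analogue of the R functions named
above, not the code used in the study:

\begin{verbatim}
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

def near_zero_variance(col):
    """Most common value covers > 90 % of entries, or ratio of the two
    most frequent values > 95:5 (criterion as described in the text)."""
    counts = col.value_counts()
    if len(counts) < 2:
        return True
    return (counts.iloc[0] / len(col) > 0.90
            or counts.iloc[0] / counts.iloc[1] > 95 / 5)

def select_features(X: pd.DataFrame, y):
    # 1. remove near-zero-variance descriptors
    X = X.loc[:, [not near_zero_variance(X[c]) for c in X.columns]]
    # 2. remove one of each pair of descriptors with |r| > 0.9
    corr = X.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    X = X.drop(columns=[c for c in upper.columns if (upper[c] > 0.9).any()])
    # 3. scale descriptors to [0, 1]; keep the scaler for the test data
    scaler = MinMaxScaler().fit(X)
    X = pd.DataFrame(scaler.transform(X), columns=X.columns)
    # 4. LASSO (L1-penalised logistic regression) to shrink the feature set
    lasso = SelectFromModel(
        LogisticRegression(penalty="l1", solver="liblinear")).fit(X, y)
    return X.loc[:, lasso.get_support()], scaler
\end{verbatim}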

For the RF model, the `\emph{randomForest}'-function (package
`\emph{randomForest}') was used. A forest with 1000 trees and a maximum
of 200 terminal nodes was grown for the prediction.

The `\emph{svm}'-function (package `e1071') with a \emph{radial basis
function kernel} was used for the SVM model.
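
For illustration, scikit-learn analogues of the two calls described
above, applied to a toy dataset (this is not the R code used in the
study):

\begin{verbatim}
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Toy stand-in for the LASSO-reduced descriptor matrix.
X, y = make_classification(n_samples=500, n_features=100, random_state=0)

# Analogue of randomForest(ntree = 1000, maxnodes = 200).
rf = RandomForestClassifier(n_estimators=1000, max_leaf_nodes=200,
                            n_jobs=-1, random_state=0).fit(X, y)

# Analogue of e1071::svm() with a radial basis function kernel.
svm = SVC(kernel="rbf", probability=True, random_state=0).fit(X, y)
\end{verbatim}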

The DL model was generated using the `\emph{h2o.deeplearning}'-function
(package `\emph{h2o}'). The DL model contained four hidden layers with
70, 50, 50, and 10 neurons, respectively. Other hyperparameters were set
as follows: l1=1.0E-7, l2=1.0E-11, epsilon = 1.0E-10, rho = 0.8, and
quantile\_alpha = 0.5. For all other hyperparameters, the default values
were used. Weights and biases were in a first step determined with an
unsupervised DL model. These values were then used for the actual,
supervised DL model.
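
A sketch of this two-step procedure using the Python interface of the
same \texttt{h2o} backend is given below (the study used the R
interface; the file name, outcome column and the wiring of the
pretrained autoencoder are assumptions for illustration only):

\begin{verbatim}
import h2o
from h2o.estimators import H2ODeepLearningEstimator

h2o.init()
# "train.csv" with an "outcome" column is a placeholder for the
# LASSO-reduced descriptor table used in the study.
frame = h2o.import_file("train.csv")
frame["outcome"] = frame["outcome"].asfactor()
predictors = [c for c in frame.columns if c != "outcome"]

# Step 1: unsupervised autoencoder to determine initial weights and biases.
pretrain = H2ODeepLearningEstimator(hidden=[70, 50, 50, 10], autoencoder=True)
pretrain.train(x=predictors, training_frame=frame)

# Step 2: supervised model initialised from the autoencoder, with the
# hyperparameters reported in the text (quantile_alpha = 0.5 was also set).
dl = H2ODeepLearningEstimator(hidden=[70, 50, 50, 10],
                              l1=1e-7, l2=1e-11, epsilon=1e-10, rho=0.8,
                              pretrained_autoencoder=pretrain.model_id)
dl.train(x=predictors, y="outcome", training_frame=frame)
\end{verbatim}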

To validate these models, an internal cross-validation approach was
chosen. The training dataset was randomly split into training data,
which contained 95\% of the data, and validation data, which contained
5\% of the data. A feature selection with LASSO on the training data was
performed, reducing the number of descriptors to approximately 100. This
step was repeated five times. Based on each of the five different
training datasets, the predictive models were trained and their
performance tested with the validation data. This step was repeated 10 times.
Furthermore, a y-randomisation using the RF model was performed. During
y-randomisation, the outcome (y-variable) is randomly permuted. The
theory is that after randomisation of the outcome, the model should not
be able to correlate the outcome to the properties (descriptor values)
of the substances. The performance of the model should therefore
indicate a by-chance prediction with an accuracy of about 50\%. If this
is true, it can be concluded that correlation between actual outcome and
properties of the substances is real and not by chance (Rücker, Rücker,
and Meringer (2007)).
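
A minimal illustration of the y-randomisation check on a toy dataset
(scikit-learn; the study used the RF model and the real descriptors):

\begin{verbatim}
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=500, n_features=50, random_state=0)
y_random = np.random.default_rng(0).permutation(y)  # break the X-y link

rf = RandomForestClassifier(n_estimators=200, random_state=0)
real = cross_val_score(rf, X, y, cv=5).mean()
rand = cross_val_score(rf, X, y_random, cv=5).mean()

# With permuted outcomes the accuracy should drop to about 50 %,
# showing that the original correlation was not obtained by chance.
print(f"original: {real:.2f}   y-randomised: {rand:.2f}")
\end{verbatim}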

\includegraphics[width=6.26875in,height=5.48611in]{media/image1.png}

Figure 1: Flowchart of the generation and validation of the models
generated in R-project

\hypertarget{applicability-domain-1}{%
\paragraph{Applicability domain}\label{applicability-domain-1}}

The AD of the training dataset and the pyrrolizidine alkaloid (PA)
dataset was evaluated using
the Jaccard distance. A Jaccard distance of `0' indicates that the
substances are similar, whereas a value of `1' shows that the substances
are different. The Jaccard distance was below 0.2 for all PAs relative
to the training dataset. Therefore, the PA dataset is within the AD of the
training dataset and the models can be used to predict the genotoxic
potential of the PA dataset.

\hypertarget{y-randomisation}{%
\paragraph{y-randomisation}\label{y-randomisation}}

After y-randomisation of the outcome, the accuracy and CCR are around
50\%, indicating a chance distribution of the results. This shows that
the outcome is actually related to the predictors and not correlated by
chance.

\hypertarget{deep-learning-in-tensorflow}{%
\subsubsection{Deep Learning in
TensorFlow}\label{deep-learning-in-tensorflow}}

Alternatively, a DL model was established with the Python-based
TensorFlow framework (\url{https://www.tensorflow.org/}), using the
high-level API Keras (\url{https://www.tensorflow.org/guide/keras}) to
build the models.

Data pre-processing was done by rank transformation using the
`\emph{QuantileTransformer}' procedure. A sequential model with four
layers was used: an input layer, two hidden layers (with 12, 8 and 8
nodes, respectively) and one output layer. A sigmoidal activation
function was used for the output layer and the ReLU (`\emph{Rectified
Linear Unit}') activation function for all other layers. Additionally,
an \(L_{2}\)-penalty of 0.001 was applied to the input layer. For
training of the model, the Adam algorithm was used to minimise the
cross-entropy loss with the default parameters of Keras. Training was
performed for 100 epochs with a batch size of 64. The model was
implemented with Python 3.6 and Keras. For model validation, a 6-fold
cross-validation was used. Accuracy was estimated by ROC-AUC and the
confusion matrix.
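
A sketch of this network in Keras is shown below (toy input data; layer
sizes, regularisation, optimiser and training settings follow the
description above, but this is an illustration, not the original
script):

\begin{verbatim}
import numpy as np
from sklearn.preprocessing import QuantileTransformer
from tensorflow import keras
from tensorflow.keras import layers, regularizers

# Toy stand-in for the descriptor matrix and mutagenicity labels.
X = np.random.rand(1000, 50)
y = np.random.randint(0, 2, size=1000)

# Rank transformation of the inputs.
X = QuantileTransformer().fit_transform(X)

model = keras.Sequential([
    keras.Input(shape=(X.shape[1],)),
    layers.Dense(12, activation="relu",
                 kernel_regularizer=regularizers.l2(0.001)),  # L2 penalty
    layers.Dense(8, activation="relu"),
    layers.Dense(8, activation="relu"),
    layers.Dense(1, activation="sigmoid"),
])
model.compile(optimizer="adam", loss="binary_crossentropy",
              metrics=["accuracy", keras.metrics.AUC(name="auc")])
model.fit(X, y, epochs=100, batch_size=64, verbose=0)
\end{verbatim}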

\hypertarget{validation}{%
\subsection{Validation}\label{validation}}

\hypertarget{results}{%
\section{Results}\label{results}}

\hypertarget{lazar-1}{%
\subsection{\texorpdfstring{\texttt{lazar}}{lazar}}\label{lazar-1}}

\hypertarget{random-forest}{%
\subsection{Random Forest}\label{random-forest}}

The validation showed that the RF model has an accuracy of 64\%, a
sensitivity of 66\% and a specificity of 63\%. The confusion matrix of
the model, calculated for 8080 instances, is provided in Table 1.

Table 1: Confusion matrix of the RF model

\begin{longtable}[]{@{}lllll@{}}
\toprule
& Predicted genotoxicity & & &\tabularnewline
\midrule
\endhead
Measured genotoxicity & & \textbf{\emph{PP}} & \textbf{\emph{PN}} &
\textbf{\emph{Total}}\tabularnewline
& \textbf{\emph{MP}} & 2274 & 1163 & 3437\tabularnewline
& \textbf{\emph{MN}} & 1736 & 2907 & 4643\tabularnewline
& \textbf{\emph{Total}} & 4010 & 4070 & 8080\tabularnewline
\bottomrule
\end{longtable}

PP: Predicted positive; PN: Predicted negative; MP: Measured positive;
MN: Measured negative
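
For illustration, the reported performance measures can be recomputed
from the cell counts in Table 1:

\begin{verbatim}
# Table 1: rows are the measured outcome, columns the predicted outcome.
tp, fn = 2274, 1163   # measured positive: predicted positive / negative
fp, tn = 1736, 2907   # measured negative: predicted positive / negative

accuracy    = (tp + tn) / (tp + fn + fp + tn)   # 0.641
sensitivity = tp / (tp + fn)                    # 0.662
specificity = tn / (tn + fp)                    # 0.626
ccr         = (sensitivity + specificity) / 2   # 0.644 (correct classification rate)
print(accuracy, sensitivity, specificity, ccr)
\end{verbatim}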

\hypertarget{support-vector-machines}{%
\subsection{Support Vector Machines}\label{support-vector-machines}}

The validation showed that the SVM model has an accuracy of 62\%, a
sensitivity of 65\% and a specificity of 60\%. The confusion matrix of
the SVM model, calculated for 8080 instances, is provided in Table 2.

Table 2: Confusion matrix of the SVM model

\begin{longtable}[]{@{}lllll@{}}
\toprule
& Predicted genotoxicity & & &\tabularnewline
\midrule
\endhead
Measured genotoxicity & & \textbf{\emph{PP}} & \textbf{\emph{PN}} &
\textbf{\emph{Total}}\tabularnewline
& \textbf{\emph{MP}} & 2057 & 1107 & 3164\tabularnewline
& \textbf{\emph{MN}} & 1953 & 2963 & 4916\tabularnewline
& \textbf{\emph{Total}} & 4010 & 4070 & 8080\tabularnewline
\bottomrule
\end{longtable}

PP: Predicted positive; PN: Predicted negative; MP: Measured positive;
MN: Measured negative

\hypertarget{deep-learning-r-project}{%
\subsection{Deep Learning (R-project)}\label{deep-learning-r-project}}

The validation showed that the DL model generated in R has an accuracy
of 59\%, a sensitivity of 89\% and a specificity of 30\%. The confusion
matrix of the model, normalised to 8080 instances, is provided in Table
3.

Table 3: Confusion matrix of the DL model (R-project)

\begin{longtable}[]{@{}lllll@{}}
\toprule
& Predicted genotoxicity & & &\tabularnewline
\midrule
\endhead
Measured genotoxicity & & \textbf{\emph{PP}} & \textbf{\emph{PN}} &
\textbf{\emph{Total}}\tabularnewline
& \textbf{\emph{MP}} & 3575 & 435 & 4010\tabularnewline
& \textbf{\emph{MN}} & 2853 & 1217 & 4070\tabularnewline
& \textbf{\emph{Total}} & 6428 & 1652 & 8080\tabularnewline
\bottomrule
\end{longtable}

PP: Predicted positive; PN: Predicted negative; MP: Measured positive;
MN: Measured negative

\hypertarget{dl-model-tensorflow}{%
\subsection{DL model (TensorFlow)}\label{dl-model-tensorflow}}

The validation showed that the DL model generated in TensorFlow has an
accuracy of 68\%, a sensitivity of 70\% and a specificity of 46\%. The
confusion matrix of the model, normalised to 8080 instances, is provided
in Table 4.

Table 4: Confusion matrix of the DL model (TensorFlow)

\begin{longtable}[]{@{}lllll@{}}
\toprule
& Predicted genotoxicity & & &\tabularnewline
\midrule
\endhead
Measured genotoxicity & & \textbf{\emph{PP}} & \textbf{\emph{PN}} &
\textbf{\emph{Total}}\tabularnewline
& \textbf{\emph{MP}} & 2851 & 1227 & 4078\tabularnewline
& \textbf{\emph{MN}} & 1825 & 2177 & 4002\tabularnewline
& \textbf{\emph{Total}} & 4676 & 3404 & 8080\tabularnewline
\bottomrule
\end{longtable}

PP: Predicted positive; PN: Predicted negative; MP: Measured positive;
MN: Measured negative

The ROC curves from the 6-fold validation are shown in Figure 7.

\includegraphics[width=3.825in,height=2.7327in]{media/image7.png}

Figure 7: Six-fold cross-validation of the TensorFlow DL model shows an
average area under the ROC curve (ROC-AUC; a measure of accuracy) of 68\%.

In summary, the validation results of the four methods are presented in
the following table.

Table 5: Results of the cross-validation of the four models and after
y-randomisation

\begin{longtable}[]{@{}lllll@{}}
\toprule
\begin{minipage}[b]{0.28\columnwidth}\raggedright
\strut
\end{minipage} & \begin{minipage}[b]{0.13\columnwidth}\raggedright
Accuracy\strut
\end{minipage} & \begin{minipage}[b]{0.09\columnwidth}\raggedright
CCR\strut
\end{minipage} & \begin{minipage}[b]{0.16\columnwidth}\raggedright
Sensitivity\strut
\end{minipage} & \begin{minipage}[b]{0.16\columnwidth}\raggedright
Specificity\strut
\end{minipage}\tabularnewline
\midrule
\endhead
\begin{minipage}[t]{0.28\columnwidth}\raggedright
RF model\strut
\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright
64.1\%\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\raggedright
64.4\%\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
66.2\%\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
62.6\%\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.28\columnwidth}\raggedright
SVM model\strut
\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright
62.1\%\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\raggedright
62.6\%\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
65.0\%\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
60.3\%\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.28\columnwidth}\raggedright
DL model\\
(R-project)\strut
\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright
59.3\%\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\raggedright
59.5\%\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
89.2\%\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
29.9\%\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.28\columnwidth}\raggedright
DL model (TensorFlow)\strut
\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright
68\%\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\raggedright
62.2\%\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
69.9\%\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
45.6\%\strut
\end{minipage}\tabularnewline
\begin{minipage}[t]{0.28\columnwidth}\raggedright
y-randomisation\strut
\end{minipage} & \begin{minipage}[t]{0.13\columnwidth}\raggedright
50.5\%\strut
\end{minipage} & \begin{minipage}[t]{0.09\columnwidth}\raggedright
50.4\%\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
50.3\%\strut
\end{minipage} & \begin{minipage}[t]{0.16\columnwidth}\raggedright
50.6\%\strut
\end{minipage}\tabularnewline
\bottomrule
\end{longtable}

CCR: correct classification rate

\hypertarget{discussion}{%
\section{Discussion}\label{discussion}}

\hypertarget{general-model-performance}{%
\subsection{General model performance}\label{general-model-performance}}

Based on the results of the cross-validation for all models
(\texttt{lazar}, RF, SVM, DL (R-project) and DL (TensorFlow)) it can be
stated that the prediction results are not optimal, for several reasons.
The accuracy as measured during cross-validation of the four models (RF,
SVM, DL (R-project and TensorFlow)) was rather low, with values between
59.3\% and 68\%, with the R-generated DL model and the
TensorFlow-generated DL model showing the worst and the best
performance, respectively. The validation of the R-generated DL model
revealed a high sensitivity (89.2\%) but an unacceptably low specificity
of 29.9\%, indicating a high number of false positive estimates. The
TensorFlow-generated DL model, however, showed an acceptable but not
optimal accuracy of 68\%, a sensitivity of 69.9\% and a specificity of
45.6\%. The low specificity indicates that both DL models tend to
predict too many instances as positive (genotoxic) and therefore have a
high false positive rate. This allows, at least with the
TensorFlow-generated DL model, group statements to be made, but the
confidence for estimations of individual PAs appears to be insufficient.

Several factors have likely contributed to the low to moderate
performance of the methods used, as shown during the cross-validation:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  The outcome in the training dataset was based on the results of Ames
  tests for genotoxicity (\protect\hyperlink{_ENREF_63}{ICH 2011}), an
  \emph{in vitro} test in different strains of the bacterium
  \emph{Salmonella typhimurium}. In this test, mutagenicity is evaluated
  with and without prior metabolic activation of the test substance.
  Metabolic activation could result in the formation of genotoxic
  metabolites from non-genotoxic parent compounds. However, no
  distinction was made in the training dataset between substances that
  needed metabolic activation before being mutagenic and those that were
  mutagenic without metabolic activation. \texttt{lazar} is able to
  handle this `inaccuracy' in the training dataset well due to the way
  the algorithm works: \texttt{lazar} predicts the genotoxic potential
  based on the neighbours of substances with comparable structural
  features, considering both mutagenic and non-mutagenic neighbours.
  Based on the structural similarity, probabilities for mutagenicity and
  non-mutagenicity are calculated independently of each other (meaning
  that the probabilities do not necessarily add up to 100\%). The class
  with the higher probability is then the overall outcome for the
  substance.
\end{enumerate}

\begin{quote}
In contrast, the other models need to be trained first to recognise the
structural features that are responsible for genotoxicity. Therefore,
the mixture of substances being mutagenic with and without metabolic
activation in the training dataset may have adversely affected the
ability to separate the dataset into two distinct classes and thus
explains the relatively low performance of these models.
\end{quote}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
  Machine learning algorithms try to find an optimised solution in a
  high-dimensional space (one dimension per predictor). Sometimes these
  methods do not find the global optimum but only local (suboptimal)
  solutions. One strategy to approach the global optimum is the
  systematic variation (grid search) of the hyperparameters of the
  methods (a minimal sketch is given after this list), which can be very
  time-consuming, in particular for large datasets.
\end{enumerate}
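
As an illustration of such a systematic variation, a minimal grid search
over two hyperparameters of a toy random forest (scikit-learn; the grid
values are arbitrary and far smaller than a thorough search would
require):

\begin{verbatim}
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=500, n_features=50, random_state=0)

grid = {"n_estimators": [200, 1000], "max_leaf_nodes": [50, 200, None]}
search = GridSearchCV(RandomForestClassifier(random_state=0), grid,
                      cv=5, scoring="accuracy", n_jobs=-1).fit(X, y)
print(search.best_params_, search.best_score_)
\end{verbatim}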

\hypertarget{conclusions}{%
\section{Conclusions}\label{conclusions}}

In this study, an attempt was made to predict the genotoxic potential of
PAs using five different machine learning techniques (\texttt{lazar},
RF, SVM, DL (R-project) and DL (TensorFlow)). The results of all models
agreed only partly with the findings in the literature, with the best
results obtained with the TensorFlow DL model. Therefore, modelling
allows statements on the relative risks of genotoxicity of the different
PA groups. Individual predictions for single PAs, however, do not appear
reliable on the basis of the current training dataset.

This study emphasises the importance of critical assessment of
predictions by QSAR models. This includes not only extensive literature
research to assess the plausibility of the predictions, but also a good
knowledge of the metabolism of the test substances and an understanding
of possible mechanisms of toxicity.

In further studies, additional machine learning techniques or a modified
(extended) training dataset should be used for an additional attempt to
predict the genotoxic potential of PAs.

\hypertarget{references}{%
\section*{References}\label{references}}
\addcontentsline{toc}{section}{References}

\hypertarget{refs}{}
\leavevmode\hypertarget{ref-Bender2004}{}%
Bender, Andreas, Hamse Y. Mussa, Robert C. Glen, and Stephan Reiling.
2004. ``Molecular Similarity Searching Using Atom Environments,
Information-Based Feature Selection, and a Naïve Bayesian Classifier.''
\emph{Journal of Chemical Information and Computer Sciences} 44 (1):
170--78. \url{https://doi.org/10.1021/ci034207y}.

\leavevmode\hypertarget{ref-Hansen2009}{}%
Hansen, Katja, Sebastian Mika, Timon Schroeter, Andreas Sutter, Antonius
ter Laak, Thomas Steger-Hartmann, Nikolaus Heinrich, and Klaus-Robert
Müller. 2009. ``Benchmark Data Set for in Silico Prediction of Ames
Mutagenicity.'' \emph{Journal of Chemical Information and Modeling} 49
(9): 2077--81. \url{https://doi.org/10.1021/ci900161g}.

\leavevmode\hypertarget{ref-Kazius2005}{}%
Kazius, J., R. McGuire, and R. Bursi. 2005. ``Derivation and Validation
of Toxicophores for Mutagenicity Prediction.'' \emph{J Med Chem}, no.
48: 312--20.

\leavevmode\hypertarget{ref-OBoyle2011a}{}%
O'Boyle, Noel, Michael Banck, Craig James, Chris Morley, Tim
Vandermeersch, and Geoffrey Hutchison. 2011. ``Open Babel: An open
chemical toolbox.'' \emph{J. Cheminf.} 3 (1): 33.
\url{https://doi.org/10.1186/1758-2946-3-33}.

\leavevmode\hypertarget{ref-Ruxfccker2007}{}%
Rücker, C, G Rücker, and M. Meringer. 2007. ``Y-Randomization and Its
Variants in QSPR/QSAR.'' \emph{J. Chem. Inf. Model.}, no. 47: 2345--57.

\leavevmode\hypertarget{ref-Yap2011}{}%
Yap, CW. 2011. ``PaDEL-Descriptor: An Open Source Software to Calculate
Molecular Descriptors and Fingerprints.'' \emph{Journal of Computational
Chemistry}, no. 32: 1466--74.

\end{document}