From e12b47c62046d099c6ac90a9a9e01942ba2c0a98 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Tue, 6 Oct 2020 23:35:58 +0200 Subject: methods and results updated --- Gemfile | 2 +- Gemfile.lock | 2 - figures/roc.png | Bin 76960 -> 75787 bytes gemset.nix | 12 +-- mutagenicity.md | 272 +++++++++++++++++++++++++++++++++---------------------- mutagenicity.pdf | Bin 0 -> 1739347 bytes 6 files changed, 167 insertions(+), 121 deletions(-) create mode 100644 mutagenicity.pdf diff --git a/Gemfile b/Gemfile index bf0d961..7659d00 100644 --- a/Gemfile +++ b/Gemfile @@ -1,4 +1,4 @@ source 'https://rubygems.org' do gem 'mustache' - gem 'openbabel' + #gem 'openbabel' end diff --git a/Gemfile.lock b/Gemfile.lock index e82d9b2..298fd25 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -2,14 +2,12 @@ GEM remote: https://rubygems.org/ specs: mustache (1.1.1) - openbabel (2.4.90.0) PLATFORMS ruby DEPENDENCIES mustache! - openbabel! BUNDLED WITH 2.1.4 diff --git a/figures/roc.png b/figures/roc.png index 4e8f388..a7cb04f 100644 Binary files a/figures/roc.png and b/figures/roc.png differ diff --git a/gemset.nix b/gemset.nix index 18adbb1..a1bca28 100644 --- a/gemset.nix +++ b/gemset.nix @@ -9,14 +9,4 @@ }; version = "1.1.1"; }; - openbabel = { - groups = ["default"]; - platforms = []; - source = { - remotes = ["https://rubygems.org"]; - sha256 = "0fancy2yh4y52ywkn0j2v9sd9n854l2gir4a4xl7s07wvg8l52s6"; - type = "gem"; - }; - version = "2.4.90.0"; - }; -} \ No newline at end of file +} diff --git a/mutagenicity.md b/mutagenicity.md index 321a460..418c2d1 100644 --- a/mutagenicity.md +++ b/mutagenicity.md @@ -1,5 +1,8 @@ --- -title: A comparison of random forest, support vector machine, linear regression, deep learning and lazar algorithms for predicting the mutagenic potential of different pyrrolizidine alkaloids +title: A comparison of twelve machine learning models based on an expanded mutagenicity dataset and their application for predicting pyrrolizidine alkaloid mutagenicity +# 
TODO check # algorithms + +#title: A comparison of random forest, support vector machine, linear regression, deep learning and lazar algorithms for predicting the mutagenic potential of different pyrrolizidine alkaloids #subtitle: Performance comparison with a new expanded dataset author: - Christoph Helma: @@ -38,17 +41,20 @@ header-includes: Abstract ======== -Random forest, support vector machine, deep learning and k-nearest neighbor + Introduction ============ TODO + + + The main objectives of this study were - to generate a new training dataset, by combining the most comprehensive public mutagenicity datasets - - to compare the performance of global models (RF, SVM, Neural Nets) with local models (`lazar`) - - to apply these models for the prediction of the mutagenicity of pyrrolizidine alkaloids + - to compare the performance of global models (RF, SVM, LR, NN) with local models (`lazar`) + - to compare the performance of MolPrint2D fingerprints with PaDEL descriptors + - to apply these models for the prediction of pyrrolizidine alkaloid mutagenicity Materials and Methods ===================== @@ -145,11 +161,16 @@ available from the git repository under a GPL3 License. The new combined dataset can be found at . -### Pyrrolizidine dataset +### Pyrrolizidine alkaloid (PA) dataset The testing dataset consisted of 602 different PAs. The compilation of the PA dataset is described in detail in [Schöning et al. -(2017)](#_ENREF_119). The PAs were assigned to groups according to +(2017)](#_ENREF_119). + +TODO: **Verena** Quellen und Auswahlkriterien + + + +Descriptors +----------- + +### MolPrint2D fingerprints (*MP2D*) + +MolPrint2D fingerprints (@OBoyle2011a) use atom environments as molecular +representation. They determine for each atom in a molecule, the atom types of +its connected atoms to represent their chemical environment. This resembles +basically the chemical concept of functional groups. + +In contrast to predefined lists of fragments (e.g. 
FP3, FP4 or MACCs +fingerprints) or descriptors (e.g. PaDEL) they are generated dynamically from +chemical structures. This has the advantage that they can capture substructures +of toxicological relevance that are not included in other descriptors. + +Chemical similarities (e.g. Tanimoto indices) can be calculated very +efficiently with MolPrint2D fingerprints. Using them as descriptors for global +models leads, however, to huge, sparsely populated matrices that cannot be +handled with traditional machine learning algorithms. In our experiments none +of the R and Tensorflow algorithms was capable of using them as descriptors. + +MolPrint2D fingerprints were calculated with the OpenBabel cheminformatics +library (@OBoyle2011a). + +### PaDEL descriptors + +For R and Tensorflow models, molecular 1D and 2D descriptors were calculated +with the PaDEL-Descriptors program ( version 2.21, @Yap2011). + +As the training dataset contained over 8309 instances, it was decided to +delete instances with missing values during data pre-processing. +Furthermore, substances with equivocal outcome were removed. The final +training dataset contained 8080 instances with known mutagenic +potential. + +During feature +selection, descriptors with near zero variance were removed using the +'*nearZeroVar*'-function (package 'caret'). If the percentage of the +most common value was more than 90% or when the frequency ratio of the +most common value to the second most common value was greater than 95:5 +(e.g. 95 instances of the most common value and only 5 or fewer instances +of the second most common value), a descriptor was classified as having +a near zero variance. After that, highly correlated descriptors were +removed using the '*findCorrelation*'-function (package 'caret') with a +cut-off of 0.9. This resulted in a training dataset with 516 +descriptors. These descriptors were scaled to be in the range between 0 +and 1 using the '*preProcess*'-function (package 'caret'). 
The scaling +routine was saved in order to apply the same scaling on the testing +dataset. As these three steps did not consider the outcome, it was +decided that they do not need to be included in the cross-validation of +the model. To further reduce the number of features, a LASSO (*least +absolute shrinkage and selection operator*) regression was performed +using the '*glmnet*'-function (package '*glmnet*'). The reduced dataset +was used for the generation of the pre-trained models. Algorithms ---------- @@ -207,30 +283,22 @@ sections. #### Neighbour identification -Similarity calculations were based on MolPrint2D fingerprints (*MP2D*, -@Bender2004) from the OpenBabel cheminformatics library (@OBoyle2011a). The -MolPrint2D fingerprint uses atom environments as molecular representation, -which resembles basically the chemical concept of functional groups. For each -atom in a molecule, it represents the chemical environment using the atom types -of connected atoms. +Utilizing this modularity, similarity calculations were based both on +MolPrint2D fingerprints and on PaDEL descriptors. -MolPrint2D fingerprints are generated dynamically from chemical -structures and do not rely on predefined lists of fragments (such as -OpenBabel FP3, FP4 or MACCs fingerprints or lists of -toxicophores/toxicophobes). This has the advantage that they may capture -substructures of toxicological relevance that are not included in other -fingerprints. +For MolPrint2D fingerprints chemical similarity between two compounds $a$ and +$b$ is expressed as the proportion between atom environments common in both +structures $A \cap B$ and the total number of atom environments $A \cup B$ +(Jaccard/Tanimoto index). -From MolPrint2D fingerprints a feature vector with all atom environments -of a compound can be constructed that can be used to calculate chemical -similarities. 
+$$sim = \frac{\lvert A\ \cap B \rvert}{\lvert A\ \cup B \rvert}$$ -The chemical similarity between two compounds $a$ and $b$ is expressed as -the proportion between atom environments common in both structures $A \cap B$ -and the total number of atom environments $A \cup B$ (Jaccard/Tanimoto -index). +For PaDEL descriptors chemical similarity between two compounds $a$ and $b$ is +expressed as the cosine similarity between the descriptor vectors $A$ for $a$ +and $B$ for $b$. + +$$sim = \frac{A \cdot B}{\lvert A \rvert \lvert B \rvert}$$ -$$sim = \frac{\left| A\ \cap B \right|}{\left| A\ \cup B \right|}$$ Threshold selection is a trade-off between prediction accuracy (high threshold) and the number of predictable compounds (low threshold). As @@ -302,44 +370,10 @@ of individual neighbours. ### R Random Forest, Support Vector Machines, and Deep Learning -#### PaDEL descriptors - -For Random Forest (RF), Support Vector Machines (SVM), and Deep -Learning (DL) models, molecular descriptors were calculated -with the PaDEL-Descriptors program ( version 2.21, @Yap2011). The same descriptors were used for TensorFlow models. - -TODO: **Verena** kannst Du bitte die PaDEL Deskriptoren etwas ausfuehrlicher beschreiben (welche Typen, Anzahl, Bedeutung etc) - -For the generation of these models, molecular 1D and 2D descriptors of -the training dataset were calculated using PaDEL-Descriptors ( version -2.21, @Yap2011). - -As the training dataset contained over 8309 instances, it was decided to -delete instances with missing values during data pre-processing. -Furthermore, substances with equivocal outcome were removed. The final -training dataset contained 8080 instances with known mutagenic -potential. The RF, SVM, and DL models were generated using the R +The RF, SVM, and DL models were generated using the R software (R-project for Statistical Computing, *;* version 3.3.1), specific R packages used -are identified for each step in the description below. 
During feature -selection, descriptor with near zero variance were removed using -'*NearZeroVar*'-function (package 'caret'). If the percentage of the -most common value was more than 90% or when the frequency ratio of the -most common value to the second most common value was greater than 95:5 -(e.g. 95 instances of the most common value and only 5 or less instances -of the second most common value), a descriptor was classified as having -a near zero variance. After that, highly correlated descriptors were -removed using the '*findCorrelation*'-function (package 'caret') with a -cut-off of 0.9. This resulted in a training dataset with 516 -descriptors. These descriptors were scaled to be in the range between 0 -and 1 using the '*preProcess*'-function (package 'caret'). The scaling -routine was saved in order to apply the same scaling on the testing -dataset. As these three steps did not consider the outcome, it was -decided that they do not need to be included in the cross-validation of -the model. To further reduce the number of features, a LASSO (*least -absolute shrinkage and selection operator*) regression was performed -using the '*glmnet*'-function (package '*glmnet*'). The reduced dataset -was used for the generation of the pre-trained models. +are identified for each step in the description below. #### Random Forest @@ -378,6 +412,8 @@ validation data. This step was repeated 10 times. #### Applicability domain +TODO: **Verena**: Mit welchen Deskriptoren hast Du den Jaccard index berechnet? Fuer den Jaccard index braucht man binaere Deskriptoren (zB MP2D), mit PaDEL Deskriptoren koennte man zB eine euklidische oder cosinus Distanz berechnen. + The AD of the training dataset and the PA dataset was evaluated using the Jaccard distance. A Jaccard distance of '0' indicates that the substances are similar, whereas a value of '1' shows that the substances @@ -386,7 +422,13 @@ to the training dataset. 
Therefore, PA dataset is within the AD of the training dataset and the models can be used to predict the genotoxic potential of the PA dataset. -### TensorFlow models +#### Availability + +R scripts for these experiments can be found in https://git.in-silico.ch/mutagenicity-paper/scripts/R. + +### Tensorflow models + +TODO: **Philipp** bitte ergaenzen #### Logistic regression (SGD) @@ -396,11 +438,11 @@ potential of the PA dataset. #### Deep Learning -Alternatively, a DL model was established with Python-based TensorFlow +Alternatively, a DL model was established with Python-based Tensorflow program () using the high-level API Keras () to build the models. -TensorFlow models used the same PaDEL descriptors as the R models. +Tensorflow models used the same PaDEL descriptors as the R models. Data pre-processing was done by rank transformation using the '*QuantileTransformer*' procedure. A sequential model has been used. @@ -412,8 +454,7 @@ a L^2^-penalty of 0.001 was used for the input layer. For training of the model, the ADAM algorithm was used to minimise the cross-entropy loss using the default parameters of Keras. Training was performed for 100 epochs with a batch size of 64. The model was implemented with -Python 3.6 and Keras. For training of the model, a 10-fold -cross-validation was used. +Python 3.6 and Keras. TODO: **Philipp** kannst Du bitte ueberpruefen ob die Beschreibung noch stimmt und ob der Ablauf von Verena (Figure 1) auch fuer Deine Modelle gilt @@ -421,11 +462,41 @@ und ob der Ablauf von Verena (Figure 1) auch fuer Deine Modelle gilt Validation ---------- +10-fold cross-validation was used for all Tensorflow models. + +#### Availability + +Jupyter notebooks for these experiments can be found in https://git.in-silico.ch/mutagenicity-paper/scripts/tensorflow. 
+ Results ======= -TODO: **Verena** und **Philipp**: koennt Ihr bitte gegenchecken, ob ich keine Zahlendreher in den Ergebnissen habe +10-fold crossvalidations +------------------------ + +Crossvalidation results are summarized in the following tables: @tbl:lazar shows `lazar` results with MolPrint2D and PaDEL descriptors, @tbl:R summarizes R results and @tbl:tensorflow shows Tensorflow results. + +@fig:roc depicts the position of all crossvalidation results in receiver operating characteristic (ROC) space. +Confusion matrices for all models are available from the git repository http://git.in-silico.ch/mutagenicity-paper/10-fold-crossvalidations/confusion-matrices/; individual predictions can be found in +http://git.in-silico.ch/mutagenicity-paper/10-fold-crossvalidations/predictions/. + +The most accurate crossvalidation predictions have been obtained with `lazar` models with MolPrint2D descriptors ({{lazar-high-confidence.acc}} for predictions with high confidence, {{lazar-all.acc}} for all predictions). Models utilizing PaDEL descriptors generally have lower accuracies ranging from TODO to TODO. Sensitivity and specificity are generally well balanced with the exception of `lazar`-PaDEL (low sensitivity) and R deep learning (low specificity) models. 
+ +| |R-RF | R-SVM | R-DL | TF | TF-FS | L | L-HC | L-P | L-P-HC| +|-|-----|-------|------|----|-------|---|------|------|--------| +|Accuracy|{{R-RF.acc}}|{{R-SVM.acc}}|{{R-DL.acc}}|{{tensorflow-all.acc}}|{{tensorflow-selected.acc}}|{{lazar-all.acc}}|{{lazar-high-confidence.acc}}|{{lazar-padel-all.acc}}|{{lazar-padel-high-confidence.acc}}| +|Sensitivity|{{R-RF.tpr}}|{{R-SVM.tpr}}|{{R-DL.tpr}}|{{tensorflow-all.tpr}}|{{tensorflow-selected.tpr}}|{{lazar-all.tpr}}|{{lazar-high-confidence.tpr}}|{{lazar-padel-all.tpr}}|{{lazar-padel-high-confidence.tpr}}| +|Specificity|{{R-RF.tnr}}|{{R-SVM.tnr}}|{{R-DL.tnr}}|{{tensorflow-all.tnr}}|{{tensorflow-selected.tnr}}|{{lazar-all.tnr}}|{{lazar-high-confidence.tnr}}|{{lazar-padel-all.tnr}}|{{lazar-padel-high-confidence.tnr}}| +|PPV|{{R-RF.ppv}}|{{R-SVM.ppv}}|{{R-DL.ppv}}|{{tensorflow-all.ppv}}|{{tensorflow-selected.ppv}}|{{lazar-all.ppv}}|{{lazar-high-confidence.ppv}}|{{lazar-padel-all.ppv}}|{{lazar-padel-high-confidence.ppv}}| +|NPV|{{R-RF.npv}}|{{R-SVM.npv}}|{{R-DL.npv}}|{{tensorflow-all.npv}}|{{tensorflow-selected.npv}}|{{lazar-all.npv}}|{{lazar-high-confidence.npv}}|{{lazar-padel-all.npv}}|{{lazar-padel-high-confidence.npv}}| +|Nr. predictions|{{R-RF.n}}|{{R-SVM.n}}|{{R-DL.n}}|{{tensorflow-all.n}}|{{tensorflow-selected.n}}|{{lazar-all.n}}|{{lazar-high-confidence.n}}|{{lazar-padel-all.n}}|{{lazar-padel-high-confidence.n}}| + +: Summary of crossvalidation results. *R-RF*: R Random Forests, *R-SVM*: R Support Vector Machines, *R-DL*: R Deep Learning, *TF*: Tensorflow without feature selection, *TF-FS*: Tensorflow with feature selection, *L*: lazar, *L-HC*: lazar high confidence predictions, *L-P*: lazar with PaDEL descriptors, *L-P-HC*: lazar PaDEL high confidence predictions, *PPV*: Positive predictive value (Precision), *NPV*: Negative predictive value {#tbl:summary} + +![ROC plot of crossvalidation results. 
*R-RF*: R Random Forests, *R-SVM*: R Support Vector Machines, *R-DL*: R Deep Learning, *TF*: Tensorflow without feature selection, *TF-FS*: Tensorflow with feature selection, *L*: lazar, *L-HC*: lazar high confidence predictions, *L-P*: lazar with PaDEL descriptors, *L-P-HC*: lazar PaDEL high confidence predictions (overlaps with L-P)](figures/roc.png){#fig:roc} + + -Summary -------- +Pyrrolizidine alkaloid mutagenicity predictions +----------------------------------------------- -The results of all crossvalidation experiments are summarized in @tbl:summary. +Pyrrolizidine alkaloid mutagenicity predictions are summarized in @tbl:pa. -| |R-RF | R-SVM | R-DL | TF | TF-FS | L | L-HC | L-P | L-P-HC| -|-|-----|-------|------|----|-------|---|------|------|--------| -|Accuracy|{{R-RF.acc}}|{{R-SVM.acc}}|{{R-DL.acc}}|{{tensorflow-all.acc}}|{{tensorflow-selected.acc}}|{{lazar-all.acc}}|{{lazar-high-confidence.acc}}|{{lazar-padel-all.acc}}|{{lazar-padel-high-confidence.acc}}| -|Sensitivity|{{R-RF.tpr}}|{{R-SVM.tpr}}|{{R-DL.tpr}}|{{tensorflow-all.tpr}}|{{tensorflow-selected.tpr}}|{{lazar-all.tpr}}|{{lazar-high-confidence.tpr}}|{{lazar-padel-all.tpr}}|{{lazar-padel-high-confidence.tpr}}| -|Specificity|{{R-RF.tnr}}|{{R-SVM.tnr}}|{{R-DL.tnr}}|{{tensorflow-all.tnr}}|{{tensorflow-selected.tnr}}|{{lazar-all.tnr}}|{{lazar-high-confidence.tnr}}|{{lazar-padel-all.tnr}}|{{lazar-padel-high-confidence.tnr}}| -|PPV|{{R-RF.ppv}}|{{R-SVM.ppv}}|{{R-DL.ppv}}|{{tensorflow-all.ppv}}|{{tensorflow-selected.ppv}}|{{lazar-all.ppv}}|{{lazar-high-confidence.ppv}}|{{lazar-padel-all.ppv}}|{{lazar-padel-high-confidence.ppv}}| -|NPV|{{R-RF.npv}}|{{R-SVM.npv}}|{{R-DL.npv}}|{{tensorflow-all.npv}}|{{tensorflow-selected.npv}}|{{lazar-all.npv}}|{{lazar-high-confidence.npv}}|{{lazar-padel-all.npv}}|{{lazar-padel-high-confidence.npv}}| -|Nr. 
predictions|{{R-RF.n}}|{{R-SVM.n}}|{{R-DL.n}}|{{tensorflow-all.n}}|{{tensorflow-selected.n}}|{{lazar-all.n}}|{{lazar-high-confidence.n}}|{{lazar-padel-all.n}}|{{lazar-padel-high-confidence.n}}| - -: Summary of crossvalidation results. *R-RF*: R Random Forests, *R-SVM*: R Support Vector Machines, *R-DL*: R Deep Learning, *TF*: TensorFlow without feature selection, *TF-FS*: TensorFlow with feature selection, *L*: lazar, *L-HC*: lazar high confidence predictions, *L-P*: lazar with PaDEL descriptors, *L-P-HC*: lazar PaDEL high confidence predictions, *PPV*: Positive predictive value (Precision), *NPV*: Negative predictive value {#tbl:summary} +@fig:tsne-mp2d shows the position of pyrrolizidine alkaloids (PA) in the mutagenicity training dataset in MP2D space. -@fig:roc shows the position of crossvalidation results in receiver operating characteristic (ROC) space. +![t-SNE visualisation of mutagenicity training data and pyrrolizidine alkaloids (PA) in MP2D space](figures/tsne-mp2d.png){#fig:tsne-mp2d} -![ROC plot of crossvalidation results. *R-RF*: R Random Forests, *R-SVM*: R Support Vector Machines, *R-DL*: R Deep Learning, *TF*: TensorFlow without feature selection, *TF-FS*: TensorFlow with feature selection, *L*: lazar, *L-HC*: lazar high confidence predictions, *L-P*: lazar with PaDEL descriptors, *L-P-HC*: lazar PaDEL high confidence predictions (overlaps with L-P)](figures/roc.png){#fig:roc} +@fig:tsne-padel shows the position of pyrrolizidine alkaloids (PA) in the mutagenicity training dataset in PaDEL space. -Predictions for pyrrolizidine alkaloid mutagenicity ----------------------------------------------------- +![t-SNE visualisation of mutagenicity training data and pyrrolizidine alkaloids (PA) in PaDEL space](figures/tsne-padel.png){#fig:tsne-padel} Discussion ========== @@ -567,7 +629,7 @@ Model performance @tbl:summary and @fig:roc show that the standard `lazar` algorithm (with MP2D fingerprints) give the most accurate crossvalidation results. 
R Random Forests, -Support Vector Machines and TensorFlow models have similar accuracies with +Support Vector Machines and Tensorflow models have similar accuracies with balanced sensitivity (true position rate) and specificity (true negative rate). `lazar` models with PaDEL descriptors have low sensitivity and R Deep Learning models have low specificity. @@ -583,14 +645,14 @@ similar to the experimental variability (@Helma2018). The lowest number of predictions ({{lazar-padel-high-confidence.n}}) has been obtained from `lazar`/PaDEL high confidence predictions, the largest number of -predictions comes from TensorFlow models ({{tensorflow-all.n}}). Standard +predictions comes from Tensorflow models ({{tensorflow-all.n}}). Standard `lazar` give a slightly lower number of predictions ({{lazar-all.n}}) than R -and TensorFlow models. This is not necessarily a disadvantage, because `lazar` +and Tensorflow models. This is not necessarily a disadvantage, because `lazar` abstains from predictions, if the query compound is very dissimilar from the compounds in the training set and thus avoids to make predictions for compounds that do not fall into its applicability domain. -There are two major differences between `lazar` and R/TensorFlow models, which +There are two major differences between `lazar` and R/Tensorflow models, which might explain the different prediction accuracies: - `lazar` uses MolPrint2D fingerprints, while all other models use PaDEL descriptors @@ -614,7 +676,7 @@ PaDEL calculates topological and physical-chemical descriptors. TODO: **Verena** kannst Du bitte die Deskriptoren nochmals kurz beschreiben -PaDEL descriptors were used for the R and TensorFlow models. In addition we +PaDEL descriptors were used for the R and Tensorflow models. 
In addition we have used PaDEL descriptors to calculate cosine similarities for the `lazar` algorithm and compared the results with standard MP2D similarities, which led to a significant decrease of `lazar` prediction accuracies. Based on this @@ -622,7 +684,7 @@ result we can conclude, that PaDEL descriptors are less suited for similarity calculations than MP2D descriptors. In order to investigate, if MP2D fingerprints are also a better option for -global models we have tried to build R and TensorFlow models both with and +global models we have tried to build R and Tensorflow models both with and without unsupervised feature selection. Unfortunately none of the algorithms was capable to deal with the large and sparsely populated descriptor matrix. Based on this result we can conclude, that MP2D descriptors are at the moment @@ -642,7 +704,7 @@ Algorithms structures for a given compound and calculates the prediction based on the experimental data for these structures. The QSAR literature calls such models frequently *local models*, because models are generated specifically for each -query compound. R and TensorFlow models are in contrast *global models*, i.e. a +query compound. R and Tensorflow models are in contrast *global models*, i.e. a single model is used to make predictions for all compounds. It has been postulated in the past, that local models are more accurate, because they can account better for mechanisms, that affect only a subset of the training data. @@ -656,10 +718,6 @@ modelling algorithms that are capable to handle large, sparse binary matrices. 
Mutagenicity of PAs ------------------- -@fig:tsne-mp2d shows the position of pyrrolizidine alkaloids (PA) in the mutagenicity training dataset - -![t-sne visualisation of mutagenicty training data and pyrrolizidine alkaloids (PA)](figures/tsne-mp2d.png){#fig:tsne-mp2d} - Due to the low to moderate predictivity of all models, quantitative statement on the genotoxicity of single PAs cannot be made with sufficient confidence. @@ -670,7 +728,7 @@ literature, and are therefore not further considered in the discussion. Necic acid The rank order of the necic acid is comparable in the four models -considered (LAZAR, RF and DL (R-project and TensorFlow). PAs from the +considered (LAZAR, RF and DL (R-project and Tensorflow). PAs from the monoester type had the lowest genotoxic potential, followed by PAs from the open-ring diester type. PAs with macrocyclic diesters had the highest genotoxic potential. The result fit well with current state of @@ -684,7 +742,7 @@ Necine base The rank order of necine base is comparable in LAZAR, RF, and DL (R-project) models: with platynecine being less or as genotoxic as retronecine, and otonecine being the most genotoxic. In the -TensorFlow-generate DL model, platynecine also has the lowest genotoxic +Tensorflow-generate DL model, platynecine also has the lowest genotoxic probability, but are then followed by the otonecines and last by retronecine. These results partly correspond to earlier published studies. Saturated PAs of the platynecine-type are generally accepted to @@ -697,7 +755,7 @@ than those of the retronecine-type [Li et al. 2013](#_ENREF_80)(). Modifications of necine base -The group-specific results of the TensorFlow-generated DL model appear +The group-specific results of the Tensorflow-generated DL model appear to reflect the expected relationship between the groups: the low genotoxic potential of *N*-oxides and the highest potential of dehydropyrrolizidines [Chen et al. 2010](#_ENREF_26)(). 
@@ -736,7 +794,7 @@ corresponding tertiary PAs. However, in the groups of modification of the necine base, dehydropyrrolizidine, the toxic principle of PAs, should have had the highest genotoxic potential. Taken together, the predictions of the modifications of the necine base from the LAZAR, RF -and R-generated DL model cannot -- in contrast to the TensorFlow DL +and R-generated DL model cannot -- in contrast to the Tensorflow DL model - be considered as reliable. Overall, when comparing the prediction results of the PAs to current @@ -748,7 +806,7 @@ issues: the extended AD, 92.3% of the PAs could be included in the prediction. Even though the Jaccard distance between the training dataset and the PA dataset for the RF, SVM, and DL (R-project and - TensorFlow) models was small, suggesting a high similarity, the + Tensorflow) models was small, suggesting a high similarity, the LAZAR indicated that PAs have only few local neighbours, which might adversely affect the prediction of the mutagenic potential of PAs. @@ -774,7 +832,7 @@ Conclusions =========== A new public *Salmonella* mutagenicity training dataset with 8309 compounds was -created and used it to train `lazar`, R and TensorFlow models. The best +created and used it to train `lazar`, R and Tensorflow models. The best performance was obtained with `lazar` models using MolPrint2D descriptors, with prediction accuracies comparable to the interlaboratory variability of the Ames test. Differences between algorithms (local vs. global models) and/or @@ -783,9 +841,9 @@ prediction accuracies. In this study, an attempt was made to predict the genotoxic potential of PAs using five different machine learning techniques (LAZAR, RF, SVM, DL -(R-project and TensorFlow). The results of all models fitted only partly +(R-project and Tensorflow). The results of all models fitted only partly to the findings in literature, with best results obtained with the -TensorFlow DL model. 
Therefore, modelling allows statements on the +Tensorflow DL model. Therefore, modelling allows statements on the relative risks of genotoxicity of the different PA groups. Individual predictions for selective PAs appear, however, not reliable on the current basis of the used training dataset. diff --git a/mutagenicity.pdf b/mutagenicity.pdf new file mode 100644 index 0000000..b7cd456 Binary files /dev/null and b/mutagenicity.pdf differ -- cgit v1.2.3