From 1f26844dd7b25ac9f8891e745eefcce17239b16c Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Mon, 29 Mar 2021 15:11:49 +0200
Subject: introduction, nixpkgs pinned

---
 bibliography.bib |  17 +++++++++++++
 mutagenicity.md  |  72 ++++++++++++++++++++++++++++++++-----------------------
 mutagenicity.pdf | Bin 3176503 -> 3178373 bytes
 shell.nix        |   3 ++-
 4 files changed, 61 insertions(+), 31 deletions(-)

diff --git a/bibliography.bib b/bibliography.bib
index 45c5852..ca5c3da 100644
--- a/bibliography.bib
+++ b/bibliography.bib
@@ -1,3 +1,20 @@
+@misc{ICH2017,
+  author = {International Council for Harmonisation of Technical Requirements for Pharmaceuticals for Human Use (ICH)},
+  title = {Assessment and control of DNA reactive (mutagenic) impurities in pharmaceuticals to limit potential carcinogenic risk M7(R1)},
+  year = 2017,
+  url = {https://database.ich.org/sites/default/files/M7_R1_Guideline.pdf},
+  note = {Accessed: 29-03-2021},
+}
+
+@misc{ECHA2017,
+  author ={European Chemicals Agency (ECHA)},
+  title = {Guidance on Information Requirements and Chemical Safety Assessment, Chapter R.7a: Endpoint specific guidance},
+  year = 2017,
+  url = {https://echa.europa.eu/documents/10162/13632/information_requirements_r6_en.pdf},
+  note = {Accessed: 29-03-2021},
+  isbn = {978-92-9495-970-6},
+  doi = {10.2823/337352},
+}
 @article{Rubiolo1992,
   author = {Rubiolo, P. and Pieters, L. and Calomme, M. and Bicchi, C. and Vlietinck, A. and Vanden Berghe, D.},
   year = 1992,
diff --git a/mutagenicity.md b/mutagenicity.md
index 3911799..9c5f427 100644
--- a/mutagenicity.md
+++ b/mutagenicity.md
@@ -52,28 +52,46 @@ MolPrint2D and Chemistry Development Kit (CDK) descriptors.  Crossvalidation
 accuracies of all investigated models ranged from 80-85% which is comparable
 with the interlaboratory variability of the *Salmonella* mutagenicity assay.
 Pyrrolizidine alkaloid predictions showed a clear distinction between chemical
-groups, where Otonecines had the highest proportion of positive mutagenicity
-predictions and Monoesters the lowest.
+groups, where otonecines had the highest proportion of positive mutagenicity
+predictions and monoesters the lowest.
 
 Introduction
 ============
 
-**TODO**: rationale for investigation
-<!---
-
-Mutagenicity datasets
-Algorithms
-descriptors
-define abbreviations
-pyrrolizidine 
-large dataset -> comparison of algorithms and descriptors
-reliable experimental outcome
---->
+The assessment of mutagenicity is an important part of the safety assessment of
+chemical structures, because genomic changes may lead to cancer and germ
+cell damage.  The *Salmonella typhimurium* bacterial reverse mutation
+test (Ames test) is capable of identifying substances that cause mutations (e.g.,
+base-pair substitutions, frameshifts, insertions, deletions) and is generally
+used as the first step in genotoxicity and carcinogenicity assessments.
+
+Computer-based (*in silico*) mutagenicity predictions can be used in the early
+screening of novel compounds (e.g. drug candidates), but they are also gaining
+regulatory acceptance, e.g. for the registration of industrial chemicals within
+REACH (@ECHA2017) or the assessment of impurities in pharmaceuticals (ICH M7
+guideline, @ICH2017).
+
+*Salmonella* mutagenicity is currently the toxicological endpoint with the
+largest amount of public data (almost 10000 structures), whereas datasets for
+other endpoints typically contain only a few hundred compounds. The Ames test
+itself is relatively reliable, with an interlaboratory reproducibility of 80-85%
+(@Benigni1988).
+
+This makes the development of mutagenicity models also interesting from a
+computational chemistry and machine learning point of view.  The relatively
+large amount of public data reduces the probability of chance effects due to
+small sample sizes, and the reliability of the underlying assay reduces the risk
+of overfitting to experimental errors.
+
+Within this study we attempted
+
+  - to generate a new public mutagenicity training dataset, by combining the most comprehensive public datasets
+  - to compare the performance of MolPrint2D (*MP2D*) fingerprints with Chemistry Development Kit (*CDK*) descriptors for mutagenicity predictions
+  - to compare the performance of global QSAR models (random forests (*RF*), support vector machines (*SVM*), logistic regression (*LR*), neural nets (*NN*)) with local models (`lazar`)
 
-As case study we decided to apply these mutagenicity models to {{pa.nr}}
-Pyrrolizidines alkaloids (PAs) in order to highlight potentials and problems
-with the applicability of mutagenicity models for compounds with very limited
-experimental data.
+In order to highlight potentials and problems with the application of
+mutagenicity models to compounds with limited experimental data, we decided to
+apply these mutagenicity models to {{pa.nr}} pyrrolizidine alkaloids (PAs).
 
 Pyrrolizidine alkaloids (PAs) are characteristic metabolites of some plant
 families, mainly: *Asteraceae*, *Boraginaceae*, *Fabaceae* and *Orchidaceae*
@@ -87,14 +105,8 @@ base and necic acid (@Hadi2021; @Allemang2018, @Louisse2019). However, due to
 limited availability of pure substances, only a limited number of PAs have been
 investigated with regards to their structure-specific mutagenicity. To overcome
 this bottleneck, the prediction of structure-specific mutagenic potential of
-PAs with different machine learning models could provide further inside in the mechanisms.
-
-Summing up the main objectives of this study were
-
-  - to generate a new mutagenicity training dataset, by combining the most comprehensive public datasets
-  - to compare the performance of MolPrint2D (*MP2D*) fingerprints with Chemistry Development Kit (*CDK*) descriptors
-  - to compare the performance of global QSAR models (random forests (*RF*), support vector machines (*SVM*), logistic regression (*LR*), neural nets (*NN*)) with local models (`lazar`)
-  - to apply these models for the prediction of pyrrolizidine alkaloid mutagenicity
+PAs with different machine learning models could provide further insight into the
+mechanisms.
 
 Materials and Methods
 =====================
@@ -585,12 +597,12 @@ models ({{pa.mp2d_svm.mut_perc}}-{{pa.mp2d_lazar_high_confidence.mut_perc}}%,
 @tbl:pa-summary, @fig:pa-groups). 
 
 Over all models, the mean value of mutagenic predicted PAs was highest for
-Otonecines ({{pa.groups.Otonecine.mut_perc}}%, 
+otonecines ({{pa.groups.Otonecine.mut_perc}}%, 
 {{pa.groups.Otonecine.mut}}/{{pa.groups.Otonecine.n_pred}}),
-followed by Macrocyclic diesters ({{pa.groups.Macrocyclic_diester.mut_perc}}%, {{pa.groups.Macrocyclic_diester.mut}}/{{pa.groups.Macrocyclic_diester.n_pred}}),
-Dehydropyrrolizidine ({{pa.groups.Dehydropyrrolizidine.mut_perc}}%, {{pa.groups.Dehydropyrrolizidine.mut}}/{{pa.groups.Dehydropyrrolizidine.n_pred}}),
-Tertiary PAs ({{pa.groups.Tertiary_PA.mut_perc}}%, {{pa.groups.Tertiary_PA.mut}}/{{pa.groups.Tertiary_PA.n_pred}}) and
-Retronecines ({{pa.groups.Retronecine.mut_perc}}%, {{pa.groups.Retronecine.mut}}/{{pa.groups.Retronecine.n_pred}}).
+followed by macrocyclic diesters ({{pa.groups.Macrocyclic_diester.mut_perc}}%, {{pa.groups.Macrocyclic_diester.mut}}/{{pa.groups.Macrocyclic_diester.n_pred}}),
+dehydropyrrolizidines ({{pa.groups.Dehydropyrrolizidine.mut_perc}}%, {{pa.groups.Dehydropyrrolizidine.mut}}/{{pa.groups.Dehydropyrrolizidine.n_pred}}),
+tertiary PAs ({{pa.groups.Tertiary_PA.mut_perc}}%, {{pa.groups.Tertiary_PA.mut}}/{{pa.groups.Tertiary_PA.n_pred}}) and
+retronecines ({{pa.groups.Retronecine.mut_perc}}%, {{pa.groups.Retronecine.mut}}/{{pa.groups.Retronecine.n_pred}}).
 
 When excluding the aforementioned three deviating models,
 the rank order stays the same, but the percentage of mutagenic PAs is higher.
diff --git a/mutagenicity.pdf b/mutagenicity.pdf
index 5a333b1..e46da4d 100644
Binary files a/mutagenicity.pdf and b/mutagenicity.pdf differ
diff --git a/shell.nix b/shell.nix
index 1ee567f..ab0ed26 100644
--- a/shell.nix
+++ b/shell.nix
@@ -1,4 +1,5 @@
-with import <nixpkgs> { };
+#with import <nixpkgs> { };
+with import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/d88cdc7bc1a7b3d5a50369bf4d2c7844e4868be2.tar.gz") {};
 let
   R-packages = rWrapper.override { packages = with rPackages; [ ggplot2 Rtsne ]; };
   gems = bundlerEnv { name = "mustache"; gemdir = ./.; };
-- 
cgit v1.2.3