Martins contribution

author: Christoph Helma <helma@in-silico.ch> 2016-02-23 12:15:27 +0100
committer: Christoph Helma <helma@in-silico.ch> 2016-02-23 12:15:27 +0100
commit: 3c1e3821db9a7ffe813621872a8ba253a09de77c (patch)
tree: e01a1525d611581ef922f5ac8cb4d1f4445653ef
parent: 4d67ddb9fe72cb4cba42e508a943e1d4d04fef8c (diff)
12 files changed, 369 insertions, 300 deletions
diff --git a/paper/Makefile b/paper/Makefile
index 4aa5ab3..7380267 100644
--- a/paper/Makefile
+++ b/paper/Makefile
@@ -2,21 +2,20 @@
 loael.pdf: loael.md references.bibtex
 	pandoc -r markdown+simple_tables+table_captions+yaml_metadata_block -s -S --bibliography=references.bibtex --latex-engine=pdflatex --filter pandoc-crossref --filter pandoc-citeproc -o loael.pdf loael.md
 
-loael.md: loael.Rmd figures validations
+loael.md: loael.Rmd # TODO: add further dependencies
 	Rscript --vanilla -e "library(knitr); knit('loael.Rmd');"
 
 loael.docx: loael.md 
 	pandoc --filter pandoc-crossref --filter pandoc-citeproc loael.md -s -o loael.docx
 
-rmse.R: rmse.rb
-	ruby rmse.rb
-
 # Figures
 
-figures: datasets validations figure/functional-groups.pdf  figure/test-prediction.pdf figure/test-correlation.pdf figure/crossvalidation.pdf
+figures: datasets validations figure/functional-groups.pdf  figure/test-prediction.pdf figure/test-correlation.pdf figure/crossvalidation.pdf figure/dataset-variability.pdf
 
 figure/functional-groups.pdf: data/functional-groups-reduced4R.csv functional-groups.R
 	Rscript functional-groups.R
+figure/dataset-variability.pdf: data/mazzatorta.csv data/swiss.csv dataset-variability.R
+	Rscript dataset-variability.R
 
 figure/crossvalidation.pdf: data/mazzatorta-cv.csv data/swiss-cv.csv data/combined-cv.csv 
 	Rscript crossvalidation-plots.R
diff --git a/paper/crossvalidation.R b/paper/crossvalidation.R
deleted file mode 100644
index a32f608..0000000
--- a/paper/crossvalidation.R
+++ /dev/null
@@ -1,16 +0,0 @@
-mazzatorta = read.csv("data/mazzatorta-cv.csv",header=T)
-swiss = read.csv("data/swiss-cv.csv",header=T)
-combined = read.csv("data/combined-cv.csv",header=T)
-
-cv.mazzatorta.p = round(cor.test(-log(mazzatorta$LOAEL_measured_median),-log(mazzatorta$LOAEL_predicted))$p.value,2)
-cv.mazzatorta.r_square = round(cor(-log(mazzatorta$LOAEL_measured_median),-log(mazzatorta$LOAEL_predicted))^2,2)
-cv.mazzatorta.rmse = round(sqrt(mean((-log(mazzatorta$LOAEL_measured_median)+log(mazzatorta$LOAEL_predicted))^2)),2)
-
-cv.swiss.p = round(cor.test(-log(swiss$LOAEL_measured_median),-log(swiss$LOAEL_predicted))$p.value,2)
-cv.swiss.r_square = round(cor(-log(swiss$LOAEL_measured_median),-log(swiss$LOAEL_predicted))^2,2)
-cv.swiss.rmse = round(sqrt(mean((-log(swiss$LOAEL_measured_median)+log(swiss$LOAEL_predicted))^2)),2)
-
-cv.combined.p = round(cor.test(-log(combined$LOAEL_measured_median),-log(combined$LOAEL_predicted))$p.value,2)
-cv.combined.r_square = round(cor(-log(combined$LOAEL_measured_median),-log(combined$LOAEL_predicted))^2,2)
-cv.combined.rmse = round(sqrt(mean((-log(combined$LOAEL_measured_median)+log(combined$LOAEL_predicted))^2)),2)
-
diff --git a/paper/dataset-variability.R b/paper/dataset-variability.R
new file mode 100644
index 0000000..b0e3c76
--- /dev/null
+++ b/paper/dataset-variability.R
@@ -0,0 +1,24 @@
+library(ggplot2)
+library(grid)
+library(gridExtra)
+
+m = read.csv("data/mazzatorta.csv",header=T)
+s = read.csv("data/swiss.csv",header=T)
+
+m.dupsmi = unique(m$SMILES[duplicated(m$SMILES)])
+s.dupsmi = unique(s$SMILES[duplicated(s$SMILES)])
+m.dup = m[m$SMILES %in% m.dupsmi,]
+s.dup = s[s$SMILES %in% s.dupsmi,]
+
+m.dup$LOAEL= -log10(m.dup$LOAEL)
+s.dup$LOAEL= -log10(s.dup$LOAEL)
+m.dup$SMILES <- reorder(m.dup$SMILES,m.dup$LOAEL)
+s.dup$SMILES <- reorder(s.dup$SMILES,s.dup$LOAEL)
+
+p1 <- ggplot(m.dup, aes(SMILES,LOAEL),ymin = min(LOAEL), ymax=max(LOAEL)) + ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') + theme(axis.text.x = element_blank()) + geom_point() + ggtitle("Mazzatorta") + ylim(-1,4)
+p2 <- ggplot(s.dup, aes(SMILES,LOAEL),ymin = min(LOAEL), ymax=max(LOAEL)) + ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') + theme(axis.text.x = element_blank()) + geom_point() + ggtitle("Swiss Federal Office") + ylim(-1,4)
+
+pdf('figure/dataset-variability.pdf')
+grid.arrange(p1,p2,ncol=1)
+dev.off()
+
diff --git a/paper/figure/dataset-variability.pdf b/paper/figure/dataset-variability.pdf
new file mode 100644
index 0000000..4e2b0ad
--- /dev/null
+++ b/paper/figure/dataset-variability.pdf
diff --git a/paper/figure/matching-ClC(C)Cl.png b/paper/figure/matching-ClC(C)Cl.png
new file mode 100644
index 0000000..8641b24
--- /dev/null
+++ b/paper/figure/matching-ClC(C)Cl.png
diff --git a/paper/figure/pc-small-compounds-highlighted.png b/paper/figure/pc-small-compounds-highlighted.png
new file mode 100644
index 0000000..a893a65
--- /dev/null
+++ b/paper/figure/pc-small-compounds-highlighted.png
diff --git a/paper/figure/unnamed-chunk-4-1.png b/paper/figure/unnamed-chunk-4-1.png
new file mode 100644
index 0000000..27b9a79
--- /dev/null
+++ b/paper/figure/unnamed-chunk-4-1.png
diff --git a/paper/loael.Rmd b/paper/loael.Rmd
index 29456a6..98c9e81 100644
--- a/paper/loael.Rmd
+++ b/paper/loael.Rmd
@@ -10,15 +10,23 @@ abstract: " "
 documentclass: achemso
 bibliography: references.bibtex
 bibliographystyle: achemso
+figPrefix: Figure
+eqnPrefix: Equation
+tblPrefix: Table
 output:
   pdf_document:
     fig_caption: yes
 ...
 
+```{r echo=F}
+rsquare <- function(x,y) { cor(x,y,use='complete')^2 }
+rmse <-  function(x,y) { sqrt(mean((x-y)^2,na.rm=TRUE)) }
+```
+
 Introduction
 ============
 
-Christoph + Elena + Benoit
+Elena + Benoit
 
 The quality and reproducibility of (Q)SAR and  read-across predictions is a controversial topic in the toxicological risk-assessment community. Although model predictions can be validated with various procedures it is rarely possible to put the results into the context of experimental variability, because replicate experiments are rarely available.
 
@@ -82,11 +90,15 @@ for `r length(unique(s$SMILES))` unique chemical structures.
 
 ### Preprocessing
 
-Chemical structures in both datasets were initially represented as SMILES strings
-[@doi:10.1021/ci00057a005]. Syntactically incorrect and missing SMILES were
-generated from other identifiers (e.g names, CAS numbers). Unique smiles from the OpenBabel library [@OBoyle2011] were used for the identification of duplicated structures. 
+Chemical structures in both datasets were initially represented as SMILES
+strings [@doi:10.1021/ci00057a005]. Syntactically incorrect and missing SMILES
+were generated from other identifiers (e.g names, CAS numbers). Unique smiles
+from the OpenBabel library [@OBoyle2011] were used for the identification of
+duplicated structures. 
 
-Studies with undefined or empty LOAEL entries were removed from the datasets. LOAEL values were converted to mmol/kg_bw/day units. For prediction, validation and visualisation purposes -log10 transformations are used.
+Studies with undefined or empty LOAEL entries were removed from the datasets.
+LOAEL values were converted to mmol/kg_bw/day units. For prediction, validation
+and visualisation purposes -log10 transformations are used.
 
 David: please check if we have missed something
 
@@ -94,11 +106,16 @@ David: please check if we have missed something
 
 Two derived datasets were obtained from the original datasets: 
 
-The *test* dataset contains data of compounds that occur in both datasets. Exact duplications of LOAEL values were removed, because it is very likely that they originate from the same study. 
-The test dataset has `r length(t$SMILES)` LOAEL values for `r length(unique(t$SMILES))` unique chemical structures.
+The *test* dataset contains data of compounds that occur in both datasets.
+Exact duplications of LOAEL values were removed, because it is very likely that
+they originate from the same study.  The test dataset has `r length(t$SMILES)`
+LOAEL values for `r length(unique(t$SMILES))` unique chemical structures.
 
-The *combined* dataset is the union of the Mazzatorta and the Swiss Federal Office dataset and it is used to build predictive models. Exact LOAEL duplications were removed, as for the test dataset.
-The combined dataset has `r length(c$SMILES)` LOAEL values for `r length(unique(c$SMILES))` unique chemical structures.
+The *combined* dataset is the union of the Mazzatorta and the Swiss Federal
+Office dataset and it is used to build predictive models. Exact LOAEL
+duplications were removed, as for the test dataset.  The combined dataset has
+`r length(c$SMILES)` LOAEL values for `r length(unique(c$SMILES))` unique
+chemical structures.
 
 Algorithms
 ----------
@@ -128,19 +145,25 @@ algorithms:
 
 ### Neighbor identification
 
-Similarity calculations are based on MolPrint2D fingerprints [@doi:10.1021/ci034207y] from the OpenBabel chemoinformatics library [@OBoyle2011].
-
-The MolPrint2D fingerprint uses atom environments as molecular
-representation, which resemble basically the chemical concept of
-functional groups. For each atom in a molecule it represents the
-chemical environment using the atom types of connected atoms.
-
-MolPrint2D fingerprints are generated dynamically from chemical structures and do not rely on predefined lists of fragments (such as OpenBabel FP3, FP4 or MACCs fingerprints or lists of toxocophores/toxicophobes). This has the advantage the they may capture substructures of toxicological relevance that are not included in other fingerprints.
-Preliminary experiments have shown that predictions with MolPrint2D
-fingerprints are indeed more accurate than other OpenBabel fingerprints.
-
-From MolPrint2D fingerprints we can construct a feature vector with all
-atom environments of a compound, which can be used to calculate chemical
+Similarity calculations are based on MolPrint2D fingerprints
+[@doi:10.1021/ci034207y] from the OpenBabel chemoinformatics library
+[@OBoyle2011].
+
+The MolPrint2D fingerprint uses atom environments as molecular representation,
+which resemble basically the chemical concept of functional groups. For each
+atom in a molecule it represents the chemical environment using the atom types
+of connected atoms.
+
+MolPrint2D fingerprints are generated dynamically from chemical structures and
+do not rely on predefined lists of fragments (such as OpenBabel FP3, FP4 or
+MACCs fingerprints or lists of toxocophores/toxicophobes). This has the
+advantage the they may capture substructures of toxicological relevance that
+are not included in other fingerprints.  Preliminary experiments have shown
+that predictions with MolPrint2D fingerprints are indeed more accurate than
+other OpenBabel fingerprints.
+
+From MolPrint2D fingerprints we can construct a feature vector with all atom
+environments of a compound, which can be used to calculate chemical
 similarities.
 
 [//]: # https://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format
@@ -151,31 +174,44 @@ total number of atom environments $A \cup B$ (Jaccard/Tanimoto index, [@eq:jacca
 
 $$ sim = \frac{|A \cap B|}{|A \cup B|} $$ {#eq:jaccard}
 
-A threshold of $sim < 0.1$ is used for the identification of neighbors for local QSAR models.
-Compounds with the same structure as the query structure are eliminated from the neighbors to obtain an unbiased prediction.
+A threshold of $sim < 0.1$ is used for the identification of neighbors for
+local QSAR models.  Compounds with the same structure as the query structure
+are eliminated from the neighbors to obtain an unbiased prediction.
 
 ### Local QSAR models and predictions
 
-Only similar compounds (*neighbors*) are used for local QSAR models.
-In this investigation we are using a weighted partial least squares regression (PLS) algorithm for the prediction of quantitative properties.
-First all fingerprint features with identical values across all neighbors are removed.
-The reamining set of features is used as descriptors for creating a local weighted PLS model with atom environments as descriptors and model similarities as weights. The `plsr` function of the `pls` R package [@pls] is used for this purpose.
-Finally the local PLS model is applied to predict the activity of the query compound.
+Only similar compounds (*neighbors*) are used for local QSAR models.  In this
+investigation we are using a weighted partial least squares regression (PLS)
+algorithm for the prediction of quantitative properties.  First all fingerprint
+features with identical values across all neighbors are removed.  The reamining
+set of features is used as descriptors for creating a local weighted PLS model
+with atom environments as descriptors and model similarities as weights. The
+`plsr` function of the `pls` R package [@pls] is used for this purpose.
+Finally the local PLS model is applied to predict the activity of the query
+compound.
 
-If PLS modelling or prediction fails, the program resorts to using the weighted mean of the
-neighbors LOAEL values, where the contribution of each neighbor is
+If PLS modelling or prediction fails, the program resorts to using the weighted
+mean of the neighbors LOAEL values, where the contribution of each neighbor is
 weighted by its similarity to the query compound.
 
-### Validation
+### Applicability domain
+
+Christoph: TODO
 
-Two types of validations are used within this study:
+### Validation
 
-For the comparison of experimental variability with predictive accuracies we are using a test set of compounds that occur in both datasets. The *Mazzatorta*, *Swiss Federal Office* and *combined* datasets are used as training data for read across predictions. In order to obtain unbiased predictions *all* information from the test compound is removed from the training set prior to predictions. This procedure is hardcoded into the prediction algorithm in order to prevent validation errors.
+For the comparison of experimental variability with predictive accuracies we
+are using a test set of compounds that occur in both datasets. The
+*Mazzatorta*, *Swiss Federal Office* and *combined* datasets are used as
+training data for read across predictions. In order to obtain unbiased
+predictions *all* information from the test compound is removed from the
+training set prior to predictions. This procedure is hardcoded into the
+prediction algorithm in order to prevent validation errors.  Traditional
+10-fold crossvalidation results are provided as additional information for all
+three models. 
 
 TODO: treatment of duplicates
 
-In addition traditional 10-fold crossvalidation results are provided. 
-
 Christoph: check if these specifications have changed at submission
 
 Results
@@ -183,140 +219,192 @@ Results
 
 ### Dataset comparison
 
-Christoph + Elena
+Elena
 
 The main objective of this section is to compare the content of both
 databases in terms of structural composition and LOAEL values, to
 estimate the experimental variability of LOAEL values and to establish a
 baseline for evaluating prediction performance.
 
-#### Applicability domain
 
 ##### Ches-Mapper analysis
 
-Martin
-
-CheS-Mapper (Chemical Space Mapping and Visualization in 3D,
-http://ches-mapper.org/, @Gütlein2012) can be
-used to analyze the relationship between the structure of chemical
-compounds, their physico-chemical properties, and biological or toxic
-effects. CheS-Mapper embeds a dataset into 3D space, such that compounds
-with similar feature values are close to each other. The following two
-screenshots visualise the comparison. The datasets are embeded into 3D
-Space based on structural fragments from three Smart list (OpenBabel
-FP3, OpenBabel FP4 and OpenBabel MACCS).
+We applied the visualization tool CheS-Mapper (Chemical Space Mapping and Visualization in 3D,
+http://ches-mapper.org, @Gütlein2012) to compare both datasets. CheS-Mapper can be used to analyze the relationship between the structure of chemical compounds, their physico-chemical properties, and biological or toxic effects. It embeds a dataset into 3D space, such that compounds with similar feature values are close to each other. CheS-Mapper is generic and can be employed with different kinds of features. [@fig:ches-mapper-pc] shows an embedding that is based on physico-chemical (PC) descriptors: we determined that both datasets have very similar PC feature values.
+
+![Compounds from the Mazzatorta and the Swiss dataset are highlighted in red and green. Compounds that occur in both datasets are highlighted in magenta. In this example, CheS-Mapper applied a principal components analysis to map compounds according to their physico-chemical (PC) feature values into 3D space. Both datasets have in general similar PC feature values. As an exception, the Mazzatorta dataset includes most of the tiny compound structures: we have selected the 78 smallest compounds (with 10 atoms and less, marked with a blue box in the screen-shot) and found that 61 of these compounds occur in the Mazzatorta dataset, whereas only 19 are contained in the Swiss dataset (p-value 3.7E-7).](figure/pc-small-compounds-highlighted.png){#fig:ches-mapper-pc}
+
+We extended CheS-Mapper with a functionality to mine the same MolPrint2D features that are utilized for model building in this work. Applying a minimum frequency of 3 yields 760 distinguished MolPrint2D fragments for the composed dataset of 671 unique compounds. Again, a visual inspection confirmed that both datasets are structurally very similar. However, CheS-Mapper allows the detection of features that help to distinguish groups of selected compounds from the entire dataset. Hence, we found discriminating features for compounds that occur in only one of both datasets, and for the most active or in-active compounds (see [@tbl:molprint]). As an example, [@fig:ches-mapper-alert] shows 9 compounds that match a specific fragment (all other compounds in the dataset do not match this fragment) and have very low mean LOAEL values.
+
+| Selection  | Num selected compounds | Feature name                           | Human-readable feature name          | Feature values entire dataset | Feature values in selection  | P-Value               |
+|------------|------------------------|----------------------------------------|--------------------------------------|-------------------------------|------------------------------|-----------------------|
+|            |                        |                                        |                                      |                               |                              |                       |
+| Mazzatorta | 290                    | 8;1-1-3;2-2-3;                         | O.3 1:C.ar 2:2xC.ar                  | 643× 'no-match', 28× 'match'  | 265× 'no-match', 25× 'match' | 0.005560996217776615  |
+| Mazzatorta | 290                    | 9;1-1-2;2-1-3;2-1-28;                  | O 1:C.2 2:C.ar,N.am                  | 629× 'no-match', 42× 'match'  | 284× 'no-match', 6× 'match'  | 0.006195320799272208  |
+| Mazzatorta | 290                    | 15;1-1-3;2-2-3;                        | Cl 1:C.ar 2:2xC.ar                   | 504× 'no-match', 167× 'match' | 240× 'no-match', 50× 'match' | 0.009255119323774763  |
+|            |                        |                                        |                                      |                               |                              |                       |
+| Swiss      | 226                    | 16;1-1-3;2-2-3;                        | F 1:C.ar 2:2xC.ar                    | 630× 'no-match', 41× 'match'  | 199× 'no-match', 27× 'match' | 0.004142648833225349  |
+| Swiss      | 226                    | 8;1-1-3;2-2-3;                         | O.3 1:C.ar 2:2xC.ar                  | 643× 'no-match', 28× 'match'  | 225× 'no-match', 1× 'match'  | 0.006101869043914521  |
+|            |                        |                                        |                                      |                               |                              |                       |
+| low10      | 67                     | 1;1-1-8;2-1-12;                        | C 1:O.3 2:P.3                        | 642× 'no-match', 29× 'match'  | 52× 'no-match', 15× 'match'  | 2.599701232064433E-9  |
+| low10      | 67                     | 15;1-1-1;2-2-1;2-1-15;                 | Cl 1:C 2:2xC,Cl                      | 662× 'no-match', 9× 'match'   | 59× 'no-match', 8× 'match'   | 3.499196354894707E-8  |
+| low10      | 67                     | 1;1-1-1;1-1-8;2-1-12;                  | C 1:C,O.3 2:P.3                      | 645× 'no-match', 26× 'match'  | 54× 'no-match', 13× 'match'  | 6.053371437442223E-8  |
+| low10      | 67                     | 2;1-1-1;1-1-2;2-3-1;                   | C.2 1:C,C.2 2:3xC                    | 663× 'no-match', 8× 'match'   | 61× 'no-match', 6× 'match'   | 8.936801443204523E-6  |
+| low10      | 67                     | 2;1-1-1;1-1-2;1-1-15;2-3-1;2-2-15;     | C.2 1:C,C.2,Cl 2:3xC,2xCl            | 665× 'no-match', 6× 'match'   | 62× 'no-match', 5× 'match'   | 2.3279183652191726E-5 |
+|            |                        |                                        |                                      |                               |                              |                       |
+| high10     | 67                     | 8;1-1-3;2-2-3;                         | O.3 1:C.ar 2:2xC.ar                  | 643× 'no-match', 28× 'match'  | 57× 'no-match', 10× 'match'  | 1.4617532950766954E-4 |
+| high10     | 67                     | 3;1-2-3;2-1-1;2-2-3;                   | C.ar 1:2xC.ar 2:C,2xC.ar             | 506× 'no-match', 165× 'match' | 64× 'no-match', 3× 'match'   | 1.8132445228380423E-4 |
+| high10     | 67                     | 1;1-1-1;1-1-2;2-1-1;2-1-8;2-1-9;       | C 1:C,C.2 2:C,O.3,O                  | 668× 'no-match', 3× 'match'   | 64× 'no-match', 3× 'match'   | 4.598209084156757E-4  |
+| high10     | 67                     | 1;1-2-1;1-1-8;2-1-1;2-2-8;             | C 1:2xC,O.3 2:C,2xO.3                | 668× 'no-match', 3× 'match'   | 64× 'no-match', 3× 'match'   | 4.598209084156757E-4  |
+| high10     | 67                     | 3;1-1-2;1-2-3;2-1-2;2-2-3;2-1-8;2-1-9; | C.ar 1:C.2,2xC.ar 2:C.2,2xC.ar,O.3,O | 662× 'no-match', 9× 'match'   | 62× 'no-match', 5× 'match'   | 4.613813663041366E-4  |
+
+: Significant MolPrint2D features. The listed features help to distinguish a selection of compounds from the entire dataset (of 671 compounds). We selected compounds that occur in only one of the two datasets, or the top 10 percent of all compounds with the lowest/highest LOAEL values (the mean LOAEL value was computed when a compound occurs in both dataset or was measured multiple times). For each set of selected compounds, we listed the top five most significant fragments with p-value < 0.01 (if available, otherwise less fragments). The MolPrint2D fragments are circular fragments that consist of a center atom, and to layers of neighboring atoms. {#tbl:molprint}
+
+![A CheS-Mapper screen-shot showing 9 compounds that match the MolPrint2D fragment 15;1-1-1;2-2-1;2-1-15; (as SMILES syntax: ClC(C)Cl). Apart from the selected compound (blue box), the other 8 compounds belong to the top 10 percent of compounds with the lowest LOAEL values. I.e., this feature can be regarded as a structural alert in our dataset, as it is matched by only 9 compounds in the entire dataset and 8 of these compounds are highly active.](figure/matching-ClC(C)Cl.png){#fig:ches-mapper-alert}
 
 ##### Distribution of functional groups
 
-Christoph
+```{r echo=F}
+fg = read.csv('data/functional-groups.csv',head=F)
+```
 
-[@fig:fg] shows the frequency of selected functional groups in both
-datasets. A complete table for 138 functional groups from OpenBabel FP4
-fingerprints can be found in the appendix.
+In order to confirm the results of CheS-Mapper analysis we have evaluated the
+frequency of `r length(fg$V1)` functional groups from the OpenBabel FP4
+fingerprint.  [@fig:fg] shows the frequency of selected functional groups in
+both datasets. The complete table for all functional groups can be found in the
+data directory of the supplemental material (`data/functional-groups.csv`).
 
 ![Frequency of functional groups.](figure/functional-groups.pdf){#fig:fg}
 
 ### Experimental variability versus prediction uncertainty 
 
-```{r echo=FALSE}
-source('rmse.R')
-```
-
-Christoph
-
 Duplicated LOAEL values can be found in both datasets and there is a
-substantial overlap of compounds, with LOAEL values in both datasets.
+substantial number of `r length(unique(t$SMILES))` compounds occurring in both
+datasets.  These duplicates allow us to estimate the variability of
+experimental results within individual datasets and between datasets.
 
 ##### Intra dataset variability
 
-TODO: read data from files
+```{r echo=F}
+m.dupsmi = unique(m$SMILES[duplicated(m$SMILES)])
+s.dupsmi = unique(s$SMILES[duplicated(s$SMILES)])
 
-The Mazzatorta dataset has 562 LOAEL values with 439 unique structures,
-the Swiss Federal Office dataset has 493 rat LOAEL values with 381
-unique structures. [@fig:intra] shows the intra-dataset variability, where
-each vertical line represents a single compound and each dot represents
-an individual LOAEL value. The experimental variance of LOAEL values is
-similar in both datasets (p-value: 0.48).
+m.dup = m[m$SMILES %in% m.dupsmi,]
+s.dup = s[s$SMILES %in% s.dupsmi,]
 
-[//]: # p-value: 0.4750771581019402
+m.dupnr = length(m.dupsmi)
+s.dupnr = length(s.dupsmi)
 
-[//]: # ![Intra dataset variability: Each vertical line represents a compound, dots are individual LOAEL values.](loael-dataset-comparison-all-compounds.pdf){#fig:intra}
+m.dup$var = ave(-log10(m.dup$LOAEL),m.dup$SMILES,FUN=var)
+s.dup$var = ave(-log10(s.dup$LOAEL),s.dup$SMILES,FUN=var)
 
-##### Inter dataset variability
+p = t.test(m.dup$var,s.dup$var)$p.value
+```
 
-[@fig:inter] shows the same situation for the combination of the Mazzatorta
-and Swiss Federal Office datasets. Obviously the experimental
-variability is larger than for individual datasets.
+The Mazzatorta dataset has `r length(m$SMILES)` LOAEL values for `r length(levels(m$SMILES))` unique structures, `r m.dupnr` compounds have multiple measurements with an average variance of `r round(mean(m.dup$var,na.rm=T),2)` log10 units [@fig:intra]. 
 
-[//]: # ![Inter dataset variability](loael-dataset-comparison-common-compounds.pdf){#fig:inter}
+The Swiss Federal Office dataset has `r length(s$SMILES)` rat LOAEL values for `r length(levels(s$SMILES))` unique structures, `r s.dupnr` compounds have multiple measurements with a similar variance (average `r round(mean(s.dup$var),2)` log10 units). Variances of both datasets do not show a statistically significant difference with a
+p-value (t-test) of `r round(p,2)`.
 
-##### LOAEL correlation between datasets
+![Variability of LOAEL values in both datasets: Each vertical line represents a compound, dots are individual LOAEL values.](figure/dataset-variability.pdf){#fig:intra}
 
-[@fig:corr] depicts the correlation between LOAEL data from both datasets
-(using means for multiple measurements).
-Identical values were removed from analysis.
+##### Inter dataset variability
 
-[//]: #   MAE: 0.801626064534318
-[//]: # with identical values
+[@fig:comp] shows the experimental LOAEL variability of compounds occurring in both datasets (i.e. the *test* dataset) colored in red (experimental). This is the baseline reference for the comparison with predicted values.
 
+##### LOAEL correlation between datasets
 
 ```{r echo=F}
 data <- read.csv("data/common-median.csv",header=T)
 cor <- cor.test(-log(data$mazzatorta),-log(data$swiss))
-median.r.square <- round(cor(-log(data$mazzatorta),-log(data$swiss),use='complete')^2,2)
-median.rmse <- round(sqrt(mean((-log(data$mazzatorta)+log(data$swiss))^2)),2)
+median.p <- cor$p.value
+median.r.square <- round(rsquare(-log(data$mazzatorta),-log(data$swiss)),2)
+median.rmse <- round(rmse(-log(data$mazzatorta),-log(data$swiss)),2)
 ``` 
 
+[@fig:corr] depicts the correlation between LOAEL values from both datasets. As both datasets contain duplicates we are using medians for the correlation plot and statistics. Please note that the aggregation of duplicated measurements into a single value hides a substantial portion of the real experimental variability.
 Correlation analysis shows a
-significant correlation (p-value < 2.2e-16) with r\^2: `r round(median.r.square,2)`, RMSE: `r round(median.rmse,2)`
+significant (p-value < 2.2e-16) correlation between the experimental data in both datasets with r\^2: `r round(median.r.square,2)`, RMSE: `r round(median.rmse,2)`
 
 ### Local QSAR models
 
-Christoph
+In order to compare the perfomance of in silico models with experimental variability we are using compounds that occur in both datasets as a test set (`r  length(t$SMILES)` measurements, `r  length(unique(t$SMILES))` compounds).
 
-In order to compare the perfomance of in silico models with experimental variability we are using compounds that occur in both datasets as a test set (`r  system("./nr_compounds.sh data/common-test.csv",TRUE)` compounds, `r  system("./nr_measurements.sh data/common-test.csv",TRUE)` measurements).
+The Mazzatorta, the Swiss Federal Office dataset and a combined dataset were used as training data for building `lazar` read across models. Predictions for the test set compounds were made after eliminating all information from the test compound from the corresponding training dataset. [@fig:comp] summarizes the results:
 
-The Mazzatorta, the Swiss Federal Office dataset and a combined dataset were used as training data. Predictions for the test set compounds were made after eliminating all information from the test compound from the corresponding training dataset. [@tbl:common-pred] summarizes the results:
-
-![Comparison of experimental with predicted LOAEL values, each vertical line represents a compound.](figure/test-prediction.pdf){#fig:comp}
+![Comparison of experimental with predicted LOAEL values, each vertical line represents a compound, dots are individual measurements (red) or predictions (green).](figure/test-prediction.pdf){#fig:comp}
 
 ```{r echo=F}
-source("test-correlation.R")
+mazzatorta = read.csv("data/mazzatorta-test-predictions.csv",header=T)
+swiss = read.csv("data/swiss-test-predictions.csv",header=T)
+combined = read.csv("data/combined-test-predictions.csv",header=T)
+
+mazzatorta.r_square = round(rsquare(-log(mazzatorta$LOAEL_measured_median),-log(mazzatorta$LOAEL_predicted)),2)
+mazzatorta.rmse = round(rmse(-log(mazzatorta$LOAEL_measured_median),-log(mazzatorta$LOAEL_predicted)),2)
+swiss.r_square = round(rsquare(-log(swiss$LOAEL_measured_median),-log(swiss$LOAEL_predicted)),2)
+swiss.rmse = round(rmse(-log(swiss$LOAEL_measured_median),-log(swiss$LOAEL_predicted)),2)
+combined.r_square = round(rsquare(-log(combined$LOAEL_measured_median),-log(combined$LOAEL_predicted)),2)
+combined.rmse = round(rmse(-log(combined$LOAEL_measured_median),-log(combined$LOAEL_predicted)),2)
 ```
 
-Training data | $r^2$                     | RMSE
+TODO: nr unpredicted, nr predictions outside of experimental values
+
+Correlation analysis has been perfomed between individual predictions and the median of exprimental data.
+All correlations are statistically highly significant with a p-value < 2.2e-16.
+These results are presented in [@fig:corr] and [@tbl:cv]. Please bear in mind that the aggregation of experimental data into a single value actually hides experimental variability.
+
+Training data | $r^2$                     | RMSE                    
 --------------|---------------------------|-------------------------
-Experimental | `r median.r.square`      | `r median.rmse`
-Mazzatorta | `r mazzatorta.r_square`      | `r mazzatorta.rmse`
-Swiss Federal Office |`r swiss.r_square`  | `r swiss.rmse`
-Combined             | `r combined.r_square` | `r combined.rmse`
+Experimental | `r median.r.square`      | `r median.rmse`           
+Mazzatorta | `r mazzatorta.r_square`      | `r mazzatorta.rmse` 
+Swiss Federal Office |`r swiss.r_square`  | `r swiss.rmse` 
+Combined             | `r combined.r_square` | `r combined.rmse` 
 
 : Comparison of model predictions with experimental variability. {#tbl:common-pred}
 
+![Correlation of experimental with predicted LOAEL values (test set)](figure/test-correlation.pdf){#fig:corr}
+
 ```{r echo=F}
-source("crossvalidation.R")
+mazzatorta = read.csv("data/mazzatorta-cv.csv",header=T)
+swiss = read.csv("data/swiss-cv.csv",header=T)
+combined = read.csv("data/combined-cv.csv",header=T)
+
+cv.mazzatorta.r_square = round(rsquare(-log(mazzatorta$LOAEL_measured_median),-log(mazzatorta$LOAEL_predicted)),2)
+cv.mazzatorta.rmse = round(rmse(-log(mazzatorta$LOAEL_measured_median),-log(mazzatorta$LOAEL_predicted)),2)
+cv.swiss.r_square = round(rsquare(-log(swiss$LOAEL_measured_median),-log(swiss$LOAEL_predicted)),2)
+cv.swiss.rmse = round(rmse(-log(swiss$LOAEL_measured_median),-log(swiss$LOAEL_predicted)),2)
+cv.combined.r_square = round(rsquare(-log(combined$LOAEL_measured_median),-log(combined$LOAEL_predicted)),2)
+cv.combined.rmse = round(rmse(-log(combined$LOAEL_measured_median),-log(combined$LOAEL_predicted)),2)
 ```
-Traditional 10-fold cross-validation results are summarised in [@tbl:cv]:
+TODO: repeated CV
+
+Traditional 10-fold cross-validation results are summarised in [@tbl:cv] and [@fig:cv].
+All correlations are statistically highly significant with a p-value < 2.2e-16.
 
 Training dataset | $r^2$ | RMSE 
 -----------------|-------|------
-Mazzatorta | `r round(cv.mazzatorta.r_square,2)`  | `r round(cv.mazzatorta.rmse,2)`
-Swiss Federal Office | `r round(cv.swiss.r_square,2)`  | `r round(cv.swiss.rmse,2)`
-Combined | `r round(cv.combined.r_square,2)`  | `r round(cv.combined.rmse,2)`
+Mazzatorta | `r round(cv.mazzatorta.r_square,2)`  | `r round(cv.mazzatorta.rmse,2)` 
+Swiss Federal Office | `r round(cv.swiss.r_square,2)`  | `r round(cv.swiss.rmse,2)` 
+Combined | `r round(cv.combined.r_square,2)`  | `r round(cv.combined.rmse,2)` 
 
 : 10-fold crossvalidation results {#tbl:cv}
 
-![Correlation of experimental with predicted LOAEL values (test set)](figure/test-correlation.pdf){}
 
-![Correlation of experimental with predicted LOAEL values (10-fold crossvalidation)](figure/crossvalidation.pdf){}
+![Correlation of experimental with predicted LOAEL values (10-fold crossvalidation)](figure/crossvalidation.pdf){#fig:cv}
 
 
 Discussion
 ==========
 
-### Elena + Benoit
+Elena + Benoit
 
-### 
+- both datasets are structurally similar
+- LOAEL values have similar variability in both datasets
+- the Mazzatorta dataset has a small portion of very toxic compounds (low LOAEL, high -log10(LOAEL))
+- lazar read across predictions fall within the experimental variability of LOAEL values
+- predictions are slightly less accurate at extreme (high/low) LOAEL values, this can be explained by the algorithms used
+- the original Mazzatorta paper has "better" results (R^2 0.54, RMSE 0.7) , but the model is likely to be overfitted (using genetic algorithms for feature selection *prior* to crossvalidation must lead to overfitted models)
+- beware of over-optimisations and the race for "better" validation results
 
 Summary
 =======
diff --git a/paper/loael.md b/paper/loael.md
index 69ee1ff..0df226a 100644
--- a/paper/loael.md
+++ b/paper/loael.md
@@ -10,15 +10,20 @@ abstract: " "
 documentclass: achemso
 bibliography: references.bibtex
 bibliographystyle: achemso
+figPrefix: Figure
+eqnPrefix: Equation
+tblPrefix: Table
 output:
   pdf_document:
     fig_caption: yes
 ...
 
+
+
 Introduction
 ============
 
-Christoph + Elena + Benoit
+Elena + Benoit
 
 The quality and reproducibility of (Q)SAR and  read-across predictions is a controversial topic in the toxicological risk-assessment community. Although model predictions can be validated with various procedures it is rarely possible to put the results into the context of experimental variability, because replicate experiments are rarely available.
 
@@ -75,11 +80,15 @@ for 381 unique chemical structures.
 
 ### Preprocessing
 
-Chemical structures in both datasets were initially represented as SMILES strings
-[@doi:10.1021/ci00057a005]. Syntactically incorrect and missing SMILES were
-generated from other identifiers (e.g names, CAS numbers). Unique smiles from the OpenBabel library [@OBoyle2011] were used for the identification of duplicated structures. 
+Chemical structures in both datasets were initially represented as SMILES
+strings [@doi:10.1021/ci00057a005]. Syntactically incorrect and missing SMILES
+were generated from other identifiers (e.g names, CAS numbers). Unique smiles
+from the OpenBabel library [@OBoyle2011] were used for the identification of
+duplicated structures. 
 
-Studies with undefined or empty LOAEL entries were removed from the datasets. LOAEL values were converted to mmol/kg_bw/day units. For prediction, validation and visualisation purposes -log10 transformations are used.
+Studies with undefined or empty LOAEL entries were removed from the datasets.
+LOAEL values were converted to mmol/kg_bw/day units. For prediction, validation
+and visualisation purposes -log10 transformations are used.
 
 David: please check if we have missed something
 
@@ -87,11 +96,16 @@ David: please check if we have missed something
 
 Two derived datasets were obtained from the original datasets: 
 
-The *test* dataset contains data of compounds that occur in both datasets. Exact duplications of LOAEL values were removed, because it is very likely that they originate from the same study. 
-The test dataset has 391 LOAEL values for 155 unique chemical structures.
+The *test* dataset contains data of compounds that occur in both datasets.
+Exact duplications of LOAEL values were removed, because it is very likely that
+they originate from the same study.  The test dataset has 391
+LOAEL values for 155 unique chemical structures.
 
-The *combined* dataset is the union of the Mazzatorta and the Swiss Federal Office dataset and it is used to build predictive models. Exact LOAEL duplications were removed, as for the test dataset.
-The combined dataset has 1014 LOAEL values for 671 unique chemical structures.
+The *combined* dataset is the union of the Mazzatorta and the Swiss Federal
+Office dataset and it is used to build predictive models. Exact LOAEL
+duplications were removed, as for the test dataset.  The combined dataset has
+1014 LOAEL values for 671 unique
+chemical structures.
 
 Algorithms
 ----------
@@ -121,19 +135,25 @@ algorithms:
 
 ### Neighbor identification
 
-Similarity calculations are based on MolPrint2D fingerprints [@doi:10.1021/ci034207y] from the OpenBabel chemoinformatics library [@OBoyle2011].
-
-The MolPrint2D fingerprint uses atom environments as molecular
-representation, which resemble basically the chemical concept of
-functional groups. For each atom in a molecule it represents the
-chemical environment using the atom types of connected atoms.
-
-MolPrint2D fingerprints are generated dynamically from chemical structures and do not rely on predefined lists of fragments (such as OpenBabel FP3, FP4 or MACCs fingerprints or lists of toxocophores/toxicophobes). This has the advantage the they may capture substructures of toxicological relevance that are not included in other fingerprints.
-Preliminary experiments have shown that predictions with MolPrint2D
-fingerprints are indeed more accurate than other OpenBabel fingerprints.
-
-From MolPrint2D fingerprints we can construct a feature vector with all
-atom environments of a compound, which can be used to calculate chemical
+Similarity calculations are based on MolPrint2D fingerprints
+[@doi:10.1021/ci034207y] from the OpenBabel chemoinformatics library
+[@OBoyle2011].
+
+The MolPrint2D fingerprint uses atom environments as molecular representation,
+which resemble basically the chemical concept of functional groups. For each
+atom in a molecule it represents the chemical environment using the atom types
+of connected atoms.
+
+MolPrint2D fingerprints are generated dynamically from chemical structures and
+do not rely on predefined lists of fragments (such as OpenBabel FP3, FP4 or
+MACCs fingerprints or lists of toxocophores/toxicophobes). This has the
+advantage the they may capture substructures of toxicological relevance that
+are not included in other fingerprints.  Preliminary experiments have shown
+that predictions with MolPrint2D fingerprints are indeed more accurate than
+other OpenBabel fingerprints.
+
+From MolPrint2D fingerprints we can construct a feature vector with all atom
+environments of a compound, which can be used to calculate chemical
 similarities.
 
 [//]: # https://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format
@@ -144,31 +164,44 @@ total number of atom environments $A \cup B$ (Jaccard/Tanimoto index, [@eq:jacca
 
 $$ sim = \frac{|A \cap B|}{|A \cup B|} $$ {#eq:jaccard}
 
-A threshold of $sim < 0.1$ is used for the identification of neighbors for local QSAR models.
-Compounds with the same structure as the query structure are eliminated from the neighbors to obtain an unbiased prediction.
+A threshold of $sim < 0.1$ is used for the identification of neighbors for
+local QSAR models.  Compounds with the same structure as the query structure
+are eliminated from the neighbors to obtain an unbiased prediction.
 
 ### Local QSAR models and predictions
 
-Only similar compounds (*neighbors*) are used for local QSAR models.
-In this investigation we are using a weighted partial least squares regression (PLS) algorithm for the prediction of quantitative properties.
-First all fingerprint features with identical values across all neighbors are removed.
-The reamining set of features is used as descriptors for creating a local weighted PLS model with atom environments as descriptors and model similarities as weights. The `plsr` function of the `pls` R package [@pls] is used for this purpose.
-Finally the local PLS model is applied to predict the activity of the query compound.
+Only similar compounds (*neighbors*) are used for local QSAR models.  In this
+investigation we are using a weighted partial least squares regression (PLS)
+algorithm for the prediction of quantitative properties.  First all fingerprint
+features with identical values across all neighbors are removed.  The reamining
+set of features is used as descriptors for creating a local weighted PLS model
+with atom environments as descriptors and model similarities as weights. The
+`plsr` function of the `pls` R package [@pls] is used for this purpose.
+Finally the local PLS model is applied to predict the activity of the query
+compound.
 
-If PLS modelling or prediction fails, the program resorts to using the weighted mean of the
-neighbors LOAEL values, where the contribution of each neighbor is
+If PLS modelling or prediction fails, the program resorts to using the weighted
+mean of the neighbors LOAEL values, where the contribution of each neighbor is
 weighted by its similarity to the query compound.
 
-### Validation
+### Applicability domain
+
+Christoph: TODO
 
-Two types of validations are used within this study:
+### Validation
 
-For the comparison of experimental variability with predictive accuracies we are using a test set of compounds that occur in both datasets. The *Mazzatorta*, *Swiss Federal Office* and *combined* datasets are used as training data for read across predictions. In order to obtain unbiased predictions *all* information from the test compound is removed from the training set prior to predictions. This is hardcoded into the prediction algorithm in order to prevent validation errors.
+For the comparison of experimental variability with predictive accuracies we
+are using a test set of compounds that occur in both datasets. The
+*Mazzatorta*, *Swiss Federal Office* and *combined* datasets are used as
+training data for read across predictions. In order to obtain unbiased
+predictions *all* information from the test compound is removed from the
+training set prior to predictions. This procedure is hardcoded into the
+prediction algorithm in order to prevent validation errors.  Traditional
+10-fold crossvalidation results are provided as additional information for all
+three models. 
 
 TODO: treatment of duplicates
 
-In addition traditional 10-fold crossvalidation results are provided. 
-
 Christoph: check if these specifications have changed at submission
 
 Results
@@ -176,129 +209,148 @@ Results
 
 ### Dataset comparison
 
-Christoph + Elena
+Elena
 
 The main objective of this section is to compare the content of both
 databases in terms of structural composition and LOAEL values, to
 estimate the experimental variability of LOAEL values and to establish a
 baseline for evaluating prediction performance.
 
-#### Applicability domain
 
 ##### Ches-Mapper analysis
 
-Martin
-
-CheS-Mapper (Chemical Space Mapping and Visualization in 3D,
-http://ches-mapper.org/, @Gütlein2012) can be
-used to analyze the relationship between the structure of chemical
-compounds, their physico-chemical properties, and biological or toxic
-effects. CheS-Mapper embeds a dataset into 3D space, such that compounds
-with similar feature values are close to each other. The following two
-screenshots visualise the comparison. The datasets are embeded into 3D
-Space based on structural fragments from three Smart list (OpenBabel
-FP3, OpenBabel FP4 and OpenBabel MACCS).
+We applied the visualization tool CheS-Mapper (Chemical Space Mapping and Visualization in 3D,
+http://ches-mapper.org, @Gütlein2012) to compare both datasets. CheS-Mapper can be used to analyze the relationship between the structure of chemical compounds, their physico-chemical properties, and biological or toxic effects. It embeds a dataset into 3D space, such that compounds with similar feature values are close to each other. CheS-Mapper is generic and can be employed with different kinds of features. [@fig:ches-mapper-pc] shows an embedding that is based on physico-chemical (PC) descriptors: we determined that both datasets have very similar PC feature values.
+
+![Compounds from the Mazzatorta and the Swiss dataset are highlighted in red and green. Compounds that occur in both datasets are highlighted in magenta. In this example, CheS-Mapper applied a principal components analysis to map compounds according to their physico-chemical (PC) feature values into 3D space. Both datasets have in general similar PC feature values. As an exception, the Mazzatorta dataset includes most of the tiny compound structures: we have selected the 78 smallest compounds (with 10 atoms and less, marked with a blue box in the screen-shot) and found that 61 of these compounds occur in the Mazzatorta dataset, whereas only 19 are contained in the Swiss dataset (p-value 3.7E-7).](figure/pc-small-compounds-highlighted.png){#fig:ches-mapper-pc}
+
+We extended CheS-Mapper with a functionality to mine the same MolPrint2D features that are utilized for model building in this work. Applying a minimum frequency of 3 yields 760 distinguished MolPrint2D fragments for the composed dataset of 671 unique compounds. Again, a visual inspection confirmed that both datasets are structurally very similar. However, CheS-Mapper allows the detection of features that help to distinguish groups of selected compounds from the entire dataset. Hence, we found discriminating features for compounds that occur in only one of both datasets, and for the most active or in-active compounds (see [@tbl:molprint]). As an example, [@fig:ches-mapper-alert] shows 9 compounds that match a specific fragment (all other compounds in the dataset do not match this fragment) and have very low mean LOAEL values.
+
+| Selection  | Num selected compounds | Feature name                           | Human-readable feature name          | Feature values entire dataset | Feature values in selection  | P-Value               |
+|------------|------------------------|----------------------------------------|--------------------------------------|-------------------------------|------------------------------|-----------------------|
+|            |                        |                                        |                                      |                               |                              |                       |
+| Mazzatorta | 290                    | 8;1-1-3;2-2-3;                         | O.3 1:C.ar 2:2xC.ar                  | 643× 'no-match', 28× 'match'  | 265× 'no-match', 25× 'match' | 0.005560996217776615  |
+| Mazzatorta | 290                    | 9;1-1-2;2-1-3;2-1-28;                  | O 1:C.2 2:C.ar,N.am                  | 629× 'no-match', 42× 'match'  | 284× 'no-match', 6× 'match'  | 0.006195320799272208  |
+| Mazzatorta | 290                    | 15;1-1-3;2-2-3;                        | Cl 1:C.ar 2:2xC.ar                   | 504× 'no-match', 167× 'match' | 240× 'no-match', 50× 'match' | 0.009255119323774763  |
+|            |                        |                                        |                                      |                               |                              |                       |
+| Swiss      | 226                    | 16;1-1-3;2-2-3;                        | F 1:C.ar 2:2xC.ar                    | 630× 'no-match', 41× 'match'  | 199× 'no-match', 27× 'match' | 0.004142648833225349  |
+| Swiss      | 226                    | 8;1-1-3;2-2-3;                         | O.3 1:C.ar 2:2xC.ar                  | 643× 'no-match', 28× 'match'  | 225× 'no-match', 1× 'match'  | 0.006101869043914521  |
+|            |                        |                                        |                                      |                               |                              |                       |
+| low10      | 67                     | 1;1-1-8;2-1-12;                        | C 1:O.3 2:P.3                        | 642× 'no-match', 29× 'match'  | 52× 'no-match', 15× 'match'  | 2.599701232064433E-9  |
+| low10      | 67                     | 15;1-1-1;2-2-1;2-1-15;                 | Cl 1:C 2:2xC,Cl                      | 662× 'no-match', 9× 'match'   | 59× 'no-match', 8× 'match'   | 3.499196354894707E-8  |
+| low10      | 67                     | 1;1-1-1;1-1-8;2-1-12;                  | C 1:C,O.3 2:P.3                      | 645× 'no-match', 26× 'match'  | 54× 'no-match', 13× 'match'  | 6.053371437442223E-8  |
+| low10      | 67                     | 2;1-1-1;1-1-2;2-3-1;                   | C.2 1:C,C.2 2:3xC                    | 663× 'no-match', 8× 'match'   | 61× 'no-match', 6× 'match'   | 8.936801443204523E-6  |
+| low10      | 67                     | 2;1-1-1;1-1-2;1-1-15;2-3-1;2-2-15;     | C.2 1:C,C.2,Cl 2:3xC,2xCl            | 665× 'no-match', 6× 'match'   | 62× 'no-match', 5× 'match'   | 2.3279183652191726E-5 |
+|            |                        |                                        |                                      |                               |                              |                       |
+| high10     | 67                     | 8;1-1-3;2-2-3;                         | O.3 1:C.ar 2:2xC.ar                  | 643× 'no-match', 28× 'match'  | 57× 'no-match', 10× 'match'  | 1.4617532950766954E-4 |
+| high10     | 67                     | 3;1-2-3;2-1-1;2-2-3;                   | C.ar 1:2xC.ar 2:C,2xC.ar             | 506× 'no-match', 165× 'match' | 64× 'no-match', 3× 'match'   | 1.8132445228380423E-4 |
+| high10     | 67                     | 1;1-1-1;1-1-2;2-1-1;2-1-8;2-1-9;       | C 1:C,C.2 2:C,O.3,O                  | 668× 'no-match', 3× 'match'   | 64× 'no-match', 3× 'match'   | 4.598209084156757E-4  |
+| high10     | 67                     | 1;1-2-1;1-1-8;2-1-1;2-2-8;             | C 1:2xC,O.3 2:C,2xO.3                | 668× 'no-match', 3× 'match'   | 64× 'no-match', 3× 'match'   | 4.598209084156757E-4  |
+| high10     | 67                     | 3;1-1-2;1-2-3;2-1-2;2-2-3;2-1-8;2-1-9; | C.ar 1:C.2,2xC.ar 2:C.2,2xC.ar,O.3,O | 662× 'no-match', 9× 'match'   | 62× 'no-match', 5× 'match'   | 4.613813663041366E-4  |
+
+: Significant MolPrint2D features. The listed features help to distinguish a selection of compounds from the entire dataset (of 671 compounds). We selected compounds that occur in only one of the two datasets, or the top 10 percent of all compounds with the lowest/highest LOAEL values (the mean LOAEL value was computed when a compound occurs in both dataset or was measured multiple times). For each set of selected compounds, we listed the top five most significant fragments with p-value < 0.01 (if available, otherwise less fragments). The MolPrint2D fragments are circular fragments that consist of a center atom, and to layers of neighboring atoms. {#tbl:molprint}
+
+![A CheS-Mapper screen-shot showing 9 compounds that match the MolPrint2D fragment 15;1-1-1;2-2-1;2-1-15; (as SMILES syntax: ClC(C)Cl). Apart from the selected compound (blue box), the other 8 compounds belong to the top 10 percent of compounds with the lowest LOAEL values. I.e., this feature can be regarded as a structural alert in our dataset, as it is matched by only 9 compounds in the entire dataset and 8 of these compounds are highly active.](figure/matching-ClC(C)Cl.png){#fig:ches-mapper-alert}
 
 ##### Distribution of functional groups
 
-Christoph
 
-[@fig:fg] shows the frequency of selected functional groups in both
-datasets. A complete table for 138 functional groups from OpenBabel FP4
-fingerprints can be found in the appendix.
+
+In order to confirm the results of CheS-Mapper analysis we have evaluated the
+frequency of 141 functional groups from the OpenBabel FP4
+fingerprint.  [@fig:fg] shows the frequency of selected functional groups in
+both datasets. The complete table for all functional groups can be found in the
+data directory of the supplemental material (`data/functional-groups.csv`).
 
 ![Frequency of functional groups.](figure/functional-groups.pdf){#fig:fg}
 
 ### Experimental variability versus prediction uncertainty 
 
-
-
-Christoph
-
 Duplicated LOAEL values can be found in both datasets and there is a
-substantial overlap of compounds, with LOAEL values in both datasets.
+substantial number of 155 compounds occurring in both
+datasets.  These duplicates allow us to estimate the variability of
+experimental results within individual datasets and between datasets.
 
 ##### Intra dataset variability
 
-TODO: read data from files
 
-The Mazzatorta dataset has 562 LOAEL values with 439 unique structures,
-the Swiss Federal Office dataset has 493 rat LOAEL values with 381
-unique structures. [@fig:intra] shows the intra-dataset variability, where
-each vertical line represents a single compound and each dot represents
-an individual LOAEL value. The experimental variance of LOAEL values is
-similar in both datasets (p-value: 0.48).
 
-[//]: # p-value: 0.4750771581019402
+The Mazzatorta dataset has 567 LOAEL values for 445 unique structures, 93 compounds have multiple measurements with an average variance of 0.19 log10 units [@fig:intra]. 
 
-[//]: # ![Intra dataset variability: Each vertical line represents a compound, dots are individual LOAEL values.](loael-dataset-comparison-all-compounds.pdf){#fig:intra}
+The Swiss Federal Office dataset has 493 rat LOAEL values for 381 unique structures, 91 compounds have multiple measurements with a similar variance (average 0.15 log10 units). Variances of both datasets do not show a statistically significant difference with a
+p-value (t-test) of 0.25.
 
-##### Inter dataset variability
+![Variability of LOAEL values in both datasets: Each vertical line represents a compound, dots are individual LOAEL values.](figure/dataset-variability.pdf){#fig:intra}
 
-[@fig:inter] shows the same situation for the combination of the Mazzatorta
-and Swiss Federal Office datasets. Obviously the experimental
-variability is larger than for individual datasets.
+##### Inter dataset variability
 
-[//]: # ![Inter dataset variability](loael-dataset-comparison-common-compounds.pdf){#fig:inter}
+[@fig:comp] shows the experimental LOAEL variability of compounds occurring in both datasets (i.e. the *test* dataset) colored in red (experimental). This is the baseline reference for the comparison with predicted values.
 
 ##### LOAEL correlation between datasets
 
-[@fig:corr] depicts the correlation between LOAEL data from both datasets
-(using means for multiple measurements).
-Identical values were removed from analysis.
-
-[//]: #   MAE: 0.801626064534318
-[//]: # with identical values
-
-
 
 
+[@fig:corr] depicts the correlation between LOAEL values from both datasets. As both datasets contain duplicates we are using medians for the correlation plot and statistics. Please note that the aggregation of duplicated measurements into a single value hides a substantial portion of the real experimental variability.
 Correlation analysis shows a
-significant correlation (p-value < 2.2e-16) with r\^2: 0.58, RMSE: 1.3
+significant (p-value < 2.2e-16) correlation between the experimental data in both datasets with r\^2: 0.58, RMSE: 1.3
 
 ### Local QSAR models
 
-Christoph
+In order to compare the perfomance of in silico models with experimental variability we are using compounds that occur in both datasets as a test set (391 measurements, 155 compounds).
 
-In order to compare the perfomance of in silico models with experimental variability we are using compounds that occur in both datasets as a test set (155 compounds, -1 measurements).
+The Mazzatorta, the Swiss Federal Office dataset and a combined dataset were used as training data for building `lazar` read across models. Predictions for the test set compounds were made after eliminating all information from the test compound from the corresponding training dataset. [@fig:comp] summarizes the results:
 
-The Mazzatorta, the Swiss Federal Office dataset and a combined dataset were used as training data. Predictions for the test set compounds were made after eliminating all information from the test compound from the corresponding training dataset. [@tbl:common-pred] summarizes the results:
+![Comparison of experimental with predicted LOAEL values, each vertical line represents a compound, dots are individual measurements (red) or predictions (green).](figure/test-prediction.pdf){#fig:comp}
 
-![Comparison of experimental with predicted LOAEL values, each vertical line represents a compound.](figure/test-prediction.pdf){#fig:comp}
 
 
+TODO: nr unpredicted, nr predictions outside of experimental values
 
-Training data | $r^2$                     | RMSE
+Correlation analysis has been perfomed between individual predictions and the median of exprimental data.
+All correlations are statistically highly significant with a p-value < 2.2e-16.
+These results are presented in [@fig:corr] and [@tbl:cv]. Please bear in mind that the aggregation of experimental data into a single value actually hides experimental variability.
+
+Training data | $r^2$                     | RMSE                    
 --------------|---------------------------|-------------------------
-Experimental | 0.58      | 1.3
-Mazzatorta | 0.38      | 1.49
-Swiss Federal Office |0.38  | 1.47
-Combined             | 0.38 | 1.47
+Experimental | 0.58      | 1.3           
+Mazzatorta | 0.38      | 1.49 
+Swiss Federal Office |0.38  | 1.47 
+Combined             | 0.38 | 1.47 
 
 : Comparison of model predictions with experimental variability. {#tbl:common-pred}
 
+![Correlation of experimental with predicted LOAEL values (test set)](figure/test-correlation.pdf){#fig:corr}
+
+
+TODO: repeated CV
 
-Traditional 10-fold cross-validation results are summarised in [@tbl:cv]:
+Traditional 10-fold cross-validation results are summarised in [@tbl:cv] and [@fig:cv].
+All correlations are statistically highly significant with a p-value < 2.2e-16.
 
 Training dataset | $r^2$ | RMSE 
 -----------------|-------|------
-Mazzatorta | 0.38  | 2.01
-Swiss Federal Office | 0.3  | 1.67
-Combined | 0.38  | 1.81
+Mazzatorta | 0.38  | 2.01 
+Swiss Federal Office | 0.3  | 1.67 
+Combined | 0.38  | 1.81 
 
 : 10-fold crossvalidation results {#tbl:cv}
 
-![Correlation of experimental with predicted LOAEL values (test set)](figure/test-correlation.pdf){}
 
-![Correlation of experimental with predicted LOAEL values (10-fold crossvalidation)](figure/crossvalidation.pdf){}
+![Correlation of experimental with predicted LOAEL values (10-fold crossvalidation)](figure/crossvalidation.pdf){#fig:cv}
 
 
 Discussion
 ==========
 
-### Elena + Benoit
+Elena + Benoit
 
-### 
+- both datasets are structurally similar
+- LOAEL values have similar variability in both datasets
+- the Mazzatorta dataset has a small portion of very toxic compounds (low LOAEL, high -log10(LOAEL))
+- lazar read across predictions fall within the experimental variability of LOAEL values
+- predictions are slightly less accurate at extreme (high/low) LOAEL values, this can be explained by the algorithms used
+- the original Mazzatorta paper has "better" results (R^2 0.54, RMSE 0.7) , but the model is likely to be overfitted (using genetic algorithms for feature selection *prior* to crossvalidation must lead to overfitted models)
+- beware of over-optimisations and the race for "better" validation results
 
 Summary
 =======
diff --git a/paper/loael.pdf b/paper/loael.pdf
index cef90e5..7560456 100644
--- a/paper/loael.pdf
+++ b/paper/loael.pdf
diff --git a/paper/rmse.rb b/paper/rmse.rb
deleted file mode 100644
index 90245eb..0000000
--- a/paper/rmse.rb
+++ /dev/null
@@ -1,63 +0,0 @@
-require_relative '../../lazar/lib/lazar'
-include OpenTox
-
-old = Dataset.from_csv_file File.join(File.dirname(__FILE__),"..","regression","LOAEL_mg_corrected_smiles_mmol.csv")
-new = Dataset.from_csv_file File.join(File.dirname(__FILE__),"..","regression","swissRat_chron_LOAEL_mmol.csv")
-
-File.open("rmse.R","w+") do |result|
-
-  names = {old => "mazzatorta",new  => "swiss"} 
-
-  [old,new].each do |dataset|
-    rmse = 0
-    nr = 0
-    dataset.compound_ids.each do |cid|
-      c = Compound.find cid
-      values = dataset.values(c,dataset.features.first)
-      if values.size > 1
-        median = -Math.log(values.mean) 
-        values.each do |v|
-          rmse += (-Math.log(v) - median)**2
-          nr += 1
-        end
-      end
-    end
-    rmse = Math.sqrt(rmse/nr)
-    result.puts "#{names[dataset]}.rmse <- #{rmse}"
-  end
-
-  rmse = 0
-  nr = 0
-  (old.compound_ids & new.compound_ids).each do |cid|
-    c = Compound.find cid
-    values = old.values(c,old.features.first) + new.values(c,new.features.first)
-    if values.size > 1
-      median = -Math.log(values.mean) 
-      values.each do |v|
-        rmse += (-Math.log(v) - median)**2
-        nr += 1
-      end
-    end
-  end
-  rmse = Math.sqrt(rmse/nr)
-  result.puts "common.rmse <- #{rmse}"
-
-  rmse = 0
-  nr = 0
-  (old.compound_ids + new.compound_ids).uniq.each do |cid|
-    c = Compound.find cid
-    values = old.values(c,old.features.first) + new.values(c,new.features.first)
-    if values.size > 1
-      median = -Math.log(values.mean) 
-      values.each do |v|
-        rmse += (-Math.log(v) - median)**2
-        nr += 1
-      end
-    end
-  end
-  rmse = Math.sqrt(rmse/nr)
-  result.puts "combined.rmse <- #{rmse}"
-end
-
-#combined_rmse = Math.sqrt(combined_rmse/combined_nr)
-#p "combined: #{combined_rmse}"
diff --git a/paper/test-correlation.R b/paper/test-correlation.R
deleted file mode 100644
index 99d113a..0000000
--- a/paper/test-correlation.R
+++ /dev/null
@@ -1,15 +0,0 @@
-mazzatorta = read.csv("data/mazzatorta-test-predictions.csv",header=T)
-swiss = read.csv("data/swiss-test-predictions.csv",header=T)
-combined = read.csv("data/combined-test-predictions.csv",header=T)
-
-mazzatorta.p = round(cor.test(-log(mazzatorta$LOAEL_measured_median),-log(mazzatorta$LOAEL_predicted))$p.value,2)
-mazzatorta.r_square = round(cor(-log(mazzatorta$LOAEL_measured_median),-log(mazzatorta$LOAEL_predicted))^2,2)
-mazzatorta.rmse = round(sqrt(mean((-log(mazzatorta$LOAEL_measured_median)+log(mazzatorta$LOAEL_predicted))^2)),2)
-
-swiss.p = round(cor.test(-log(swiss$LOAEL_measured_median),-log(swiss$LOAEL_predicted))$p.value,2)
-swiss.r_square = round(cor(-log(swiss$LOAEL_measured_median),-log(swiss$LOAEL_predicted))^2,2)
-swiss.rmse = round(sqrt(mean((-log(swiss$LOAEL_measured_median)+log(swiss$LOAEL_predicted))^2)),2)
-
-combined.p = round(cor.test(-log(combined$LOAEL_measured_median),-log(combined$LOAEL_predicted))$p.value,2)
-combined.r_square = round(cor(-log(combined$LOAEL_measured_median),-log(combined$LOAEL_predicted))^2,2)
-combined.rmse = round(sqrt(mean((-log(combined$LOAEL_measured_median)+log(combined$LOAEL_predicted))^2)),2)
author	Christoph Helma <helma@in-silico.ch>	2016-02-23 12:15:27 +0100
committer	Christoph Helma <helma@in-silico.ch>	2016-02-23 12:15:27 +0100
commit	3c1e3821db9a7ffe813621872a8ba253a09de77c (patch)
tree	e01a1525d611581ef922f5ac8cb4d1f4445653ef
parent	4d67ddb9fe72cb4cba42e508a943e1d4d04fef8c (diff)