summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2017-12-20 16:16:40 +0100
committerChristoph Helma <helma@in-silico.ch>2017-12-20 16:16:40 +0100
commit0983a03eaf0df05b464a4f537947692825560908 (patch)
treee66e24d11f6636673f6ff909c1a5441f9a9fb949
parenta5a9144dd7eb4cb9455c5674325ce6e0cc17af61 (diff)
database names in figures
-rw-r--r--Makefile5
-rw-r--r--data/functional-groups-reduced4R.csv148
-rw-r--r--figures/dataset-variability.pdfbin10212 -> 10180 bytes
-rw-r--r--figures/functional-groups.pdfbin6376 -> 6359 bytes
-rw-r--r--figures/median-correlation.pdfbin5852 -> 5821 bytes
-rw-r--r--loael.Rmd12
-rw-r--r--loael.md14
-rw-r--r--loael.pdfbin472295 -> 428000 bytes
-rw-r--r--loael.tex41
-rwxr-xr-xscripts/dataset-variability.R3
-rwxr-xr-xscripts/functional-groups4R.rb5
-rwxr-xr-xscripts/median-correlation-plot.R2
12 files changed, 115 insertions, 115 deletions
diff --git a/Makefile b/Makefile
index 19cff1d..4da3cc1 100644
--- a/Makefile
+++ b/Makefile
@@ -23,7 +23,7 @@ loael.docx: loael.md
figures/functional-groups.pdf: data/functional-groups-reduced4R.csv
scripts/functional-groups.R
-figures/dataset-variability.pdf: data/mazzatorta_log10.csv data/swiss_log10.csv
+figures/dataset-variability.pdf: data/test_log10_database_fix.csv
scripts/dataset-variability.R
figures/crossvalidation0.pdf: data/training_log10-cv-0.csv
@@ -90,6 +90,9 @@ data/median-correlation.csv: data/mazzatorta_log10.csv data/swiss_log10.csv
data/test_log10.csv: data/mazzatorta_log10.csv data/swiss_log10.csv
scripts/create-testset.rb
+data/test_log10_database_fix.csv: data/test_log10.csv # relabel databases for figures; the combined label must be substituted before its parts
+ sed -e 's/mazzatorta and swiss/Both/' -e 's/mazzatorta/Nestle/' -e 's/swiss/FSVO/' data/test_log10.csv > data/test_log10_database_fix.csv
+
# Training set
data/training_log10.csv: data/mazzatorta_log10.csv data/swiss_log10.csv
scripts/create-trainingset.rb
diff --git a/data/functional-groups-reduced4R.csv b/data/functional-groups-reduced4R.csv
index 51d6ec8..1cfaf87 100644
--- a/data/functional-groups-reduced4R.csv
+++ b/data/functional-groups-reduced4R.csv
@@ -1,74 +1,74 @@
-Alkene, 39, Mazzatorta
-Alkene, 30, Swiss Federal Office
-Alkylchloride, 71, Mazzatorta
-Alkylchloride, 41, Swiss Federal Office
-Alkylfluoride, 52, Mazzatorta
-Alkylfluoride, 74, Swiss Federal Office
-Alcohol, 44, Mazzatorta
-Alcohol, 27, Swiss Federal Office
-Dialkylether, 35, Mazzatorta
-Dialkylether, 32, Swiss Federal Office
-Alkylarylether, 62, Mazzatorta
-Alkylarylether, 90, Swiss Federal Office
-Diarylether, 35, Mazzatorta
-Diarylether, 47, Swiss Federal Office
-Amine, 66, Mazzatorta
-Amine, 41, Swiss Federal Office
-Primary arom amine, 26, Mazzatorta
-Primary arom amine, 15, Swiss Federal Office
-Ketone, 21, Mazzatorta
-Ketone, 25, Swiss Federal Office
-Chloroalkene, 39, Mazzatorta
-Chloroalkene, 21, Swiss Federal Office
-Carboxylic acid, 33, Mazzatorta
-Carboxylic acid, 38, Swiss Federal Office
-Carboxylic acid derivative, 215, Mazzatorta
-Carboxylic acid derivative, 227, Swiss Federal Office
-Amide, 38, Mazzatorta
-Amide, 60, Swiss Federal Office
-Secondary amide, 22, Mazzatorta
-Secondary amide, 36, Swiss Federal Office
-Imidolactone, 13, Mazzatorta
-Imidolactone, 32, Swiss Federal Office
-Nitrile, 35, Mazzatorta
-Nitrile, 39, Swiss Federal Office
-Vinylogous ester, 113, Mazzatorta
-Vinylogous ester, 120, Swiss Federal Office
-Vinylogous halide, 11, Mazzatorta
-Vinylogous halide, 27, Swiss Federal Office
-Carbonic acid derivatives, 109, Mazzatorta
-Carbonic acid derivatives, 131, Swiss Federal Office
-Urethan, 34, Mazzatorta
-Urethan, 35, Swiss Federal Office
-Phenol, 27, Mazzatorta
-Phenol, 9, Swiss Federal Office
-Arylchloride, 142, Mazzatorta
-Arylchloride, 163, Swiss Federal Office
-Arylfluoride, 22, Mazzatorta
-Arylfluoride, 41, Swiss Federal Office
-Oxoarene, 32, Mazzatorta
-Oxoarene, 29, Swiss Federal Office
-Heteroaromatic, 147, Mazzatorta
-Heteroaromatic, 205, Swiss Federal Office
-Nitro, 42, Mazzatorta
-Nitro, 31, Swiss Federal Office
-Sulfonic derivative, 24, Mazzatorta
-Sulfonic derivative, 29, Swiss Federal Office
-Sulfenic derivative, 48, Mazzatorta
-Sulfenic derivative, 34, Swiss Federal Office
-Phosphoric acid derivative, 70, Mazzatorta
-Phosphoric acid derivative, 44, Swiss Federal Office
-Aromatic, 402, Mazzatorta
-Aromatic, 396, Swiss Federal Office
-Heterocyclic, 228, Mazzatorta
-Heterocyclic, 272, Swiss Federal Office
-Trifluoromethyl, 44, Mazzatorta
-Trifluoromethyl, 63, Swiss Federal Office
-1-3-Tautomerizable, 265, Mazzatorta
-1-3-Tautomerizable, 296, Swiss Federal Office
-1-5-Tautomerizable, 124, Mazzatorta
-1-5-Tautomerizable, 148, Swiss Federal Office
-Michael acceptor, 33, Mazzatorta
-Michael acceptor, 33, Swiss Federal Office
-CH-acidic, 60, Mazzatorta
-CH-acidic, 73, Swiss Federal Office
+Alkene, 39, Nestle
+Alkene, 30, FSVO
+Alkylchloride, 71, Nestle
+Alkylchloride, 41, FSVO
+Alkylfluoride, 52, Nestle
+Alkylfluoride, 74, FSVO
+Alcohol, 44, Nestle
+Alcohol, 27, FSVO
+Dialkylether, 35, Nestle
+Dialkylether, 32, FSVO
+Alkylarylether, 62, Nestle
+Alkylarylether, 90, FSVO
+Diarylether, 35, Nestle
+Diarylether, 47, FSVO
+Amine, 66, Nestle
+Amine, 41, FSVO
+Primary arom amine, 26, Nestle
+Primary arom amine, 15, FSVO
+Ketone, 21, Nestle
+Ketone, 25, FSVO
+Chloroalkene, 39, Nestle
+Chloroalkene, 21, FSVO
+Carboxylic acid, 33, Nestle
+Carboxylic acid, 38, FSVO
+Carboxylic acid derivative, 215, Nestle
+Carboxylic acid derivative, 227, FSVO
+Amide, 38, Nestle
+Amide, 60, FSVO
+Secondary amide, 22, Nestle
+Secondary amide, 36, FSVO
+Imidolactone, 13, Nestle
+Imidolactone, 32, FSVO
+Nitrile, 35, Nestle
+Nitrile, 39, FSVO
+Vinylogous ester, 113, Nestle
+Vinylogous ester, 120, FSVO
+Vinylogous halide, 11, Nestle
+Vinylogous halide, 27, FSVO
+Carbonic acid derivatives, 109, Nestle
+Carbonic acid derivatives, 131, FSVO
+Urethan, 34, Nestle
+Urethan, 35, FSVO
+Phenol, 27, Nestle
+Phenol, 9, FSVO
+Arylchloride, 142, Nestle
+Arylchloride, 163, FSVO
+Arylfluoride, 22, Nestle
+Arylfluoride, 41, FSVO
+Oxoarene, 32, Nestle
+Oxoarene, 29, FSVO
+Heteroaromatic, 147, Nestle
+Heteroaromatic, 205, FSVO
+Nitro, 42, Nestle
+Nitro, 31, FSVO
+Sulfonic derivative, 24, Nestle
+Sulfonic derivative, 29, FSVO
+Sulfenic derivative, 48, Nestle
+Sulfenic derivative, 34, FSVO
+Phosphoric acid derivative, 70, Nestle
+Phosphoric acid derivative, 44, FSVO
+Aromatic, 402, Nestle
+Aromatic, 396, FSVO
+Heterocyclic, 228, Nestle
+Heterocyclic, 272, FSVO
+Trifluoromethyl, 44, Nestle
+Trifluoromethyl, 63, FSVO
+1-3-Tautomerizable, 265, Nestle
+1-3-Tautomerizable, 296, FSVO
+1-5-Tautomerizable, 124, Nestle
+1-5-Tautomerizable, 148, FSVO
+Michael acceptor, 33, Nestle
+Michael acceptor, 33, FSVO
+CH-acidic, 60, Nestle
+CH-acidic, 73, FSVO
diff --git a/figures/dataset-variability.pdf b/figures/dataset-variability.pdf
index 331ce23..5a4ddcc 100644
--- a/figures/dataset-variability.pdf
+++ b/figures/dataset-variability.pdf
Binary files differ
diff --git a/figures/functional-groups.pdf b/figures/functional-groups.pdf
index a44fa34..c0a7732 100644
--- a/figures/functional-groups.pdf
+++ b/figures/functional-groups.pdf
Binary files differ
diff --git a/figures/median-correlation.pdf b/figures/median-correlation.pdf
index 80a550b..74dacfb 100644
--- a/figures/median-correlation.pdf
+++ b/figures/median-correlation.pdf
Binary files differ
diff --git a/loael.Rmd b/loael.Rmd
index 3ff6146..3c4ae5e 100644
--- a/loael.Rmd
+++ b/loael.Rmd
@@ -20,7 +20,7 @@ abstract: |
better than random guessing, but the errors to be expected are higher and
a manual inspection of prediction results is highly recommended.
-documentclass: article
+documentclass: achemso
bibliography: references.bibtex
figPrefix: Figure
eqnPrefix: Equation
@@ -295,7 +295,7 @@ optimizing the number of RF components by bootstrap resampling.
Finally the local RF model is applied to [predict the
activity](https://github.com/opentox/lazar/blob/loael-paper.submission/lib/model.rb#L194-L272)
of the query compound. The RMSE of bootstrapped local model predictions is used
-to construct 95\% prediction intervals at 1.96*RMSE.
+to construct 95\% prediction intervals at 1.96*RMSE. The width of the prediction interval indicates the expected prediction accuracy. The "true" value of a prediction should lie within the prediction interval with 95\% probability.
If RF modelling or prediction fails, the program resorts to using the [weighted
mean](https://github.com/opentox/lazar/blob/loael-paper.submission/lib/regression.rb#L6-L16)
@@ -696,8 +696,7 @@ experimental variability [@LoPiparo2014]. In the present
study, a similar approach was applied to build models generating
quantitative predictions of long-term toxicity. Two databases compiling
chronic oral rat lowest adverse effect levels (LOAEL) as endpoint were
-available from different sources. <span id="dataset-comparison-1"
-class="anchor"></span>Our investigations clearly indicated that the
+available from different sources. Our investigations clearly indicated that the
Nestlé and FSVO databases are very similar in terms of chemical
structures and properties as well as distribution of experimental LOAEL
values. The only significant difference that we observed was that the
@@ -710,7 +709,7 @@ chemicals available in the training datasets had at least two
independent studies/LOAELs. These studies were exploited to generate
information on the reproducibility of chronic animal studies and were
used to evaluate prediction performance of the models in the context of
-experimental variability.Considerable variability in the experimental
+experimental variability. Considerable variability in the experimental
data was observed. Study design differences, including dose selection,
dose spacing and route of administration are likely explanation of
experimental variability. High experimental variability has an impact on
@@ -719,8 +718,7 @@ quality by introducing noise into the training data, secondly it
influences accuracy estimates because predictions have to be compared
against noisy data where "true" experimental values are unknown. This
will become obvious in the next section, where comparison of predictions
-with experimental data is discussed.<span id="lazar-predictions"
-class="anchor"></span>The data obtained in the present study indicate
+with experimental data is discussed. The data obtained in the present study indicate
that `lazar` generates reliable predictions for compounds within the
applicability domain of the training data (i.e. predictions without
warnings, which indicates a sufficient number of neighbors with
diff --git a/loael.md b/loael.md
index fe9eb27..0a1397b 100644
--- a/loael.md
+++ b/loael.md
@@ -20,7 +20,7 @@ abstract: |
better than random guessing, but the errors to be expected are higher and
a manual inspection of prediction results is highly recommended.
-documentclass: article
+documentclass: achemso
bibliography: references.bibtex
figPrefix: Figure
eqnPrefix: Equation
@@ -287,7 +287,7 @@ optimizing the number of RF components by bootstrap resampling.
Finally the local RF model is applied to [predict the
activity](https://github.com/opentox/lazar/blob/loael-paper.submission/lib/model.rb#L194-L272)
of the query compound. The RMSE of bootstrapped local model predictions is used
-to construct 95\% prediction intervals at 1.96*RMSE.
+to construct 95\% prediction intervals at 1.96*RMSE. The width of the prediction interval indicates the expected prediction accuracy. The "true" value of a prediction should lie within the prediction interval with 95\% probability.
If RF modelling or prediction fails, the program resorts to using the [weighted
mean](https://github.com/opentox/lazar/blob/loael-paper.submission/lib/regression.rb#L6-L16)
@@ -609,8 +609,7 @@ experimental variability [@LoPiparo2014]. In the present
study, a similar approach was applied to build models generating
quantitative predictions of long-term toxicity. Two databases compiling
chronic oral rat lowest adverse effect levels (LOAEL) as endpoint were
-available from different sources. <span id="dataset-comparison-1"
-class="anchor"></span>Our investigations clearly indicated that the
+available from different sources. Our investigations clearly indicated that the
Nestlé and FSVO databases are very similar in terms of chemical
structures and properties as well as distribution of experimental LOAEL
values. The only significant difference that we observed was that the
@@ -623,7 +622,7 @@ chemicals available in the training datasets had at least two
independent studies/LOAELs. These studies were exploited to generate
information on the reproducibility of chronic animal studies and were
used to evaluate prediction performance of the models in the context of
-experimental variability.Considerable variability in the experimental
+experimental variability. Considerable variability in the experimental
data was observed. Study design differences, including dose selection,
dose spacing and route of administration are likely explanation of
experimental variability. High experimental variability has an impact on
@@ -632,8 +631,7 @@ quality by introducing noise into the training data, secondly it
influences accuracy estimates because predictions have to be compared
against noisy data where "true" experimental values are unknown. This
will become obvious in the next section, where comparison of predictions
-with experimental data is discussed.<span id="lazar-predictions"
-class="anchor"></span>The data obtained in the present study indicate
+with experimental data is discussed. The data obtained in the present study indicate
that `lazar` generates reliable predictions for compounds within the
applicability domain of the training data (i.e. predictions without
warnings, which indicates a sufficient number of neighbors with
@@ -713,7 +711,7 @@ where no predictions can be made, because there are no similar compounds in the
and in such cases it is preferable to avoid predictions instead of random guessing.
-->
-Elena: Should we add a GUI screenshot?
+TODO: GUI screenshot
<!--
is covered in
diff --git a/loael.pdf b/loael.pdf
index 82b541d..b1b46fe 100644
--- a/loael.pdf
+++ b/loael.pdf
Binary files differ
diff --git a/loael.tex b/loael.tex
index b5c625b..b82a370 100644
--- a/loael.tex
+++ b/loael.tex
@@ -1,4 +1,4 @@
-\documentclass[]{article}
+\documentclass[]{achemso}
\usepackage{lmodern}
\usepackage{amssymb,amsmath}
\usepackage{ifxetex,ifluatex}
@@ -389,7 +389,9 @@ Finally the local RF model is applied to
\href{https://github.com/opentox/lazar/blob/loael-paper.submission/lib/model.rb\#L194-L272}{predict
the activity} of the query compound. The RMSE of bootstrapped local
model predictions is used to construct 95\% prediction intervals at
-1.96*RMSE.
+1.96*RMSE. The width of the prediction interval indicates the expected
+prediction accuracy. The ``true'' value of a prediction should lie
+within the prediction interval with 95\% probability.
If RF modelling or prediction fails, the program resorts to using the
\href{https://github.com/opentox/lazar/blob/loael-paper.submission/lib/regression.rb\#L6-L16}{weighted
@@ -725,20 +727,20 @@ experimental variability (Lo Piparo et al. 2014). In the present study,
a similar approach was applied to build models generating quantitative
predictions of long-term toxicity. Two databases compiling chronic oral
rat lowest adverse effect levels (LOAEL) as endpoint were available from
-different sources. \protect\hypertarget{dataset-comparison-1}{}{}Our
-investigations clearly indicated that the Nestlé and FSVO databases are
-very similar in terms of chemical structures and properties as well as
-distribution of experimental LOAEL values. The only significant
-difference that we observed was that the Nestlé one has larger amount of
-small molecules, than the FSVO database. For this reason we pooled both
-databases into a single training dataset for read across predictions.
+different sources. Our investigations clearly indicated that the Nestlé
+and FSVO databases are very similar in terms of chemical structures and
+properties as well as distribution of experimental LOAEL values. The
+only significant difference that we observed was that the Nestlé one has
+larger amount of small molecules, than the FSVO database. For this
+reason we pooled both databases into a single training dataset for read
+across predictions.
An early review of the databases revealed that 155 out of the 671
chemicals available in the training datasets had at least two
independent studies/LOAELs. These studies were exploited to generate
information on the reproducibility of chronic animal studies and were
used to evaluate prediction performance of the models in the context of
-experimental variability.Considerable variability in the experimental
+experimental variability. Considerable variability in the experimental
data was observed. Study design differences, including dose selection,
dose spacing and route of administration are likely explanation of
experimental variability. High experimental variability has an impact on
@@ -747,15 +749,14 @@ quality by introducing noise into the training data, secondly it
influences accuracy estimates because predictions have to be compared
against noisy data where ``true'' experimental values are unknown. This
will become obvious in the next section, where comparison of predictions
-with experimental data is
-discussed.\protect\hypertarget{lazar-predictions}{}{}The data obtained
-in the present study indicate that \texttt{lazar} generates reliable
-predictions for compounds within the applicability domain of the
-training data (i.e.~predictions without warnings, which indicates a
-sufficient number of neighbors with similarity \textgreater{} 0.5 to
-create local random forest models). Correlation analysis shows that
-errors (\(\text{RMSE}\)) and explained variance (\(r^{2}\)) are
-comparable to experimental variability of the training data.
+with experimental data is discussed. The data obtained in the present
+study indicate that \texttt{lazar} generates reliable predictions for
+compounds within the applicability domain of the training data
+(i.e.~predictions without warnings, which indicates a sufficient number
+of neighbors with similarity \textgreater{} 0.5 to create local random
+forest models). Correlation analysis shows that errors (\(\text{RMSE}\))
+and explained variance (\(r^{2}\)) are comparable to experimental
+variability of the training data.
Predictions with a warning (neighbor similarity \textless{} 0.5 and
\textgreater{} 0.2 or weighted average predictions) are more uncertain.
@@ -786,7 +787,7 @@ since evidence suggest that exposure duration has little impact on the
levels of NOAELs/LOAELs (Zarn, Engeli, and Schlatter 2011, Zarn, Engeli,
and Schlatter (2013)).
-Elena: Should we add a GUI screenshot?
+TODO: GUI screenshot
\section{Summary}\label{summary}
diff --git a/scripts/dataset-variability.R b/scripts/dataset-variability.R
index 68271d9..2fa7327 100755
--- a/scripts/dataset-variability.R
+++ b/scripts/dataset-variability.R
@@ -3,10 +3,9 @@ library(ggplot2)
library(grid)
library(gridExtra)
-data <- read.csv("data/test_log10.csv",header=T)
+data <- read.csv("data/test_log10_database_fix.csv",header=T)
data$SMILES <- reorder(data$SMILES,data$LOAEL)
img = ggplot(data,aes(SMILES,LOAEL,ymin = min(LOAEL), ymax=max(LOAEL),color=Dataset)) + geom_point()
img <- img + ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') + theme(axis.text.x = element_blank()) + theme(legend.title=element_blank())
-img = img + scale_fill_discrete(breaks=c("Mazzatorta", "Both", "Swiss Federal Office"))
ggsave(file='figures/dataset-variability.pdf', plot=img, width=12,height=8)
diff --git a/scripts/functional-groups4R.rb b/scripts/functional-groups4R.rb
index 0b14b7a..3203b0f 100755
--- a/scripts/functional-groups4R.rb
+++ b/scripts/functional-groups4R.rb
@@ -1,3 +1,4 @@
+#!/usr/bin/env ruby
require 'csv'
csv = []
exclude = [
@@ -20,8 +21,8 @@ CSV.foreach("data/functional-groups.csv") do |row|
keep = false if row[0].match(patt)
end
if keep and [row[1].to_i,row[2].to_i].max >= 25
- csv << [row[0].gsub('_',' '),row[1].to_i,"Mazzatorta"]
- csv << [row[0].gsub('_',' '),row[2].to_i,"Swiss Federal Office"]
+ csv << [row[0].gsub('_',' '),row[1].to_i,"Nestle"]
+ csv << [row[0].gsub('_',' '),row[2].to_i,"FSVO"]
else
p row
end
diff --git a/scripts/median-correlation-plot.R b/scripts/median-correlation-plot.R
index f4b28c2..3ea5ddd 100755
--- a/scripts/median-correlation-plot.R
+++ b/scripts/median-correlation-plot.R
@@ -3,6 +3,6 @@
library(ggplot2)
experimental <- read.csv("data/median-correlation.csv",header=T)
-img = qplot(mazzatorta,swiss,data=experimental,xlab="-log10(LOAEL Mazzatorta median)",ylab="-log10(LOAEL Swiss Federal Office median)") + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4)
+img = qplot(mazzatorta,swiss,data=experimental,xlab="-log10(LOAEL Nestle median)",ylab="-log10(LOAEL FSVO median)") + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4)
ggsave(file='figures/median-correlation.pdf', plot=img,width=12, height=8)