database names in figures

author: Christoph Helma <helma@in-silico.ch> 2017-12-20 16:16:40 +0100
committer: Christoph Helma <helma@in-silico.ch> 2017-12-20 16:16:40 +0100
commit: 0983a03eaf0df05b464a4f537947692825560908 (patch)
tree: e66e24d11f6636673f6ff909c1a5441f9a9fb949
parent: a5a9144dd7eb4cb9455c5674325ce6e0cc17af61 (diff)
12 files changed, 115 insertions, 115 deletions
diff --git a/Makefile b/Makefile
index 19cff1d..4da3cc1 100644
--- a/Makefile
+++ b/Makefile
@@ -23,7 +23,7 @@ loael.docx: loael.md
 figures/functional-groups.pdf: data/functional-groups-reduced4R.csv
 	scripts/functional-groups.R
 
-figures/dataset-variability.pdf: data/mazzatorta_log10.csv data/swiss_log10.csv
+figures/dataset-variability.pdf: data/test_log10_database_fix.csv
 	scripts/dataset-variability.R
 
 figures/crossvalidation0.pdf: data/training_log10-cv-0.csv
@@ -90,6 +90,9 @@ data/median-correlation.csv: data/mazzatorta_log10.csv data/swiss_log10.csv
 data/test_log10.csv: data/mazzatorta_log10.csv data/swiss_log10.csv
 	scripts/create-testset.rb 
 
+data/test_log10_database_fix.csv: data/test_log10.csv # relabel datasets for figures; the compound label must be rewritten before its parts
+	sed -e 's/mazzatorta and swiss/Both/' -e 's/mazzatorta/Nestle/' -e 's/swiss/FSVO/' data/test_log10.csv > data/test_log10_database_fix.csv
+
 # Training set
 data/training_log10.csv: data/mazzatorta_log10.csv data/swiss_log10.csv
 	scripts/create-trainingset.rb
diff --git a/data/functional-groups-reduced4R.csv b/data/functional-groups-reduced4R.csv
index 51d6ec8..1cfaf87 100644
--- a/data/functional-groups-reduced4R.csv
+++ b/data/functional-groups-reduced4R.csv
@@ -1,74 +1,74 @@
-Alkene, 39, Mazzatorta
-Alkene, 30, Swiss Federal Office
-Alkylchloride, 71, Mazzatorta
-Alkylchloride, 41, Swiss Federal Office
-Alkylfluoride, 52, Mazzatorta
-Alkylfluoride, 74, Swiss Federal Office
-Alcohol, 44, Mazzatorta
-Alcohol, 27, Swiss Federal Office
-Dialkylether, 35, Mazzatorta
-Dialkylether, 32, Swiss Federal Office
-Alkylarylether, 62, Mazzatorta
-Alkylarylether, 90, Swiss Federal Office
-Diarylether, 35, Mazzatorta
-Diarylether, 47, Swiss Federal Office
-Amine, 66, Mazzatorta
-Amine, 41, Swiss Federal Office
-Primary arom amine, 26, Mazzatorta
-Primary arom amine, 15, Swiss Federal Office
-Ketone, 21, Mazzatorta
-Ketone, 25, Swiss Federal Office
-Chloroalkene, 39, Mazzatorta
-Chloroalkene, 21, Swiss Federal Office
-Carboxylic acid, 33, Mazzatorta
-Carboxylic acid, 38, Swiss Federal Office
-Carboxylic acid derivative, 215, Mazzatorta
-Carboxylic acid derivative, 227, Swiss Federal Office
-Amide, 38, Mazzatorta
-Amide, 60, Swiss Federal Office
-Secondary amide, 22, Mazzatorta
-Secondary amide, 36, Swiss Federal Office
-Imidolactone, 13, Mazzatorta
-Imidolactone, 32, Swiss Federal Office
-Nitrile, 35, Mazzatorta
-Nitrile, 39, Swiss Federal Office
-Vinylogous ester, 113, Mazzatorta
-Vinylogous ester, 120, Swiss Federal Office
-Vinylogous halide, 11, Mazzatorta
-Vinylogous halide, 27, Swiss Federal Office
-Carbonic acid derivatives, 109, Mazzatorta
-Carbonic acid derivatives, 131, Swiss Federal Office
-Urethan, 34, Mazzatorta
-Urethan, 35, Swiss Federal Office
-Phenol, 27, Mazzatorta
-Phenol, 9, Swiss Federal Office
-Arylchloride, 142, Mazzatorta
-Arylchloride, 163, Swiss Federal Office
-Arylfluoride, 22, Mazzatorta
-Arylfluoride, 41, Swiss Federal Office
-Oxoarene, 32, Mazzatorta
-Oxoarene, 29, Swiss Federal Office
-Heteroaromatic, 147, Mazzatorta
-Heteroaromatic, 205, Swiss Federal Office
-Nitro, 42, Mazzatorta
-Nitro, 31, Swiss Federal Office
-Sulfonic derivative, 24, Mazzatorta
-Sulfonic derivative, 29, Swiss Federal Office
-Sulfenic derivative, 48, Mazzatorta
-Sulfenic derivative, 34, Swiss Federal Office
-Phosphoric acid derivative, 70, Mazzatorta
-Phosphoric acid derivative, 44, Swiss Federal Office
-Aromatic, 402, Mazzatorta
-Aromatic, 396, Swiss Federal Office
-Heterocyclic, 228, Mazzatorta
-Heterocyclic, 272, Swiss Federal Office
-Trifluoromethyl, 44, Mazzatorta
-Trifluoromethyl, 63, Swiss Federal Office
-1-3-Tautomerizable, 265, Mazzatorta
-1-3-Tautomerizable, 296, Swiss Federal Office
-1-5-Tautomerizable, 124, Mazzatorta
-1-5-Tautomerizable, 148, Swiss Federal Office
-Michael acceptor, 33, Mazzatorta
-Michael acceptor, 33, Swiss Federal Office
-CH-acidic, 60, Mazzatorta
-CH-acidic, 73, Swiss Federal Office
+Alkene, 39, Nestle
+Alkene, 30, FSVO
+Alkylchloride, 71, Nestle
+Alkylchloride, 41, FSVO
+Alkylfluoride, 52, Nestle
+Alkylfluoride, 74, FSVO
+Alcohol, 44, Nestle
+Alcohol, 27, FSVO
+Dialkylether, 35, Nestle
+Dialkylether, 32, FSVO
+Alkylarylether, 62, Nestle
+Alkylarylether, 90, FSVO
+Diarylether, 35, Nestle
+Diarylether, 47, FSVO
+Amine, 66, Nestle
+Amine, 41, FSVO
+Primary arom amine, 26, Nestle
+Primary arom amine, 15, FSVO
+Ketone, 21, Nestle
+Ketone, 25, FSVO
+Chloroalkene, 39, Nestle
+Chloroalkene, 21, FSVO
+Carboxylic acid, 33, Nestle
+Carboxylic acid, 38, FSVO
+Carboxylic acid derivative, 215, Nestle
+Carboxylic acid derivative, 227, FSVO
+Amide, 38, Nestle
+Amide, 60, FSVO
+Secondary amide, 22, Nestle
+Secondary amide, 36, FSVO
+Imidolactone, 13, Nestle
+Imidolactone, 32, FSVO
+Nitrile, 35, Nestle
+Nitrile, 39, FSVO
+Vinylogous ester, 113, Nestle
+Vinylogous ester, 120, FSVO
+Vinylogous halide, 11, Nestle
+Vinylogous halide, 27, FSVO
+Carbonic acid derivatives, 109, Nestle
+Carbonic acid derivatives, 131, FSVO
+Urethan, 34, Nestle
+Urethan, 35, FSVO
+Phenol, 27, Nestle
+Phenol, 9, FSVO
+Arylchloride, 142, Nestle
+Arylchloride, 163, FSVO
+Arylfluoride, 22, Nestle
+Arylfluoride, 41, FSVO
+Oxoarene, 32, Nestle
+Oxoarene, 29, FSVO
+Heteroaromatic, 147, Nestle
+Heteroaromatic, 205, FSVO
+Nitro, 42, Nestle
+Nitro, 31, FSVO
+Sulfonic derivative, 24, Nestle
+Sulfonic derivative, 29, FSVO
+Sulfenic derivative, 48, Nestle
+Sulfenic derivative, 34, FSVO
+Phosphoric acid derivative, 70, Nestle
+Phosphoric acid derivative, 44, FSVO
+Aromatic, 402, Nestle
+Aromatic, 396, FSVO
+Heterocyclic, 228, Nestle
+Heterocyclic, 272, FSVO
+Trifluoromethyl, 44, Nestle
+Trifluoromethyl, 63, FSVO
+1-3-Tautomerizable, 265, Nestle
+1-3-Tautomerizable, 296, FSVO
+1-5-Tautomerizable, 124, Nestle
+1-5-Tautomerizable, 148, FSVO
+Michael acceptor, 33, Nestle
+Michael acceptor, 33, FSVO
+CH-acidic, 60, Nestle
+CH-acidic, 73, FSVO
diff --git a/figures/dataset-variability.pdf b/figures/dataset-variability.pdf
index 331ce23..5a4ddcc 100644
--- a/figures/dataset-variability.pdf
+++ b/figures/dataset-variability.pdf
diff --git a/figures/functional-groups.pdf b/figures/functional-groups.pdf
index a44fa34..c0a7732 100644
--- a/figures/functional-groups.pdf
+++ b/figures/functional-groups.pdf
diff --git a/figures/median-correlation.pdf b/figures/median-correlation.pdf
index 80a550b..74dacfb 100644
--- a/figures/median-correlation.pdf
+++ b/figures/median-correlation.pdf
diff --git a/loael.Rmd b/loael.Rmd
index 3ff6146..3c4ae5e 100644
--- a/loael.Rmd
+++ b/loael.Rmd
@@ -20,7 +20,7 @@ abstract: |
   better than random guessing, but the errors to be expected are higher and
   a manual inspection of prediction results is highly recommended.
 
-documentclass: article
+documentclass: achemso
 bibliography: references.bibtex
 figPrefix: Figure
 eqnPrefix: Equation
@@ -295,7 +295,7 @@ optimizing the number of RF components by bootstrap resampling.
 Finally the local RF model is applied to [predict the
 activity](https://github.com/opentox/lazar/blob/loael-paper.submission/lib/model.rb#L194-L272)
 of the query compound. The RMSE of bootstrapped local model predictions is used
-to construct 95\% prediction intervals at 1.96*RMSE.
+to construct 95\% prediction intervals at 1.96*RMSE. The width of the prediction interval indicates the expected prediction accuracy. The "true" value of a prediction should lie within the prediction interval with 95\% probability.
 
 If RF modelling or prediction fails, the program resorts to using the [weighted
 mean](https://github.com/opentox/lazar/blob/loael-paper.submission/lib/regression.rb#L6-L16)
@@ -696,8 +696,7 @@ experimental variability [@LoPiparo2014]. In the present
 study, a similar approach was applied to build models generating
 quantitative predictions of long-term toxicity. Two databases compiling
 chronic oral rat lowest adverse effect levels (LOAEL) as endpoint were
-available from different sources. <span id="dataset-comparison-1"
-class="anchor"></span>Our investigations clearly indicated that the
+available from different sources. Our investigations clearly indicated that the
 Nestlé and FSVO databases are very similar in terms of chemical
 structures and properties as well as distribution of experimental LOAEL
 values. The only significant difference that we observed was that the
@@ -710,7 +709,7 @@ chemicals available in the training datasets had at least two
 independent studies/LOAELs. These studies were exploited to generate
 information on the reproducibility of chronic animal studies and were
 used to evaluate prediction performance of the models in the context of
-experimental variability.Considerable variability in the experimental
+experimental variability. Considerable variability in the experimental
 data was observed. Study design differences, including dose selection,
 dose spacing and route of administration are likely explanation of
 experimental variability. High experimental variability has an impact on
@@ -719,8 +718,7 @@ quality by introducing noise into the training data, secondly it
 influences accuracy estimates because predictions have to be compared
 against noisy data where "true" experimental values are unknown. This
 will become obvious in the next section, where comparison of predictions
-with experimental data is discussed.<span id="lazar-predictions"
-class="anchor"></span>The data obtained in the present study indicate
+with experimental data is discussed. The data obtained in the present study indicate
 that `lazar` generates reliable predictions for compounds within the
 applicability domain of the training data (i.e. predictions without
 warnings, which indicates a sufficient number of neighbors with
diff --git a/loael.md b/loael.md
index fe9eb27..0a1397b 100644
--- a/loael.md
+++ b/loael.md
@@ -20,7 +20,7 @@ abstract: |
   better than random guessing, but the errors to be expected are higher and
   a manual inspection of prediction results is highly recommended.
 
-documentclass: article
+documentclass: achemso
 bibliography: references.bibtex
 figPrefix: Figure
 eqnPrefix: Equation
@@ -287,7 +287,7 @@ optimizing the number of RF components by bootstrap resampling.
 Finally the local RF model is applied to [predict the
 activity](https://github.com/opentox/lazar/blob/loael-paper.submission/lib/model.rb#L194-L272)
 of the query compound. The RMSE of bootstrapped local model predictions is used
-to construct 95\% prediction intervals at 1.96*RMSE.
+to construct 95\% prediction intervals at 1.96*RMSE. The width of the prediction interval indicates the expected prediction accuracy. The "true" value of a prediction should lie within the prediction interval with 95\% probability.
 
 If RF modelling or prediction fails, the program resorts to using the [weighted
 mean](https://github.com/opentox/lazar/blob/loael-paper.submission/lib/regression.rb#L6-L16)
@@ -609,8 +609,7 @@ experimental variability [@LoPiparo2014]. In the present
 study, a similar approach was applied to build models generating
 quantitative predictions of long-term toxicity. Two databases compiling
 chronic oral rat lowest adverse effect levels (LOAEL) as endpoint were
-available from different sources. <span id="dataset-comparison-1"
-class="anchor"></span>Our investigations clearly indicated that the
+available from different sources. Our investigations clearly indicated that the
 Nestlé and FSVO databases are very similar in terms of chemical
 structures and properties as well as distribution of experimental LOAEL
 values. The only significant difference that we observed was that the
@@ -623,7 +622,7 @@ chemicals available in the training datasets had at least two
 independent studies/LOAELs. These studies were exploited to generate
 information on the reproducibility of chronic animal studies and were
 used to evaluate prediction performance of the models in the context of
-experimental variability.Considerable variability in the experimental
+experimental variability. Considerable variability in the experimental
 data was observed. Study design differences, including dose selection,
 dose spacing and route of administration are likely explanation of
 experimental variability. High experimental variability has an impact on
@@ -632,8 +631,7 @@ quality by introducing noise into the training data, secondly it
 influences accuracy estimates because predictions have to be compared
 against noisy data where "true" experimental values are unknown. This
 will become obvious in the next section, where comparison of predictions
-with experimental data is discussed.<span id="lazar-predictions"
-class="anchor"></span>The data obtained in the present study indicate
+with experimental data is discussed. The data obtained in the present study indicate
 that `lazar` generates reliable predictions for compounds within the
 applicability domain of the training data (i.e. predictions without
 warnings, which indicates a sufficient number of neighbors with
@@ -713,7 +711,7 @@ where no predictions can be made, because there are no similar compounds in the
  and in such cases it is preferable to avoid predictions instead of random guessing.
 -->
 
-Elena: Should we add a GUI screenshot?
+TODO: GUI screenshot
 
 <!--
 is covered in
diff --git a/loael.pdf b/loael.pdf
index 82b541d..b1b46fe 100644
--- a/loael.pdf
+++ b/loael.pdf
diff --git a/loael.tex b/loael.tex
index b5c625b..b82a370 100644
--- a/loael.tex
+++ b/loael.tex
@@ -1,4 +1,4 @@
-\documentclass[]{article}
+\documentclass[]{achemso}
 \usepackage{lmodern}
 \usepackage{amssymb,amsmath}
 \usepackage{ifxetex,ifluatex}
@@ -389,7 +389,9 @@ Finally the local RF model is applied to
 \href{https://github.com/opentox/lazar/blob/loael-paper.submission/lib/model.rb\#L194-L272}{predict
 the activity} of the query compound. The RMSE of bootstrapped local
 model predictions is used to construct 95\% prediction intervals at
-1.96*RMSE.
+1.96*RMSE. The width of the prediction interval indicates the expected
+prediction accuracy. The ``true'' value of a prediction should lie
+within the prediction interval with 95\% probability.
 
 If RF modelling or prediction fails, the program resorts to using the
 \href{https://github.com/opentox/lazar/blob/loael-paper.submission/lib/regression.rb\#L6-L16}{weighted
@@ -725,20 +727,20 @@ experimental variability (Lo Piparo et al. 2014). In the present study,
 a similar approach was applied to build models generating quantitative
 predictions of long-term toxicity. Two databases compiling chronic oral
 rat lowest adverse effect levels (LOAEL) as endpoint were available from
-different sources. \protect\hypertarget{dataset-comparison-1}{}{}Our
-investigations clearly indicated that the Nestlé and FSVO databases are
-very similar in terms of chemical structures and properties as well as
-distribution of experimental LOAEL values. The only significant
-difference that we observed was that the Nestlé one has larger amount of
-small molecules, than the FSVO database. For this reason we pooled both
-databases into a single training dataset for read across predictions.
+different sources. Our investigations clearly indicated that the Nestlé
+and FSVO databases are very similar in terms of chemical structures and
+properties as well as distribution of experimental LOAEL values. The
+only significant difference that we observed was that the Nestlé one has
+larger amount of small molecules, than the FSVO database. For this
+reason we pooled both databases into a single training dataset for read
+across predictions.
 
 An early review of the databases revealed that 155 out of the 671
 chemicals available in the training datasets had at least two
 independent studies/LOAELs. These studies were exploited to generate
 information on the reproducibility of chronic animal studies and were
 used to evaluate prediction performance of the models in the context of
-experimental variability.Considerable variability in the experimental
+experimental variability. Considerable variability in the experimental
 data was observed. Study design differences, including dose selection,
 dose spacing and route of administration are likely explanation of
 experimental variability. High experimental variability has an impact on
@@ -747,15 +749,14 @@ quality by introducing noise into the training data, secondly it
 influences accuracy estimates because predictions have to be compared
 against noisy data where ``true'' experimental values are unknown. This
 will become obvious in the next section, where comparison of predictions
-with experimental data is
-discussed.\protect\hypertarget{lazar-predictions}{}{}The data obtained
-in the present study indicate that \texttt{lazar} generates reliable
-predictions for compounds within the applicability domain of the
-training data (i.e.~predictions without warnings, which indicates a
-sufficient number of neighbors with similarity \textgreater{} 0.5 to
-create local random forest models). Correlation analysis shows that
-errors (\(\text{RMSE}\)) and explained variance (\(r^{2}\)) are
-comparable to experimental variability of the training data.
+with experimental data is discussed. The data obtained in the present
+study indicate that \texttt{lazar} generates reliable predictions for
+compounds within the applicability domain of the training data
+(i.e.~predictions without warnings, which indicates a sufficient number
+of neighbors with similarity \textgreater{} 0.5 to create local random
+forest models). Correlation analysis shows that errors (\(\text{RMSE}\))
+and explained variance (\(r^{2}\)) are comparable to experimental
+variability of the training data.
 
 Predictions with a warning (neighbor similarity \textless{} 0.5 and
 \textgreater{} 0.2 or weighted average predictions) are more uncertain.
@@ -786,7 +787,7 @@ since evidence suggest that exposure duration has little impact on the
 levels of NOAELs/LOAELs (Zarn, Engeli, and Schlatter 2011, Zarn, Engeli,
 and Schlatter (2013)).
 
-Elena: Should we add a GUI screenshot?
+TODO: GUI screenshot
 
 \section{Summary}\label{summary}
 
diff --git a/scripts/dataset-variability.R b/scripts/dataset-variability.R
index 68271d9..2fa7327 100755
--- a/scripts/dataset-variability.R
+++ b/scripts/dataset-variability.R
@@ -3,10 +3,9 @@ library(ggplot2)
 library(grid)
 library(gridExtra)
 
-data <- read.csv("data/test_log10.csv",header=T)
+data <- read.csv("data/test_log10_database_fix.csv",header=T)
 data$SMILES <- reorder(data$SMILES,data$LOAEL)
 img = ggplot(data,aes(SMILES,LOAEL,ymin = min(LOAEL), ymax=max(LOAEL),color=Dataset)) + geom_point()
 img <- img + ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') + theme(axis.text.x = element_blank())  + theme(legend.title=element_blank())
-img = img + scale_fill_discrete(breaks=c("Mazzatorta", "Both", "Swiss Federal Office"))
 
 ggsave(file='figures/dataset-variability.pdf', plot=img, width=12,height=8)
diff --git a/scripts/functional-groups4R.rb b/scripts/functional-groups4R.rb
index 0b14b7a..3203b0f 100755
--- a/scripts/functional-groups4R.rb
+++ b/scripts/functional-groups4R.rb
@@ -1,3 +1,4 @@
+#!/usr/bin/env ruby
 require 'csv'
 csv = []
 exclude = [
@@ -20,8 +21,8 @@ CSV.foreach("data/functional-groups.csv") do |row|
     keep = false if row[0].match(patt)
   end
   if keep and [row[1].to_i,row[2].to_i].max >= 25
-    csv << [row[0].gsub('_',' '),row[1].to_i,"Mazzatorta"]
-    csv << [row[0].gsub('_',' '),row[2].to_i,"Swiss Federal Office"]
+    csv << [row[0].gsub('_',' '),row[1].to_i,"Nestle"]
+    csv << [row[0].gsub('_',' '),row[2].to_i,"FSVO"]
   else
     p row
   end
diff --git a/scripts/median-correlation-plot.R b/scripts/median-correlation-plot.R
index f4b28c2..3ea5ddd 100755
--- a/scripts/median-correlation-plot.R
+++ b/scripts/median-correlation-plot.R
@@ -3,6 +3,6 @@
 library(ggplot2)
 
 experimental <- read.csv("data/median-correlation.csv",header=T)
-img = qplot(mazzatorta,swiss,data=experimental,xlab="-log10(LOAEL Mazzatorta median)",ylab="-log10(LOAEL Swiss Federal Office median)") + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4)
+img = qplot(mazzatorta,swiss,data=experimental,xlab="-log10(LOAEL Nestle median)",ylab="-log10(LOAEL FSVO median)") + geom_point() + geom_abline(intercept=0.0) + xlim(-1,4) + ylim(-1,4)
 
 ggsave(file='figures/median-correlation.pdf', plot=img,width=12, height=8)
author	Christoph Helma <helma@in-silico.ch>	2017-12-20 16:16:40 +0100
committer	Christoph Helma <helma@in-silico.ch>	2017-12-20 16:16:40 +0100
commit	0983a03eaf0df05b464a4f537947692825560908 (patch)
tree	e66e24d11f6636673f6ff909c1a5441f9a9fb949
parent	a5a9144dd7eb4cb9455c5674325ce6e0cc17af61 (diff)