summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2016-03-04 15:27:20 +0100
committerChristoph Helma <helma@in-silico.ch>2016-03-04 15:27:20 +0100
commit7ad7c10c1e708f6b5a3473de24dbeab03d0b74a3 (patch)
tree14e7a8c37343f3d878e8116873978861b226b5bc
parentd3071896a7116670756199f0df7c2a618de2aea3 (diff)
prediction interval, discussion
-rw-r--r--Makefile5
-rw-r--r--TODO17
-rw-r--r--data/training-test-predictions.csv299
-rw-r--r--data/training-test-predictions.id2
-rw-r--r--dataset-variability.R15
-rw-r--r--figure/dataset-variability.pdfbin10357 -> 10283 bytes
-rw-r--r--figure/test-correlation.pdfbin7533 -> 7512 bytes
-rw-r--r--figure/test-prediction.pdfbin10899 -> 10719 bytes
-rw-r--r--loael.Rmd354
-rw-r--r--loael.md310
-rw-r--r--loael.pdfbin653879 -> 673139 bytes
-rw-r--r--test-prediction-plot.R2
-rw-r--r--test-validation.rb2
13 files changed, 604 insertions, 402 deletions
diff --git a/Makefile b/Makefile
index 4fd750e..2d290d4 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
datasets = data/median-correlation.csv data/test.csv data/training.csv data/mazzatorta.csv data/swiss.csv data/test.json data/training.json data/mazzatorta.json data/swiss.json
crossvalidations = data/training-cv-0.csv data/training-cv-1.csv data/training-cv-2.csv
-validations = data/training-test-predictions.csv $(crossvalidations)
+validations = data/training-test-predictions.csv $(crossvalidations) data/misclassifications.csv
figures = figure/functional-groups.pdf figure/test-prediction.pdf figure/test-correlation.pdf figure/crossvalidation.pdf figure/dataset-variability.pdf
# Paper
@@ -35,6 +35,9 @@ figure/test-correlation.pdf: data/training-test-predictions.csv data/median-corr
# Validations
+data/misclassifications.csv: misclassifications.rb data/training-test-predictions.csv
+ ruby misclassifications.rb
+
data/training-test-predictions.csv: test-validation.rb data/test.csv data/training.csv
ruby test-validation.rb training.csv
diff --git a/TODO b/TODO
index 0e671e5..c9b9f1c 100644
--- a/TODO
+++ b/TODO
@@ -1,6 +1,7 @@
# lazar
-confidence intervals
+#confidence intervals
+TODO: github tags
#try to increase predictive accuracies
@@ -14,7 +15,7 @@ confidence intervals
warning if query compound contains new functional groups
# analysis
-predictions fit experimental data
+#predictions fit experimental data
#only combined dataset (training)
#combined -> training dataset
@@ -23,7 +24,7 @@ predictions fit experimental data
smarts f functional groups
-error band (expermental variability) for fig 7
+#error band (experimental variability) for fig 7
# paper
#datasets:
@@ -34,12 +35,12 @@ results:
#CheSMapper remove text, figures
#fig 2 smiles -> smarts, leave out?
- variability from paolos paper
- https://en.wikipedia.org/wiki/Root_mean_square_deviation
- When the RMSD is normalized by the mean measured value, is usually called coefficient of variation of the RMSD, CV(RMSD). It is analogous to the coefficient of variation with the RMSD taking the place of the standard deviation.
+ #variability from paolos paper
+ #https://en.wikipedia.org/wiki/Root_mean_square_deviation
+ #When the RMSD is normalized by the mean measured value, is usually called coefficient of variation of the RMSD, CV(RMSD). It is analogous to the coefficient of variation with the RMSD taking the place of the standard deviation.
discussion: mispredictions
-AD definition
+#AD definition
-line nrs in pdf
+#line nrs in pdf
diff --git a/data/training-test-predictions.csv b/data/training-test-predictions.csv
index ca34b33..a9be9df 100644
--- a/data/training-test-predictions.csv
+++ b/data/training-test-predictions.csv
@@ -1,151 +1,148 @@
-SMILES,LOAEL_measured_median,LOAEL_predicted,Confidence,Dataset
-N#Cc1nn(c(c1S(=O)C(F)(F)F)N)c1c(Cl)cc(cc1Cl)C(F)(F)F,0.00013611,0.01148216373256843,1,training-prediction
-OC(=O)C(Oc1ccc(cc1)Oc1ncc(cc1Cl)C(F)(F)F)C,0.00027647,0.012370470701925899,1,training-prediction
-CCOP(=S)(SCSC(C)(C)C)OCC,0.00027736000000000004,0.003520511683277722,1,training-prediction
-CCSCSP(=S)(OCC)OCC,0.00061449,0.0007932432885978219,1,training-prediction
-CCOP(=O)(SC(CC)C)SC(CC)C,0.000872805,0.005020259791233174,1,training-prediction
-CNC(=O)CSP(=S)(OC)OC,0.0010905,0.009515127939063834,1,training-prediction
-COP(=O)(SC)N,0.0020549,0.33146588454802073,1,training-prediction
-CSc1ccc(cc1C)OP(=S)(OC)OC,0.00210185,0.0052770620054827895,1,training-prediction
-CCOP(=S)(Oc1ccc2c(c1)oc(=O)c(c2C)Cl)OCC,0.0033631,0.036592963422387284,1,training-prediction
-CCOP(=S)(Oc1ncn(n1)c1ccccc1)OCC,0.0041492,0.021132003617537098,1,training-prediction
-COc1sc(=O)n(n1)CSP(=S)(OC)OC,0.00471335,0.0098233419774848,1,training-prediction
-CCOP(=S)(OCC)SCSP(=S)(OCC)OCC,0.0049418,0.0028924725896877944,1,training-prediction
-CCOP(=O)(Oc1ccc(c(c1)C)SC)NC(C)C,0.0049447,0.004831305474311618,1,training-prediction
-CC(Cc1ccc(cc1)C(C)(C)C)CN1CC(C)OC(C1)C,0.0056016,0.06867843129491752,1,training-prediction
-COP(=S)(Oc1nc(Cl)c(cc1Cl)Cl)OC,0.00620095,0.008969949496846933,1,training-prediction
-OC(C(Cl)(Cl)Cl)(c1ccc(cc1)Cl)c1ccc(cc1)Cl,0.0067479,0.0575408460077049,1,training-prediction
-COC(=O)N(C(=O)N1COC2(C(=N1)c1ccc(cc1C2)Cl)C(=O)OC)c1ccc(cc1)OC(F)(F)F,0.0068203,0.03706152163770416,1,training-prediction
-Clc1ccc(cc1)C(c1ccccc1Cl)(c1cncnc1)O,0.0075484,0.02455655297557998,1,training-prediction
-COP(=S)(SCn1nnc2c(c1=O)cccc2)OC,0.0076105,0.025703032163536588,1,training-prediction
-N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Br)Br,0.00781875,0.042853347293390985,1,training-prediction
-CCN(c1nc(cc(n1)C)OP(=S)(OC)OC)CC,0.0081878,0.01609250673284384,1,training-prediction
-CCCCOC(=O)C(Oc1ccc(cc1)Oc1ccc(cn1)C(F)(F)F)C,0.00885585,0.00642945433938663,1,training-prediction
-COP(=O)(OC=C(Cl)Cl)OC,0.0100688,0.03379255796926954,1,training-prediction
-CCCSP(=O)(SCCC)OCC,0.010069,0.012657180818924377,1,training-prediction
-O=S1OCC2C(CO1)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,0.012287,0.00022929597195349384,1,training-prediction
-CC(Oc1cc(c(cc1Cl)Cl)n1nc(oc1=O)C(C)(C)C)C,0.0124555,0.0639086556427057,1,training-prediction
-CNC(=O)Oc1cccc2c1OC(C2)(C)C,0.01394355,0.04063021221263945,1,training-prediction
-N#Cc1c(Cl)c(C#N)c(c(c1Cl)Cl)Cl,0.015043,0.0038259351864843435,1,training-prediction
-Fc1ccc(cc1)[Si](c1ccc(cc1)F)(Cn1cncn1)C,0.015853,0.012131065927580155,1,training-prediction
-CCOP(=S)(Oc1cc(C)nc(n1)C(C)C)OCC,0.016429,0.015095535744907523,1,training-prediction
-N#Cc1c(Cl)cccc1Cl,0.0165685,0.09350368116996392,1,training-prediction
-CCOP(=S)(Oc1nc(Cl)c(cc1Cl)Cl)OCC,0.017114,0.0021312647369153546,1,training-prediction
-BrC1COC(C1)(Cn1cncn1)c1ccc(cc1Cl)Cl,0.017185,0.028600129544374974,1,training-prediction
-CCN(C(=O)SCc1ccc(cc1)Cl)CC,0.0180385,0.018489302950787725,1,training-prediction
-CCCN(C(=O)n1cncc1)CCOc1c(Cl)cc(cc1Cl)Cl,0.019912,0.026822534406393834,1,training-prediction
-CCOC(=O)c1cn2nc(cc2nc1C)OP(=S)(OCC)OCC,0.0200875,0.11679267971674383,1,training-prediction
-COP(=S)(SCN1C(=O)c2c(C1=O)cccc2)OC,0.020484,0.03656348401256463,1,training-prediction
-CNC(=O)ON=C(C(=O)N(C)C)SC,0.022348,0.02628660622078008,1,training-prediction
-COC(=O)N(c1ccccc1COc1ccn(n1)c1ccc(cc1)Cl)OC,0.023207,0.09137694501130139,1,training-prediction
-CON(C(=O)Nc1ccc(c(c1)Cl)Cl)C,0.025091,0.013151125795315764,1,training-prediction
-O=C(NC(=O)c1c(F)cccc1F)Nc1ccc(cc1)Cl,0.025428,0.1397706187651392,1,training-prediction
-O=C(N(C)C)Nc1ccc(c(c1)Cl)Cl,0.025741,0.10989045521624603,1,training-prediction
-CCOC(=O)C(Oc1ccc(cc1)Oc1cnc2c(n1)ccc(c2)Cl)C,0.0257509,0.05260415007019946,1,training-prediction
-N#Cc1sc2=c(sc1C#N)c(=O)c1c(c2=O)cccc1,0.0269975,0.20283244217430776,1,training-prediction
-C=CC1(C)OC(=O)N(C1=O)c1cc(Cl)cc(c1)Cl,0.027961,0.15193157204083732,1,training-prediction
-N#CC(c1ccc(c(c1)Oc1ccccc1)F)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,0.028207,0.04274266752996184,1,training-prediction
-CCOC(=O)C(Cc1cc(c(cc1Cl)F)n1nc(n(c1=O)C(F)F)C)Cl,0.029113,0.16324594176713225,1,training-prediction
-CON(C(=O)Nc1ccc(cc1)Br)C,0.0313005,0.05246320059678898,1,training-prediction
-CN1CN(C)CSC1=S,0.03266,0.05748016757235302,1,training-prediction
-ClCC1CN(C(=O)C1Cl)c1cccc(c1)C(F)(F)F,0.033160999999999996,0.04469562555717614,1,training-prediction
-CCCCC(c1ccc(cc1)Cl)(Cn1cncn1)C#N,0.034179,0.1328889568108142,1,training-prediction
-CCOP(=S)(SCn1c(=O)oc2c1ccc(c2)Cl)OCC,0.0345288,0.003341107232217978,1,training-prediction
-CCOCn1c(c2ccc(cc2)Cl)c(c(c1C(F)(F)F)Br)C#N,0.0350825,0.05914997942399124,1,training-prediction
-Clc1ccc(c(c1)Cl)C1(OCCO1)Cn1cncn1,0.0379825,0.06987596762503774,1,training-prediction
-CC(C(c1cncnc1)(c1ccc(cc1)OC(F)(F)F)O)C,0.038746,0.05734371433548944,1,training-prediction
-CC(N(C(=O)SCC(=C(Cl)Cl)Cl)C(C)C)C,0.041029,0.034659116233835524,1,training-prediction
-COP(=S)(Oc1ccc(c(c1)C)[N+](=O)[O-])OC,0.0424911,0.0025094363530126854,1,training-prediction
-O=C1OC(C(=O)N1Nc1ccccc1)(C)c1ccc(cc1)Oc1ccccc1,0.04514,0.0451334899121889,1,training-prediction
-CN(C(=S)SSC(=S)N(C)C)C,0.04783,0.06926194842492117,1,training-prediction
-CN(C(=O)Oc1nc(nc(c1C)C)N(C)C)C,0.0520385,0.030679541298093704,1,training-prediction
-C=CCOC(c1ccc(cc1Cl)Cl)Cn1cncc1,0.053503,0.013704285647549115,1,training-prediction
-COCN(c1c(CC)cccc1CC)C(=O)CCl,0.0537505,0.5608944524211554,1,training-prediction
-CCOc1ccc2c(c1)C(=CC(N2)(C)C)C,0.055221,0.3806288619670735,1,training-prediction
-O=C(c1ccc(cc1S(=O)(=O)C)C(F)(F)F)c1cnoc1C1CC1,0.055661,0.029566929200617712,1,training-prediction
-OC(=O)COc1ccc(cc1C)Cl,0.0573225,0.12085906204575554,1,training-prediction
-CCOC(=O)NCCOc1ccc(cc1)Oc1ccccc1,0.0575765,0.20536079691300863,1,training-prediction
-N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C(c1ccc(cc1)Cl)C(C)C,0.059538,0.00852789812015851,1,training-prediction
-N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)(C)C,0.060099,0.026958947205883887,1,training-prediction
-CC(OC(=O)C(c1ccc(cc1)Br)(c1ccc(cc1)Br)O)C,0.0606145,0.020841190814278196,1,training-prediction
-Clc1ccccc1c1nnc(nn1)c1ccccc1Cl,0.06152150000000001,0.1399543448524091,1,training-prediction
-Clc1cc(ccc1Oc1ccc(c(c1)C(=O)NS(=O)(=O)C)[N+](=O)[O-])C(F)(F)F,0.062678,0.1405937915030098,1,training-prediction
-COc1nc(nc(n1)C)NC(=O)NS(=O)(=O)c1ccccc1Cl,0.062889,0.08792722420062483,1,training-prediction
-CCCCN(SN(C(=O)Oc1cccc2c1OC(C2)(C)C)C)CCCC,0.065695,0.02839365100101789,1,training-prediction
-O=C(NC(=O)c1c(F)cccc1F)Nc1cc(Cl)c(c(c1F)Cl)F,0.06691,0.07597385658090804,1,training-prediction
-CSc1nnc(c(=O)n1N)C(C)(C)C,0.067199,0.01370099999999999,0.11538461538461539,training-prediction
-O=C(NC(=O)c1c(F)cccc1F)Nc1ccc(c(c1)Cl)OC(C(OC(F)(F)F)F)(F)F,0.06758600000000001,0.05038188357688792,1,training-prediction
-CCOc1ccc(cc1)C(COCc1cccc(c1)Oc1ccccc1)(C)C,0.068395,0.1116456809729931,1,training-prediction
-C#CCOS(=O)OC1CCCCC1Oc1ccc(cc1)C(C)(C)C,0.06905,0.11430467855215604,1,training-prediction
-OC(=O)COc1nc(Cl)c(cc1Cl)Cl,0.07213454999999999,0.05814521838332604,1,training-prediction
-O=C(N(C)C)Nc1cccc(c1)C(F)(F)F,0.073957,0.04001363451555916,1,training-prediction
-OC(C(C)(C)C)C(n1cncn1)Oc1ccc(cc1)c1ccccc1,0.074093,0.2057465588486493,1,training-prediction
-CCNc1nc(NC(C)C)nc(n1)Cl,0.077892,0.09557547621401412,1,training-prediction
-O=C(C1=C(C)OCCS1)Nc1ccccc1,0.0811745,0.15573428524718813,1,training-prediction
-CNC(=O)Oc1cc(C)c(c(c1)C)SC,0.0827735,0.017454393396993218,1,training-prediction
-OC(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,0.084527,0.09609312639982802,1,training-prediction
-O=C(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,0.085107,0.0611332279073438,1,training-prediction
-CC(NC(=O)N1CC(=O)N(C1=O)c1cc(Cl)cc(c1)Cl)C,0.08783250000000001,0.06681328467405699,1,training-prediction
-CCC(=O)Nc1ccc(c(c1)Cl)Cl,0.09171,0.2915369419467815,1,training-prediction
-N#CC(c1ccccc1)(Cn1cncn1)CCc1ccc(cc1)Cl,0.092038,0.0965869480256574,1,training-prediction
-ClC(=CC1C(C1(C)C)C(=O)OCc1cccc(c1)Oc1ccccc1)Cl,0.095836,0.12663135322831742,1,training-prediction
-OC(=O)C(Oc1ccc(cc1Cl)Cl)C,0.0967835,0.03322294575694361,1,training-prediction
-COP(=O)(NC(=O)C)SC,0.1023645,0.020049393739713695,1,training-prediction
-O=C1N(c2cc(Cl)cc(c2)Cl)C(=O)C2(C1(C)C2)C,0.1108605,0.117838029035702,1,training-prediction
-COC(=O)c1c(nc(c(c1CC(C)C)C1=NCCS1)C(F)(F)F)C(F)F,0.11151,0.06043692115939837,1,training-prediction
-Clc1ccc(cc1)CCC(C(C)(C)C)(Cn1cncn1)O,0.115167,0.05895345220093288,1,training-prediction
-COC(=O)NC(=S)Nc1ccccc1NC(=S)NC(=O)OC,0.12412500000000001,0.2482244502543378,1,training-prediction
-CCOc1cc(ccc1[N+](=O)[O-])Oc1ccc(cc1Cl)C(F)(F)F,0.12856,0.11079797742359224,1,training-prediction
-CCc1ccc(cc1)C(=O)NN(C(C)(C)C)C(=O)c1cc(C)cc(c1)C,0.13674999999999998,1.1006126894921346,1,training-prediction
-c1scc(n1)c1nc2c([nH]1)cccc2,0.14907,0.10738182341473035,1,training-prediction
-CC(N1C(=O)c2ccccc2NS1(=O)=O)C,0.14983,0.7288084834900276,1,training-prediction
-N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,0.150135,0.030035240719250313,1,training-prediction
-Cc1cccc2c1n1cnnc1s2,0.150605,0.3285321844774098,1,training-prediction
-ClC(SN1C(=O)c2c(C1=O)cccc2)(Cl)Cl,0.15173999999999999,0.20071832491891514,1,training-prediction
-CCC(Nc1c(cc(cc1[N+](=O)[O-])C(C)(C)C)[N+](=O)[O-])C,0.153385,0.050879540298715914,1,training-prediction
-Cc1nc(Nc2ccccc2)nc(c1)C1CC1,0.15802,0.5553748019083816,1,training-prediction
-c1ccc(cc1)Nc1ccccc1,0.16546,0.38230959751715715,1,training-prediction
-Clc1cc(Cl)c(cc1n1nc(n(c1=O)C(F)F)C)NS(=O)(=O)C,0.17304,0.02384153039577781,1,training-prediction
-CCCC1COC(O1)(Cn1cncn1)c1ccc(cc1Cl)Cl,0.176786,0.04467686699937855,1,training-prediction
-C#CCC1=C(C)C(CC1=O)OC(=O)C1C(C1(C)C)C=C(C)C,0.18558899999999998,0.10572035414293104,1,training-prediction
-COc1nc(nc(n1)C)NC(=O)NS(=O)(=O)c1ccccc1CCC(F)(F)F,0.200175,0.07901150381157088,1,training-prediction
-CC(=CC1C(C1(C)C)C(=O)OCc1coc(c1)Cc1ccccc1)C,0.2068305,0.22838174280166534,1,training-prediction
-CCOCN(c1c(C)cccc1CC)C(=O)CCl,0.21666999999999997,0.13739679956582262,1,training-prediction
-CC(Nc1nc(NC(C)C)nc(n1)Cl)C,0.219845,0.06430354395190928,1,training-prediction
-CNC(=O)Oc1ccccc1OC(C)C,0.23418,0.013725243045764276,1,training-prediction
-CCCCCCCCSC(=O)Oc1cc(Cl)nnc1c1ccccc1,0.24081999999999998,0.13659751142830953,1,training-prediction
-O=C1N(OCC1(C)C)Cc1ccccc1Cl,0.253443,0.06439799828920362,1,training-prediction
-Cc1ccc(cc1)N(S(=O)(=O)N(C)C)SC(Cl)(Cl)F,0.25917,0.07245747881555553,1,training-prediction
-CC1N(C(=O)NC2CCCCC2)C(=O)SC1c1ccc(cc1)Cl,0.259436,0.03487898392097964,1,training-prediction
-CCN(C(=O)C(Oc1cccc2c1cccc2)C)CC,0.271895,0.8706009631290119,1,training-prediction
-OC(=O)COc1ccc(cc1Cl)Cl,0.2805,0.08845179227631193,1,training-prediction
-CCOC(=O)CC(C(=O)OCC)SP(=S)(OC)OC,0.292105,0.02711766232686377,1,training-prediction
-CNC(=O)Oc1cccc2c1cccc2,0.29818,0.19393577674782117,1,training-prediction
-Clc1ccc(cc1)CN(C(=O)Nc1ccccc1)C1CCCC1,0.31170800000000004,0.06922818715332912,1,training-prediction
-ClCCOc1ccccc1S(=O)(=O)NC(=O)Nc1nc(C)nc(n1)OC,0.31207450000000003,0.18963884941795092,1,training-prediction
-CCC(n1c(=O)[nH]c(c(c1=O)Br)C)C,0.31690999999999997,0.2868010916268421,1,training-prediction
-ClC(SN1C(=O)C2C(C1=O)CC=CC2)(Cl)Cl,0.32935499999999995,0.07976078042600035,1,training-prediction
-CC(=CC1C(C1(C)C)C(=O)OCc1cccc(c1)Oc1ccccc1)C,0.42802,0.06099139561841655,1,training-prediction
-CC(Oc1ccccn1)COc1ccc(cc1)Oc1ccccc1,0.433615,0.06873630510174474,1,training-prediction
-N#Cc1c[nH]cc1c1cccc2c1OC(O2)(F)F,0.449265,0.7723832498999111,1,training-prediction
-Clc1c(Cl)c([N+](=O)[O-])c(c(c1Cl)Cl)Cl,0.47404,0.051655712417785805,1,training-prediction
-COc1nc(nc(n1)C)NC(=O)NS(=O)(=O)c1ccccc1C(=O)OC,0.48379,0.08591404752056958,1,training-prediction
-Nc1nc(NC2CC2)nc(n1)N,0.514491,0.162564088288422,1,training-prediction
-CCCCOCCOCCOCc1cc2OCOc2cc1CCC,0.51708,0.33736880263337066,1,training-prediction
-COCC(N(c1c(C)cccc1CC)C(=O)CCl)C,0.52855,0.2025217756930338,1,training-prediction
-O=Cc1ccco1,0.62445,0.12487,0.1111111111111111,training-prediction
-[O-][N+](=O)c1cc(Cl)c(c(c1)Cl)N,0.72459,0.09940992923363313,1,training-prediction
-COC(=O)c1c(Cl)c(Cl)c(c(c1Cl)Cl)C(=O)OC,0.768162,0.3068265745479209,1,training-prediction
-COC(=O)NS(=O)(=O)c1ccc(cc1)N,0.78179,0.3077280396919008,1,training-prediction
-OC(=O)c1nc(Cl)c(c(c1Cl)N)Cl,0.8283,1.5935322771354081,1,training-prediction
-CC(C1(C)N=C(NC1=O)c1nc2ccccc2cc1C(=O)O)C,0.8351195,0.5362888665627945,1,training-prediction
-COC(=O)Nc1nc2c([nH]1)cccc2,0.8499450000000001,0.03427857980425423,1,training-prediction
-CC(Oc1cccc(c1)NC(=O)c1ccccc1C(F)(F)F)C,0.907795,0.16556720793841956,1,training-prediction
-O=C(C1(C)CCCCC1)Nc1ccc(c(c1Cl)Cl)O,0.96626,0.38098624299804507,1,training-prediction
-ClCCP(=O)(O)O,0.9723550000000001,2.415446889200535,1,training-prediction
-COc1cccc(c1C)C(=O)NN(C(C)(C)C)C(=O)c1cc(C)cc(c1)C,1.1154,0.7700149356098216,1,training-prediction
-CCC(c1noc(c1)NC(=O)c1c(OC)cccc1OC)(CC)C,1.5855,0.4882234386036716,1,training-prediction
-OC(=O)COc1nc(F)c(c(c1Cl)N)Cl,1.9605,0.07824793915942921,1,training-prediction
-CC(OC(=O)Nc1cccc(c1)Cl)C,2.3402,0.2668742137819341,1,training-prediction
-Oc1ccccc1c1ccccc1,3.1197,0.8616321068040766,1,training-prediction
-OC(=O)CNCP(=O)(O)O,5.5597,0.7008374592406026,0.16666666666666666,training-prediction
+SMILES,LOAEL_measured_median,LOAEL_predicted,RMSE,Dataset
+N#Cc1nn(c(c1S(=O)C(F)(F)F)N)c1c(Cl)cc(cc1Cl)C(F)(F)F,0.00013611,0.01148216373256843,2.50203588100455,training-prediction
+OC(=O)C(Oc1ccc(cc1)Oc1ncc(cc1Cl)C(F)(F)F)C,0.00027647,0.012370470701925918,2.96678720091176,training-prediction
+CCOP(=S)(SCSC(C)(C)C)OCC,0.00027736000000000004,0.003520511683277722,4.096959657662574,training-prediction
+CCSCSP(=S)(OCC)OCC,0.00061449,0.0007932432885978227,4.143716547104354,training-prediction
+CCOP(=O)(SC(CC)C)SC(CC)C,0.000872805,0.005020259791233163,3.141528174312553,training-prediction
+CNC(=O)CSP(=S)(OC)OC,0.0010905,0.009515127939063834,2.7248472071072367,training-prediction
+COP(=O)(SC)N,0.0020549,0.33146588454802073,3.9057367218938923,training-prediction
+CSc1ccc(cc1C)OP(=S)(OC)OC,0.00210185,0.0052770620054827895,3.0208901266689896,training-prediction
+CCOP(=S)(Oc1ccc2c(c1)oc(=O)c(c2C)Cl)OCC,0.0033631,0.036592963422387284,3.4666873902326225,training-prediction
+CCOP(=S)(Oc1ncn(n1)c1ccccc1)OCC,0.0041492,0.02113200361753712,4.101840914099025,training-prediction
+COc1sc(=O)n(n1)CSP(=S)(OC)OC,0.00471335,0.0098233419774848,3.0154352332341747,training-prediction
+CCOP(=S)(OCC)SCSP(=S)(OCC)OCC,0.0049418,0.002892472589687806,3.620603430723068,training-prediction
+CCOP(=O)(Oc1ccc(c(c1)C)SC)NC(C)C,0.0049447,0.004831305474311603,3.0021229335197086,training-prediction
+CC(Cc1ccc(cc1)C(C)(C)C)CN1CC(C)OC(C1)C,0.0056016,0.06867843129491749,2.627434307313138,training-prediction
+COP(=S)(Oc1nc(Cl)c(cc1Cl)Cl)OC,0.00620095,0.008969949496846924,3.371259285544057,training-prediction
+OC(C(Cl)(Cl)Cl)(c1ccc(cc1)Cl)c1ccc(cc1)Cl,0.0067479,0.057540846007704954,3.4962455041242095,training-prediction
+COC(=O)N(C(=O)N1COC2(C(=N1)c1ccc(cc1C2)Cl)C(=O)OC)c1ccc(cc1)OC(F)(F)F,0.0068203,0.03706152163770414,3.5159737673405207,training-prediction
+Clc1ccc(cc1)C(c1ccccc1Cl)(c1cncnc1)O,0.0075484,0.024556552975579942,3.543849679599208,training-prediction
+COP(=S)(SCn1nnc2c(c1=O)cccc2)OC,0.0076105,0.02570303216353656,3.031409127947188,training-prediction
+N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Br)Br,0.00781875,0.042853347293390964,2.5868941564090173,training-prediction
+CCN(c1nc(cc(n1)C)OP(=S)(OC)OC)CC,0.0081878,0.016092506732843858,3.175584622821418,training-prediction
+CCCCOC(=O)C(Oc1ccc(cc1)Oc1ccc(cn1)C(F)(F)F)C,0.00885585,0.006429454339386637,3.0257820867252248,training-prediction
+COP(=O)(OC=C(Cl)Cl)OC,0.0100688,0.03379255796926954,2.096256646792533,training-prediction
+CCCSP(=O)(SCCC)OCC,0.010069,0.012657180818924377,3.0700803912827355,training-prediction
+O=S1OCC2C(CO1)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,0.012287,0.00022929597195349384,2.8126091146317136,training-prediction
+CC(Oc1cc(c(cc1Cl)Cl)n1nc(oc1=O)C(C)(C)C)C,0.0124555,0.0639086556427057,2.318069263072499,training-prediction
+CNC(=O)Oc1cccc2c1OC(C2)(C)C,0.01394355,0.04063021221263945,3.2081619962916004,training-prediction
+N#Cc1c(Cl)c(C#N)c(c(c1Cl)Cl)Cl,0.015043,0.0038259351864843435,2.828036622058189,training-prediction
+Fc1ccc(cc1)[Si](c1ccc(cc1)F)(Cn1cncn1)C,0.015853,0.012131065927580167,2.134105831278874,training-prediction
+CCOP(=S)(Oc1cc(C)nc(n1)C(C)C)OCC,0.016429,0.015095535744907507,2.7118315853213457,training-prediction
+N#Cc1c(Cl)cccc1Cl,0.0165685,0.09350368116996392,3.7543450446163025,training-prediction
+CCOP(=S)(Oc1nc(Cl)c(cc1Cl)Cl)OCC,0.017114,0.0021312647369153568,3.362583568267852,training-prediction
+BrC1COC(C1)(Cn1cncn1)c1ccc(cc1Cl)Cl,0.017185,0.028600129544374957,3.4125586919570363,training-prediction
+CCN(C(=O)SCc1ccc(cc1)Cl)CC,0.0180385,0.018489302950787715,3.44833198841459,training-prediction
+CCCN(C(=O)n1cncc1)CCOc1c(Cl)cc(cc1Cl)Cl,0.019912,0.02682253440639382,3.593069894269287,training-prediction
+CCOC(=O)c1cn2nc(cc2nc1C)OP(=S)(OCC)OCC,0.0200875,0.11679267971674347,3.181779963781186,training-prediction
+COP(=S)(SCN1C(=O)c2c(C1=O)cccc2)OC,0.020484,0.03656348401256467,3.5017614037305633,training-prediction
+CNC(=O)ON=C(C(=O)N(C)C)SC,0.022348,0.02628660622078013,2.334954711107271,training-prediction
+COC(=O)N(c1ccccc1COc1ccn(n1)c1ccc(cc1)Cl)OC,0.023207,0.09137694501130163,3.547065192836642,training-prediction
+CON(C(=O)Nc1ccc(c(c1)Cl)Cl)C,0.025091,0.013151125795315764,3.3304793383947975,training-prediction
+O=C(NC(=O)c1c(F)cccc1F)Nc1ccc(cc1)Cl,0.025428,0.13977061876513977,3.3600677057761423,training-prediction
+O=C(N(C)C)Nc1ccc(c(c1)Cl)Cl,0.025741,0.10989045521624598,3.1599772326141378,training-prediction
+CCOC(=O)C(Oc1ccc(cc1)Oc1cnc2c(n1)ccc(c2)Cl)C,0.0257509,0.052604150070199376,3.787253425794392,training-prediction
+N#Cc1sc2=c(sc1C#N)c(=O)c1c(c2=O)cccc1,0.0269975,0.2028324421743077,1.9535985499071897,training-prediction
+C=CC1(C)OC(=O)N(C1=O)c1cc(Cl)cc(c1)Cl,0.027961,0.1519315720408373,3.1812227539091382,training-prediction
+N#CC(c1ccc(c(c1)Oc1ccccc1)F)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,0.028207,0.04274266752996182,2.8273565466689945,training-prediction
+CCOC(=O)C(Cc1cc(c(cc1Cl)F)n1nc(n(c1=O)C(F)F)C)Cl,0.029113,0.16324594176713234,2.3613166734445716,training-prediction
+CON(C(=O)Nc1ccc(cc1)Br)C,0.0313005,0.05246320059678898,2.707752797433881,training-prediction
+CN1CN(C)CSC1=S,0.03266,0.05748016757235302,1.8159095313681286,training-prediction
+ClCC1CN(C(=O)C1Cl)c1cccc(c1)C(F)(F)F,0.033160999999999996,0.04469562555717614,2.824390578593975,training-prediction
+CCCCC(c1ccc(cc1)Cl)(Cn1cncn1)C#N,0.034179,0.13288895681081428,3.192951497627121,training-prediction
+CCOP(=S)(SCn1c(=O)oc2c1ccc(c2)Cl)OCC,0.0345288,0.003341107232217985,3.5818281864905543,training-prediction
+CCOCn1c(c2ccc(cc2)Cl)c(c(c1C(F)(F)F)Br)C#N,0.0350825,0.05914997942399128,3.8236633577384587,training-prediction
+Clc1ccc(c(c1)Cl)C1(OCCO1)Cn1cncn1,0.0379825,0.06987596762503753,3.495719295382636,training-prediction
+CC(C(c1cncnc1)(c1ccc(cc1)OC(F)(F)F)O)C,0.038746,0.05734371433548944,2.3447135027449666,training-prediction
+CC(N(C(=O)SCC(=C(Cl)Cl)Cl)C(C)C)C,0.041029,0.034659116233835524,2.430667251784089,training-prediction
+COP(=S)(Oc1ccc(c(c1)C)[N+](=O)[O-])OC,0.0424911,0.0025094363530126854,2.941002431011367,training-prediction
+O=C1OC(C(=O)N1Nc1ccccc1)(C)c1ccc(cc1)Oc1ccccc1,0.04514,0.04513348991218881,2.8472334793287173,training-prediction
+CN(C(=S)SSC(=S)N(C)C)C,0.04783,0.06926194842492117,2.521510285864016,training-prediction
+CN(C(=O)Oc1nc(nc(c1C)C)N(C)C)C,0.0520385,0.030679541298093704,2.0064360672931305,training-prediction
+C=CCOC(c1ccc(cc1Cl)Cl)Cn1cncc1,0.053503,0.013704285647549115,3.6544479256510884,training-prediction
+COCN(c1c(CC)cccc1CC)C(=O)CCl,0.0537505,0.5608944524211552,2.278196981596353,training-prediction
+CCOc1ccc2c(c1)C(=CC(N2)(C)C)C,0.055221,0.3806288619670735,1.8356811722691329,training-prediction
+O=C(c1ccc(cc1S(=O)(=O)C)C(F)(F)F)c1cnoc1C1CC1,0.055661,0.029566929200617712,3.3344323904112727,training-prediction
+OC(=O)COc1ccc(cc1C)Cl,0.0573225,0.12085906204575551,3.651142123730359,training-prediction
+CCOC(=O)NCCOc1ccc(cc1)Oc1ccccc1,0.0575765,0.20536079691300743,3.980747743098337,training-prediction
+N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C(c1ccc(cc1)Cl)C(C)C,0.059538,0.00852789812015851,3.2887713627130757,training-prediction
+N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)(C)C,0.060099,0.02695894720588386,3.039677019615921,training-prediction
+CC(OC(=O)C(c1ccc(cc1)Br)(c1ccc(cc1)Br)O)C,0.0606145,0.020841190814278196,1.7310945627516126,training-prediction
+Clc1ccccc1c1nnc(nn1)c1ccccc1Cl,0.06152150000000001,0.13995434485240923,3.466968103576205,training-prediction
+Clc1cc(ccc1Oc1ccc(c(c1)C(=O)NS(=O)(=O)C)[N+](=O)[O-])C(F)(F)F,0.062678,0.1405937915030098,3.1451566046433825,training-prediction
+COc1nc(nc(n1)C)NC(=O)NS(=O)(=O)c1ccccc1Cl,0.062889,0.08792722420062474,3.3826365723208114,training-prediction
+CCCCN(SN(C(=O)Oc1cccc2c1OC(C2)(C)C)C)CCCC,0.065695,0.028393651001017862,2.4553879784399055,training-prediction
+O=C(NC(=O)c1c(F)cccc1F)Nc1cc(Cl)c(c(c1F)Cl)F,0.06691,0.07597385658090808,2.8880531225052466,training-prediction
+O=C(NC(=O)c1c(F)cccc1F)Nc1ccc(c(c1)Cl)OC(C(OC(F)(F)F)F)(F)F,0.06758600000000001,0.05038188357688795,3.046389333757553,training-prediction
+CCOc1ccc(cc1)C(COCc1cccc(c1)Oc1ccccc1)(C)C,0.068395,0.11164568097299299,3.4233813002692206,training-prediction
+C#CCOS(=O)OC1CCCCC1Oc1ccc(cc1)C(C)(C)C,0.06905,0.11430467855215601,2.3194446452006643,training-prediction
+OC(=O)COc1nc(Cl)c(cc1Cl)Cl,0.07213454999999999,0.05814521838332604,2.647453514097005,training-prediction
+O=C(N(C)C)Nc1cccc(c1)C(F)(F)F,0.073957,0.04001363451555918,3.32792411627734,training-prediction
+OC(C(C)(C)C)C(n1cncn1)Oc1ccc(cc1)c1ccccc1,0.074093,0.20574655884864926,2.3446334070839496,training-prediction
+CCNc1nc(NC(C)C)nc(n1)Cl,0.077892,0.09557547621401412,1.9039742236221406,training-prediction
+O=C(C1=C(C)OCCS1)Nc1ccccc1,0.0811745,0.15573428524718802,2.930572314573659,training-prediction
+CNC(=O)Oc1cc(C)c(c(c1)C)SC,0.0827735,0.01745439339699323,2.586471594221142,training-prediction
+OC(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,0.084527,0.09609312639982798,3.374324054325346,training-prediction
+O=C(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,0.085107,0.06113322790734389,3.3271222293523866,training-prediction
+CC(NC(=O)N1CC(=O)N(C1=O)c1cc(Cl)cc(c1)Cl)C,0.08783250000000001,0.06681328467405695,3.5828942027397437,training-prediction
+CCC(=O)Nc1ccc(c(c1)Cl)Cl,0.09171,0.2915369419467815,3.688388602411302,training-prediction
+N#CC(c1ccccc1)(Cn1cncn1)CCc1ccc(cc1)Cl,0.092038,0.0965869480256572,3.241595818170502,training-prediction
+ClC(=CC1C(C1(C)C)C(=O)OCc1cccc(c1)Oc1ccccc1)Cl,0.095836,0.12663135322831737,3.15715357466162,training-prediction
+OC(=O)C(Oc1ccc(cc1Cl)Cl)C,0.0967835,0.03322294575694369,3.417804916802632,training-prediction
+COP(=O)(NC(=O)C)SC,0.1023645,0.020049393739713695,3.878261669415961,training-prediction
+O=C1N(c2cc(Cl)cc(c2)Cl)C(=O)C2(C1(C)C2)C,0.1108605,0.117838029035702,3.3995325666572502,training-prediction
+COC(=O)c1c(nc(c(c1CC(C)C)C1=NCCS1)C(F)(F)F)C(F)F,0.11151,0.06043692115939849,3.982497872375784,training-prediction
+Clc1ccc(cc1)CCC(C(C)(C)C)(Cn1cncn1)O,0.115167,0.05895345220093288,3.1895822956702626,training-prediction
+COC(=O)NC(=S)Nc1ccccc1NC(=S)NC(=O)OC,0.12412500000000001,0.24822445025433784,3.0892157988934805,training-prediction
+CCOc1cc(ccc1[N+](=O)[O-])Oc1ccc(cc1Cl)C(F)(F)F,0.12856,0.11079797742359244,3.300058402766625,training-prediction
+CCc1ccc(cc1)C(=O)NN(C(C)(C)C)C(=O)c1cc(C)cc(c1)C,0.13674999999999998,1.1006126894921355,2.3315137189940756,training-prediction
+c1scc(n1)c1nc2c([nH]1)cccc2,0.14907,0.10738182341473035,3.018929919179956,training-prediction
+CC(N1C(=O)c2ccccc2NS1(=O)=O)C,0.14983,0.7288084834900276,3.1454859847588374,training-prediction
+N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,0.150135,0.030035240719250237,2.7242814565365965,training-prediction
+Cc1cccc2c1n1cnnc1s2,0.150605,0.3285321844774099,2.2740925300929558,training-prediction
+ClC(SN1C(=O)c2c(C1=O)cccc2)(Cl)Cl,0.15173999999999999,0.20071832491891514,3.1033827451063387,training-prediction
+CCC(Nc1c(cc(cc1[N+](=O)[O-])C(C)(C)C)[N+](=O)[O-])C,0.153385,0.050879540298715914,3.152674591902268,training-prediction
+Cc1nc(Nc2ccccc2)nc(c1)C1CC1,0.15802,0.5553748019083816,2.666933643769351,training-prediction
+c1ccc(cc1)Nc1ccccc1,0.16546,0.3823095975171572,3.446253862110085,training-prediction
+Clc1cc(Cl)c(cc1n1nc(n(c1=O)C(F)F)C)NS(=O)(=O)C,0.17304,0.02384153039577781,1.2178052432990008,training-prediction
+CCCC1COC(O1)(Cn1cncn1)c1ccc(cc1Cl)Cl,0.176786,0.04467686699937855,3.341815479540678,training-prediction
+C#CCC1=C(C)C(CC1=O)OC(=O)C1C(C1(C)C)C=C(C)C,0.18558899999999998,0.10572035414293121,2.1480487748901496,training-prediction
+COc1nc(nc(n1)C)NC(=O)NS(=O)(=O)c1ccccc1CCC(F)(F)F,0.200175,0.07901150381157088,2.4253343320904417,training-prediction
+CC(=CC1C(C1(C)C)C(=O)OCc1coc(c1)Cc1ccccc1)C,0.2068305,0.22838174280166487,2.2337568783760804,training-prediction
+CCOCN(c1c(C)cccc1CC)C(=O)CCl,0.21666999999999997,0.13739679956582262,2.5193852538571737,training-prediction
+CC(Nc1nc(NC(C)C)nc(n1)Cl)C,0.219845,0.06430354395190928,2.1960439575422828,training-prediction
+CNC(=O)Oc1ccccc1OC(C)C,0.23418,0.01372524304576431,3.146482979301219,training-prediction
+CCCCCCCCSC(=O)Oc1cc(Cl)nnc1c1ccccc1,0.24081999999999998,0.13659751142830953,2.2713062185985957,training-prediction
+O=C1N(OCC1(C)C)Cc1ccccc1Cl,0.253443,0.06439799828920355,3.0695908806093946,training-prediction
+Cc1ccc(cc1)N(S(=O)(=O)N(C)C)SC(Cl)(Cl)F,0.25917,0.07245747881555553,1.3950486742546513,training-prediction
+CC1N(C(=O)NC2CCCCC2)C(=O)SC1c1ccc(cc1)Cl,0.259436,0.03487898392097961,3.1089917030303966,training-prediction
+CCN(C(=O)C(Oc1cccc2c1cccc2)C)CC,0.271895,0.8706009631290124,2.5383305675909127,training-prediction
+OC(=O)COc1ccc(cc1Cl)Cl,0.2805,0.08845179227631189,3.4880362878035625,training-prediction
+CCOC(=O)CC(C(=O)OCC)SP(=S)(OC)OC,0.292105,0.027117662326863728,3.1743301112895788,training-prediction
+CNC(=O)Oc1cccc2c1cccc2,0.29818,0.19393577674782086,3.148794658645996,training-prediction
+Clc1ccc(cc1)CN(C(=O)Nc1ccccc1)C1CCCC1,0.31170800000000004,0.0692281871533292,3.6138846095129655,training-prediction
+ClCCOc1ccccc1S(=O)(=O)NC(=O)Nc1nc(C)nc(n1)OC,0.31207450000000003,0.18963884941795092,2.2887209265857713,training-prediction
+CCC(n1c(=O)[nH]c(c(c1=O)Br)C)C,0.31690999999999997,0.2868010916268421,1.0000000000000004,training-prediction
+ClC(SN1C(=O)C2C(C1=O)CC=CC2)(Cl)Cl,0.32935499999999995,0.07976078042600035,2.7514377571569284,training-prediction
+CC(=CC1C(C1(C)C)C(=O)OCc1cccc(c1)Oc1ccccc1)C,0.42802,0.06099139561841661,3.0358460869991877,training-prediction
+CC(Oc1ccccn1)COc1ccc(cc1)Oc1ccccc1,0.433615,0.0687363051017446,3.2841958991836697,training-prediction
+N#Cc1c[nH]cc1c1cccc2c1OC(O2)(F)F,0.449265,0.7723832498999122,2.3805791967901175,training-prediction
+Clc1c(Cl)c([N+](=O)[O-])c(c(c1Cl)Cl)Cl,0.47404,0.05165571241778583,3.38825585540646,training-prediction
+COc1nc(nc(n1)C)NC(=O)NS(=O)(=O)c1ccccc1C(=O)OC,0.48379,0.08591404752056958,3.146777183988822,training-prediction
+Nc1nc(NC2CC2)nc(n1)N,0.514491,0.162564088288422,1.807175568931721,training-prediction
+CCCCOCCOCCOCc1cc2OCOc2cc1CCC,0.51708,0.33736880263337077,1.2267229745657229,training-prediction
+COCC(N(c1c(C)cccc1CC)C(=O)CCl)C,0.52855,0.20252177569303392,2.5393659750875903,training-prediction
+[O-][N+](=O)c1cc(Cl)c(c(c1)Cl)N,0.72459,0.09940992923363318,3.53198294945023,training-prediction
+COC(=O)c1c(Cl)c(Cl)c(c(c1Cl)Cl)C(=O)OC,0.768162,0.306826574547921,3.3273598548158856,training-prediction
+COC(=O)NS(=O)(=O)c1ccc(cc1)N,0.78179,0.3077280396919012,2.965384174767014,training-prediction
+OC(=O)c1nc(Cl)c(c(c1Cl)N)Cl,0.8283,1.5935322771354081,2.7844642412771186,training-prediction
+CC(C1(C)N=C(NC1=O)c1nc2ccccc2cc1C(=O)O)C,0.8351195,0.5362888665627945,1.8629635190181286,training-prediction
+COC(=O)Nc1nc2c([nH]1)cccc2,0.8499450000000001,0.034278579804254194,2.951850978563035,training-prediction
+CC(Oc1cccc(c1)NC(=O)c1ccccc1C(F)(F)F)C,0.907795,0.16556720793841953,3.2058426416353965,training-prediction
+O=C(C1(C)CCCCC1)Nc1ccc(c(c1Cl)Cl)O,0.96626,0.38098624299804523,2.756429191919761,training-prediction
+ClCCP(=O)(O)O,0.9723550000000001,2.415446889200535,1.7323244335698151,training-prediction
+COc1cccc(c1C)C(=O)NN(C(C)(C)C)C(=O)c1cc(C)cc(c1)C,1.1154,0.7700149356098193,2.7023653447607545,training-prediction
+CCC(c1noc(c1)NC(=O)c1c(OC)cccc1OC)(CC)C,1.5855,0.4882234386036716,2.503057223550509,training-prediction
+OC(=O)COc1nc(F)c(c(c1Cl)N)Cl,1.9605,0.07824793915942921,3.068409923510699,training-prediction
+CC(OC(=O)Nc1cccc(c1)Cl)C,2.3402,0.2668742137819336,3.5127783187534583,training-prediction
+Oc1ccccc1c1ccccc1,3.1197,0.8616321068040744,2.9403623685003084,training-prediction
diff --git a/data/training-test-predictions.id b/data/training-test-predictions.id
index 2dcc43a..4bce6b7 100644
--- a/data/training-test-predictions.id
+++ b/data/training-test-predictions.id
@@ -1 +1 @@
-56d5de732b72ed162f000005
+56d6eed42b72ed2d41000005
diff --git a/dataset-variability.R b/dataset-variability.R
index b0e3c76..775fd03 100644
--- a/dataset-variability.R
+++ b/dataset-variability.R
@@ -18,7 +18,16 @@ s.dup$SMILES <- reorder(s.dup$SMILES,s.dup$LOAEL)
p1 <- ggplot(m.dup, aes(SMILES,LOAEL),ymin = min(LOAEL), ymax=max(LOAEL)) + ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') + theme(axis.text.x = element_blank()) + geom_point() + ggtitle("Mazzatorta") + ylim(-1,4)
p2 <- ggplot(s.dup, aes(SMILES,LOAEL),ymin = min(LOAEL), ymax=max(LOAEL)) + ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') + theme(axis.text.x = element_blank()) + geom_point() + ggtitle("Swiss Federal Office") + ylim(-1,4)
-pdf('figure/dataset-variability.pdf')
-grid.arrange(p1,p2,ncol=1)
-dev.off()
+#pdf('figure/dataset-variability.pdf')
+#grid.arrange(p1,p2,ncol=1)
+#dev.off()
+data <- read.csv("data/test.csv",header=T)
+data$LOAEL = -log(data$LOAEL)
+data$SMILES <- reorder(data$SMILES,data$LOAEL)
+img = ggplot(data,aes(SMILES,LOAEL,ymin = min(LOAEL), ymax=max(LOAEL),color=Dataset)) + geom_point()
+img <- img + ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') + theme(axis.text.x = element_blank()) + theme(legend.title=element_blank())
+img = img + scale_fill_discrete(breaks=c("Mazzatorta", "Both", "Swiss Federal Office"))
+img = img
+
+ggsave(file='figure/dataset-variability.pdf', plot=img, width=12,height=8)
diff --git a/figure/dataset-variability.pdf b/figure/dataset-variability.pdf
index fdcddf8..aad1c21 100644
--- a/figure/dataset-variability.pdf
+++ b/figure/dataset-variability.pdf
Binary files differ
diff --git a/figure/test-correlation.pdf b/figure/test-correlation.pdf
index 6097332..d3f49a7 100644
--- a/figure/test-correlation.pdf
+++ b/figure/test-correlation.pdf
Binary files differ
diff --git a/figure/test-prediction.pdf b/figure/test-prediction.pdf
index 3403b8d..2083f96 100644
--- a/figure/test-prediction.pdf
+++ b/figure/test-prediction.pdf
Binary files differ
diff --git a/loael.Rmd b/loael.Rmd
index 8fb301f..bbedf99 100644
--- a/loael.Rmd
+++ b/loael.Rmd
@@ -16,6 +16,9 @@ tblPrefix: Table
output:
pdf_document:
fig_caption: yes
+header-includes:
+ - \usepackage{lineno}
+ - \linenumbers
...
```{r echo=F}
@@ -28,12 +31,23 @@ Introduction
Elena + Benoit
-The quality and reproducibility of (Q)SAR and read-across predictions is a controversial topic in the toxicological risk-assessment community. Although model predictions can be validated with various procedures it is rarely possible to put the results into the context of experimental variability, because replicate experiments are rarely available.
+The quality and reproducibility of (Q)SAR and read-across predictions is a
+controversial topic in the toxicological risk-assessment community. Although
+model predictions can be validated with various procedures it is rarely
+possible to put the results into the context of experimental variability,
+because replicate experiments are rarely available.
-With missing information about the variability of experimental toxicity data it is hard to judge the performance of predictive models and it is tempting for model developments to use aggressive model optimisation methods that lead to impressive validation results, but also to overfitted models with little practical relevance.
+With missing information about the variability of experimental toxicity data it
+is hard to judge the performance of predictive models and it is tempting for
+model developers to use aggressive model optimisation methods that lead to
+impressive validation results, but also to overfitted models with little
+practical relevance.
-In this study we intent to compare model predictions with experimental variability with chronic oral rat lowest adverse effect levels (LOAEL) as toxicity endpoint.
-We are using two datasets, one from [@mazzatorta08] (*Mazzatorta* dataset) and one from the Swiss Federal Office of TODO (*Swiss Federal Office* dataset).
+In this study we intend to compare model predictions with experimental
+variability with chronic oral rat lowest adverse effect levels (LOAEL) as
+toxicity endpoint. We are using two datasets, one from [@mazzatorta08]
+(*Mazzatorta* dataset) and one from the Swiss Federal Office of TODO (*Swiss
+Federal Office* dataset).
Elena: do you have a reference and the name of the department?
@@ -44,21 +58,26 @@ t = read.csv("data/test.csv",header=T)
c = read.csv("data/training.csv",header=T)
```
-`r length(unique(t$SMILES))` compounds are common in both datasets and we use them as a test set in our investigation. For this test set we will
+`r length(unique(t$SMILES))` compounds are common in both datasets and we use
+them as a *test* set in our investigation. For the Mazzatorta and Swiss Federal Office datasets we will
- compare the structural diversity of both datasets
- compare the LOAEL values in both datasets
-- build prediction models based on the Mazzatorta, Swiss Federal Office datasets and a combination of both
+- build prediction models
- predict LOAELs of the training set
- compare predictions with experimental variability
-With this investigation we also want to support the idea of reproducible research, by providing all datasets and programs that have been used to generate this manuscript under a TODO license.
+With this investigation we also want to support the idea of reproducible
+research, by providing all datasets and programs that have been used to
+generate this manuscript under TODO creative/scientific commons? (datasets) and
+GPL (programs) licenses.
-A self-contained docker image with all program dependencies required for the reproduction of these results is available from TODO.
+A self-contained docker image with all program dependencies required for the
+reproduction of these results is available from TODO.
-Source code and datasets for the reproduction of this manuscript can be downloaded from the GitHub repository TODO. The lazar framework [@Maunz2013] is also available under a GPL License from https://github.com/opentox/lazar.
-
-TODO: github tags
+Source code and datasets for the reproduction of this manuscript can be
+downloaded from the GitHub repository TODO. The lazar framework [@Maunz2013] is
+also available under a GPL License from https://github.com/opentox/lazar.
Elena: please check if this is publication strategy is ok for the Swiss Federal Office
@@ -83,13 +102,14 @@ Elena + Swiss Federal Office contribution (input)
The original Swiss Federal Office dataset has chronic toxicity data for rats,
mice and multi generation effects. For the purpose of this study only rat LOAEL
-data was used. This leads to the *Swiss Federal Office* dataset with `r length(s$SMILES)` rat LOAEL
-values for `r length(unique(s$SMILES))` unique chemical structures.
+data with oral administration was used. This leads to the *Swiss Federal
+Office* dataset with `r length(s$SMILES)` rat LOAEL values for `r
+length(unique(s$SMILES))` unique chemical structures.
### Preprocessing
Chemical structures (represented as SMILES [@doi:10.1021/ci00057a005]) in both
-datasets were checked for correctness, syntactically incorrect and missing
+datasets were checked for correctness. Syntactically incorrect and missing
SMILES were generated from other identifiers (e.g names, CAS numbers). Unique
smiles from the OpenBabel library [@OBoyle2011] were used for the
identification of duplicated structures.
@@ -106,42 +126,37 @@ Two derived datasets were obtained from the original datasets:
The *test* dataset contains data of compounds that occur in both datasets.
LOAEL values equal at five significant digits were considered as duplicates
originating from the same study/publication and only one instance was kept in
-the test dataset. Exact duplications of LOAEL values were removed, because it
-is very likely that they originate from the same study. The test dataset has
+the test dataset. The test dataset has
`r length(t$SMILES)` LOAEL values for `r length(unique(t$SMILES))` unique
chemical structures.
The *training* dataset is the union of the Mazzatorta and the Swiss Federal
Office dataset and it is used to build predictive models. LOAEL duplicates were
-removed, as for the test dataset. The training dataset has `r
-length(c$SMILES)` LOAEL values for `r length(unique(c$SMILES))` unique chemical
-structures.
+removed using the same criteria as for the test dataset. The training dataset
+has `r length(c$SMILES)` LOAEL values for `r length(unique(c$SMILES))` unique
+chemical structures.
Algorithms
----------
-In this study we are using the modular lazar (*la*zy *s*tructure
-*a*ctivity *r*elationships) framework [@Maunz2013] for model
-development and validation.
+In this study we are using the modular lazar (*la*zy *s*tructure *a*ctivity
+*r*elationships) framework [@Maunz2013] for model development and validation.
-lazar follows the following basic workflow: For a given chemical
-structure lazar
+lazar follows the following basic workflow: For a given chemical structure
+lazar
-- searches in a database for similar structures (*neighbors*)
-with experimental data,
-- builds a local QSAR model with these neighbors
-and
-- uses this model to predict the unknown activity of the query
-compound.
+- searches in a database for similar structures (*neighbors*) with experimental
+ data,
+- builds a local QSAR model with these neighbors and
+- uses this model to predict the unknown activity of the query compound.
-This procedure resembles an automated version of *read across*
-predictions in toxicology, in machine learning terms it would be
-classified as a *k-nearest-neighbor* algorithm.
+This procedure resembles an automated version of *read across* predictions in
+toxicology, in machine learning terms it would be classified as a
+*k-nearest-neighbor* algorithm.
-Apart from this basic workflow lazar is completely modular and allows
-the researcher to use any algorithm for similarity searches and
-local QSAR modelling. Within this study we are using the following
-algorithms:
+Apart from this basic workflow lazar is completely modular and allows the
+researcher to use any algorithm for similarity searches and local QSAR
+modelling. Within this study we are using the following algorithms:
### Neighbor identification
@@ -174,78 +189,96 @@ total number of atom environments $A \cup B$ (Jaccard/Tanimoto index, [@eq:jacca
$$ sim = \frac{|A \cap B|}{|A \cup B|} $$ {#eq:jaccard}
-A threshold of $sim < 0.1$ is used for the identification of neighbors for
+A threshold of $sim > 0.1$ is used for the identification of neighbors for
local QSAR models. Compounds with the same structure as the query structure
-are eliminated from the neighbors to obtain an unbiased prediction.
+are eliminated from the neighbors to obtain unbiased predictions in the presence of duplicates.
### Local QSAR models and predictions
-Only similar compounds (*neighbors*) are used for local QSAR models. In this
-investigation we are using a weighted partial least squares regression (PLS)
-algorithm for the prediction of quantitative properties. First all fingerprint
-features with identical values across all neighbors are removed. The reamining
-set of features is used as descriptors for creating a local weighted PLS model
-with atom environments as descriptors and model similarities as weights. The
-`pls` method from the `caret` R package [@Kuhn08] is used for this purpose.
+Only similar compounds (*neighbors*) above the threshold are used for local
+QSAR models. In this investigation we are using a weighted partial least
+squares regression (PLS) algorithm for the prediction of quantitative
+properties. First all fingerprint features with identical values across all
+neighbors are removed. The remaining set of features is used as descriptors
+for creating a local weighted PLS model with atom environments as descriptors
+and model similarities as weights. The `pls` method from the `caret` R package
+[@Kuhn08] is used for this purpose. Models are trained with the default
+`caret` settings, optimizing the number of PLS components by bootstrap
+resampling.
Finally the local PLS model is applied to predict the activity of the query
-compound.
+compound. The RMSE of bootstrapped model predictions is used to construct 95\%
+prediction intervals at 1.96*RMSE.
If PLS modelling or prediction fails, the program resorts to using the weighted
mean of the neighbors LOAEL values, where the contribution of each neighbor is
weighted by its similarity to the query compound.
-default settings for tuning
-
### Applicability domain
-Christoph: TODO
+The applicability domain of lazar models is determined by the structural
+diversity of the training data. If no similar compounds are found in the
+training data no predictions will be generated. If the query compound contains
+substructures that are not covered by training examples a warning is issued.
-Prediction intervals were obtained from the `predict` function.
+Local regression models consider neighbor similarities to the query compound,
+by weighting the contribution of each neighbor by its similarity
+index. The variability of local model predictions is reflected in the
+prediction interval.
### Validation
For the comparison of experimental variability with predictive accuracies we
-are using a test set of compounds that occur in both datasets. The
-*Mazzatorta*, *Swiss Federal Office* and *training* datasets are used as
-training data for read across predictions. In order to obtain unbiased
-predictions *all* information from the test compound is removed from the
-training set prior to predictions. This procedure is hardcoded into the
-prediction algorithm in order to prevent validation errors. Traditional
-10-fold crossvalidation results are provided as additional information for all
-three models.
+are using a test set of compounds that occur in both datasets. Unbiased read
+across predictions are obtained from the *training* dataset, by removing *all*
+information from the test compound from the training set prior to predictions.
+This procedure is hardcoded into the prediction algorithm in order to prevent
+validation errors. As we have only a single test set no model or parameter
+optimisations were performed in order to avoid overfitting a single dataset.
-TODO: treatment of duplicates
-
-Christoph: check if these specifications have changed at submission
+Results from 3 repeated 10-fold crossvalidations with independent training/test
+set splits are provided as additional information to the test set results.
Results
=======
### Dataset comparison
-Elena
-
The main objective of this section is to compare the content of both
databases in terms of structural composition and LOAEL values, to
estimate the experimental variability of LOAEL values and to establish a
baseline for evaluating prediction performance.
-
##### Ches-Mapper analysis
-We applied the visualization tool CheS-Mapper (Chemical Space Mapping and Visualization in 3D,
-http://ches-mapper.org, @Gütlein2012) to compare both datasets. CheS-Mapper can be used to analyze the relationship between the structure of chemical compounds, their physico-chemical properties, and biological or toxic effects. It embeds a dataset into 3D space, such that compounds with similar feature values are close to each other. CheS-Mapper is generic and can be employed with different kinds of features. [@fig:ches-mapper-pc] shows an embedding that is based on physico-chemical (PC) descriptors.
+We applied the visualization tool CheS-Mapper (Chemical Space Mapping and
+Visualization in 3D, http://ches-mapper.org, @Gütlein2012) to compare both
+datasets. CheS-Mapper can be used to analyze the relationship between the
+structure of chemical compounds, their physico-chemical properties, and
+biological or toxic effects. It embeds a dataset into 3D space, such that
+compounds with similar feature values are close to each other. CheS-Mapper is
+generic and can be employed with different kinds of features.
+[@fig:ches-mapper-pc] shows an embedding that is based on physico-chemical (PC)
+descriptors.
-![Compounds from the Mazzatorta and the Swiss Federal Office dataset are highlighted in red and green. Compounds that occur in both datasets are highlighted in magenta. ](figure/pc-small-compounds-highlighted.png){#fig:ches-mapper-pc}
+![Compounds from the Mazzatorta and the Swiss Federal Office dataset are highlighted in red and green. Compounds that occur in both datasets are highlighted in magenta.](figure/pc-small-compounds-highlighted.png){#fig:ches-mapper-pc}
-Martin: explain light colors at bottom of histograms
+Martin: please explain light colors at bottom of histograms
-In this example, CheS-Mapper applied a principal components analysis to map compounds according to their physico-chemical (PC) feature values into 3D space. Both datasets have in general very similar PC feature values. As an exception, the Mazzatorta dataset includes most of the tiny compound structures: we have selected the 78 smallest compounds (with 10 atoms and less, marked with a blue box in the screen-shot) and found that 61 of these compounds occur in the Mazzatorta dataset, whereas only 19 are contained in the Swiss dataset (p-value 3.7E-7).
+In this example, CheS-Mapper applied a principal components analysis to map
+compounds according to their physico-chemical (PC) feature values into 3D
+space. Both datasets have in general very similar PC feature values. As an
+exception, the Mazzatorta dataset includes most of the tiny compound
+structures: we have selected the 78 smallest compounds (with 10 atoms and less,
+marked with a blue box in the screen-shot) and found that 61 of these compounds
+occur in the Mazzatorta dataset, whereas only 19 are contained in the Swiss
+dataset (p-value 3.7E-7).
-This result was confirmed for structural features (fingerprints) including MolPrint2D features that are utilized for model building in this work.
+This result was confirmed for structural features (fingerprints) including
+MolPrint2D features that are utilized for model building in this work.
-In general we concluded that both datasets are very similar, in terms of chemical structures and physico-chemical properties.
+In general we concluded that both datasets are very similar, in terms of
+chemical structures and physico-chemical properties.
##### Distribution of functional groups
@@ -254,11 +287,11 @@ fg = read.csv('data/functional-groups.csv',head=F)
```
In order to confirm the results of CheS-Mapper analysis we have evaluated the
-frequency of functional groups from the OpenBabel FP4
-fingerprint. [@fig:fg] shows the frequency of functional groups
-in
-both datasets. `r length(fg$V1)` functional groups with a frequency > 25 are depicted, the complete table for all functional groups can be found in the
-data directory of the supplemental material (`data/functional-groups.csv`).
+frequency of functional groups from the OpenBabel FP4 fingerprint. [@fig:fg]
+shows the frequency of functional groups in both datasets. `r length(fg$V1)`
+functional groups with a frequency > 25 are depicted, the complete table for
+all functional groups can be found in the data directory of the supplemental
+material (`data/functional-groups.csv`).
![Frequency of functional groups.](figure/functional-groups.pdf){#fig:fg}
@@ -272,27 +305,42 @@ experimental results within individual datasets and between datasets.
##### Intra dataset variability
```{r echo=F}
-m.dupsmi = unique(m$SMILES[duplicated(m$SMILES)])
-s.dupsmi = unique(s$SMILES[duplicated(s$SMILES)])
+m.dupsmi <- unique(m$SMILES[duplicated(m$SMILES)])
+s.dupsmi <- unique(s$SMILES[duplicated(s$SMILES)])
+c.dupsmi <- unique(c$SMILES[duplicated(c$SMILES)])
-m.dup = m[m$SMILES %in% m.dupsmi,]
-s.dup = s[s$SMILES %in% s.dupsmi,]
+m.dup <- m[m$SMILES %in% m.dupsmi,]
+s.dup <- s[s$SMILES %in% s.dupsmi,]
+c.dup <- c[c$SMILES %in% c.dupsmi,]
-m.dupnr = length(m.dupsmi)
-s.dupnr = length(s.dupsmi)
+m.dupnr <- length(m.dupsmi)
+s.dupnr <- length(s.dupsmi)
+c.dupnr <- length(c.dupsmi)
-m.dup$var = ave(-log10(m.dup$LOAEL),m.dup$SMILES,FUN=var)
-s.dup$var = ave(-log10(s.dup$LOAEL),s.dup$SMILES,FUN=var)
+m.dup$sd <- ave(-log10(m.dup$LOAEL),m.dup$SMILES,FUN=sd)
+s.dup$sd <- ave(-log10(s.dup$LOAEL),s.dup$SMILES,FUN=sd)
+c.dup$sd <- ave(-log10(c.dup$LOAEL),c.dup$SMILES,FUN=sd)
+t$sd <- ave(-log10(t$LOAEL),t$SMILES,FUN=sd)
-p = t.test(m.dup$var,s.dup$var)$p.value
+p = t.test(m.dup$sd,s.dup$sd)$p.value
```
-The Mazzatorta dataset has `r length(m$SMILES)` LOAEL values for `r length(levels(m$SMILES))` unique structures, `r m.dupnr` compounds have multiple measurements with an average variance of `r round(mean(m.dup$var,na.rm=T),2)` log10 units [@fig:intra].
+The Mazzatorta dataset has `r length(m$SMILES)` LOAEL values for
+`r length(levels(m$SMILES))` unique structures, `r m.dupnr`
+compounds have multiple measurements with a mean standard deviation of
+`r round(mean(m.dup$sd),2)` log10 units (@mazzatorta08, [@fig:intra]).
+
+The Swiss Federal Office dataset has `r length(s$SMILES)` rat LOAEL values for
+`r length(levels(s$SMILES))` unique structures, `r s.dupnr` compounds have
+multiple measurements with a mean standard deviation of
+`r round(mean(s.dup$sd),2)` log10 units.
-The Swiss Federal Office dataset has `r length(s$SMILES)` rat LOAEL values for `r length(levels(s$SMILES))` unique structures, `r s.dupnr` compounds have multiple measurements with a similar variance (average `r round(mean(s.dup$var),2)` log10 units). Variances of both datasets do not show a statistically significant difference with a
-p-value (t-test) of `r round(p,2)`.
+Standard deviations of both datasets do not show
+a statistically significant difference with a p-value (t-test) of `r round(p,2)`.
+The combined test set has a mean standard deviation of `r round(mean(c.dup$sd),2)`
+log10 units.
-![Distribution and variability of LOAEL values in both datasets: Each vertical line represents a compound, dots are individual LOAEL values.](figure/dataset-variability.pdf){#fig:intra}
+![Distribution and variability of LOAEL values in both datasets. Each vertical line represents a compound, dots are individual LOAEL values.](figure/dataset-variability.pdf){#fig:intra}
##### Inter dataset variability
@@ -308,34 +356,52 @@ median.r.square <- round(rsquare(-log(data$mazzatorta),-log(data$swiss)),2)
median.rmse <- round(rmse(-log(data$mazzatorta),-log(data$swiss)),2)
```
-[@fig:corr] depicts the correlation between LOAEL values from both datasets. As both datasets contain duplicates we are using medians for the correlation plot and statistics. Please note that the aggregation of duplicated measurements into a single value hides a substantial portion of the real experimental variability.
-Correlation analysis shows a
-significant (p-value < 2.2e-16) correlation between the experimental data in both datasets with r\^2: `r round(median.r.square,2)`, RMSE: `r round(median.rmse,2)`
+[@fig:corr] depicts the correlation between LOAEL values from both datasets. As
+both datasets contain duplicates we are using medians for the correlation plot
+and statistics. Please note that the aggregation of duplicated measurements
+into a single median value hides a substantial portion of the experimental
+variability. Correlation analysis shows a significant (p-value < 2.2e-16)
+correlation between the experimental data in both datasets with r\^2:
+`r round(median.r.square,2)`, RMSE: `r round(median.rmse,2)`
### Local QSAR models
-In order to compare the perfomance of in silico models with experimental variability we are using compounds that occur in both datasets as a test set (`r length(t$SMILES)` measurements, `r length(unique(t$SMILES))` compounds).
-
-The Mazzatorta, the Swiss Federal Office dataset and a combined dataset were used as training data for building `lazar` read across models. Predictions for the test set compounds were made after eliminating all information from the test compound from the corresponding training dataset. [@fig:comp] summarizes the results:
-
-![Comparison of experimental with predicted LOAEL values, each vertical line represents a compound, dots are individual measurements (red) or predictions (green).](figure/test-prediction.pdf){#fig:comp}
-
```{r echo=F}
training = read.csv("data/training-test-predictions.csv",header=T)
training.r_square = round(rsquare(-log(training$LOAEL_measured_median),-log(training$LOAEL_predicted)),2)
training.rmse = round(rmse(-log(training$LOAEL_measured_median),-log(training$LOAEL_predicted)),2)
+misclassifications = read.csv("data/misclassifications.csv",header=T)
+incorrect_predictions = length(misclassifications$SMILES)
+correct_predictions = length(training$SMILES)-incorrect_predictions
```
-TODO: nr unpredicted, nr predictions outside of experimental values
+In order to compare the performance of in silico read across models with experimental
+variability we are using compounds that occur in both datasets as a test set
+(`r length(t$SMILES)` measurements, `r length(unique(t$SMILES))` compounds).
+`lazar` read across predictions
+were obtained for `r length(unique(t$SMILES))` compounds, `r length(unique(t$SMILES)) - length(training$SMILES)`
+predictions failed, because no similar compounds were found in the training data (i.e. they were not covered by the applicability domain of the training data).
+
+
+Experimental data and 95\% prediction intervals did not overlap in `r incorrect_predictions` cases
+(`r round(100*incorrect_predictions/length(training$SMILES))`\%),
+`r length(which(sign(misclassifications$Distance) == 1))` predictions were too high and
+`r length(which(sign(misclassifications$Distance) == -1))` predictions too low (after -log10 transformation).
+
+[@fig:comp] shows a comparison of predicted with experimental values:
+
+![Comparison of experimental with predicted LOAEL values. Each vertical line represents a compound, dots are individual measurements (red) or predictions (green).](figure/test-prediction.pdf){#fig:comp}
-Correlation analysis has been perfomed between individual predictions and the median of exprimental data.
-All correlations are statistically highly significant with a p-value < 2.2e-16.
-These results are presented in [@fig:corr] and [@tbl:cv]. Please bear in mind that the aggregation of experimental data into a single value actually hides experimental variability.
+Correlation analysis was performed between individual predictions and the
+median of experimental data. All correlations are statistically highly
+significant with a p-value < 2.2e-16. These results are presented in
+[@fig:corr] and [@tbl:cv]. Please bear in mind that the aggregation of
+experimental data into a single median value hides experimental variability.
-Training data | $r^2$ | RMSE
+Comparison | $r^2$ | RMSE
--------------|---------------------------|-------------------------
-Experimental | `r median.r.square` | `r median.rmse`
-Combined | `r training.r_square` | `r training.rmse`
+Mazzatorta vs. Swiss | `r median.r.square` | `r median.rmse`
+Prediction vs. Test median | `r training.r_square` | `r training.rmse`
: Comparison of model predictions with experimental variability. {#tbl:common-pred}
@@ -352,38 +418,78 @@ t2 = read.csv("data/training-cv-2.csv",header=T)
cv.t2.r_square = round(rsquare(-log(t2$LOAEL_measured_median),-log(t2$LOAEL_predicted)),2)
cv.t2.rmse = round(rmse(-log(t2$LOAEL_measured_median),-log(t2$LOAEL_predicted)),2)
```
-TODO: repeated CV
-Traditional 10-fold cross-validation results are summarised in [@tbl:cv] and [@fig:cv].
-All correlations are statistically highly significant with a p-value < 2.2e-16.
+For a further assessment of model performance three independent
+10-fold cross-validations were performed. Results are summarised in [@tbl:cv] and [@fig:cv].
+All correlations of predicted with experimental values are statistically highly significant with a p-value < 2.2e-16.
-Training dataset | $r^2$ | RMSE
------------------|-------|------
-Combined | `r round(cv.t0.r_square,2)` | `r round(cv.t0.rmse,2)`
-Combined | `r round(cv.t1.r_square,2)` | `r round(cv.t1.rmse,2)`
-Combined | `r round(cv.t2.r_square,2)` | `r round(cv.t2.rmse,2)`
-
-: 10-fold crossvalidation results {#tbl:cv}
+ $r^2$ | RMSE | Nr. predicted
+-------|------|----------------
+`r round(cv.t0.r_square,2)` | `r round(cv.t0.rmse,2)` | `r length(unique(t0$SMILES))`/`r length(unique(c$SMILES))`
+`r round(cv.t1.r_square,2)` | `r round(cv.t1.rmse,2)` | `r length(unique(t1$SMILES))`/`r length(unique(c$SMILES))`
+`r round(cv.t2.r_square,2)` | `r round(cv.t2.rmse,2)` | `r length(unique(t2$SMILES))`/`r length(unique(c$SMILES))`
+: Results from 3 independent 10-fold crossvalidations {#tbl:cv}
![Correlation of experimental with predicted LOAEL values (10-fold crossvalidation)](figure/crossvalidation.pdf){#fig:cv}
-
Discussion
==========
Elena + Benoit
-- both datasets are structurally similar
-- LOAEL values have similar variability in both datasets
-- the Mazzatorta dataset has a small portion of very toxic compounds (low LOAEL, high -log10(LOAEL))
-- lazar read across predictions fall within the experimental variability of LOAEL values
-- predictions are slightly less accurate at extreme (high/low) LOAEL values, this can be explained by the algorithms used
-- the original Mazzatorta paper has "better" results (R^2 0.54, RMSE 0.7) , but the model is likely to be overfitted (using genetic algorithms for feature selection *prior* to crossvalidation must lead to overfitted models)
-- beware of over-optimisations and the race for "better" validation results
+### Dataset comparison
+
+Our investigations clearly indicate that the Mazzatorta and Swiss Federal Office datasets are very similar in terms of chemical structures and properties and the distribution of experimental LOAEL values. The only minor difference that we have observed is that the Mazzatorta dataset has a larger number of highly toxic compounds [@fig:intra] and a larger amount of small molecules, than the Swiss Federal Office dataset. For this reason we have pooled both datasets into a single training dataset for read across predictions.
+
+[@fig:intra] and [@fig:corr] and [@tbl:common-pred] show however considerable variability in the experimental data.
+High experimental variability
+has an impact on model building and on model validation.
+First it influences model quality by introducing noise into the training data, secondly it influences accuracy estimates because predictions have to be compared against noisy data where "true" experimental values are unknown.
+This will become obvious in the next section, where we compare predictions with experimental data.
+
+### Local QSAR models
+
+[@fig:comp], [@fig:corr], [@tbl:common-pred]
+and the fact that experimental data is covered in
+`r round(100*correct_predictions/length(training$SMILES))`\% by the `lazar`
+prediction interval show that `lazar` read across predictions fit well into
+the experimental variability of LOAEL values.
+
+It is tempting to increase the "quality" of predictions by performing parameter or algorithm optimisations, but this may lead to overfitted models, because the training set is known beforehand.
+As prediction accuracies correspond well to experimental accuracies, and the visual inspection of predictions does not show obvious anomalies, we consider our model as a robust method for LOAEL estimations.
+Prediction accuracies that are lower than experimental variability would be a clear sign for a model that is overfitted for a particular test set.
+
+The graphical interface provides intuitive means of inspecting the rationales and data used for read across predictions. In order to show how such an inspection can help to identify problematic predictions
+we present a brief analysis of the two most severe mispredictions:
+
+```{r echo=F}
+smi = "COP(=O)(SC)N"
+misclass = training[which(training$SMILES==smi),]
+med = round(-log10(misclass[,2]),2)
+pred = round(-log10(misclass[,3]),2)
+pi = round(log10(misclass[,4]),2)
+```
+
+The compound with the largest deviation of prediction intervals is (amino-methylsulfanyl-phosphoryl)oxymethane (SMILES `r smi`) with an experimental median of `r med` and a prediction interval of `r pred` +/- `r pi`. In this case the prediction is based on two neighbors with very low similarity (0.1 and 0.13). Such cases can be eliminated by raising the similarity threshold for neighbors, but that could come at the cost of a larger number of unpredicted compounds. The graphical user interface shows for each prediction neighbors and similarities for a critical examination which should make the detection of similar cases rather straightforward.
+
+```{r echo=F}
+smi = "O=S1OCC2C(CO1)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl"
+misclass = training[which(training$SMILES==smi),]
+med = round(-log10(misclass[,2]),2)
+pred = round(-log10(misclass[,3]),2)
+pi = round(log10(misclass[,4]),2)
+```
+
+The compound with second largest deviation of prediction intervals is
+Endosulfan (SMILES `r smi`)
+with an experimental median of `r med` and a prediction interval of `r pred` +/- `r pi`. In this case the prediction is based on 5 neighbors with similarities between 0.33 and 0.4. All of them are polychlorinated compounds, but none of them contains sulfur or is a sulfurous acid ester. Again such problems are easily identified from a visual inspection of neighbors, and we want to stress the importance of inspecting rationales for predictions in the graphical interface before accepting a prediction.
Summary
=======
+- beware of over-optimisations and the race for "better" validation results
+- reproducible research
+
References
==========
diff --git a/loael.md b/loael.md
index 9f6224e..4e58344 100644
--- a/loael.md
+++ b/loael.md
@@ -16,6 +16,9 @@ tblPrefix: Table
output:
pdf_document:
fig_caption: yes
+header-includes:
+ - \usepackage{lineno}
+ - \linenumbers
...
@@ -25,32 +28,48 @@ Introduction
Elena + Benoit
-The quality and reproducibility of (Q)SAR and read-across predictions is a controversial topic in the toxicological risk-assessment community. Although model predictions can be validated with various procedures it is rarely possible to put the results into the context of experimental variability, because replicate experiments are rarely available.
+The quality and reproducibility of (Q)SAR and read-across predictions is a
+controversial topic in the toxicological risk-assessment community. Although
+model predictions can be validated with various procedures it is rarely
+possible to put the results into the context of experimental variability,
+because replicate experiments are rarely available.
-With missing information about the variability of experimental toxicity data it is hard to judge the performance of predictive models and it is tempting for model developments to use aggressive model optimisation methods that lead to impressive validation results, but also to overfitted models with little practical relevance.
+With missing information about the variability of experimental toxicity data it
+is hard to judge the performance of predictive models and it is tempting for
+model developments to use aggressive model optimisation methods that lead to
+impressive validation results, but also to overfitted models with little
+practical relevance.
-In this study we intent to compare model predictions with experimental variability with chronic oral rat lowest adverse effect levels (LOAEL) as toxicity endpoint.
-We are using two datasets, one from [@mazzatorta08] (*Mazzatorta* dataset) and one from the Swiss Federal Office of TODO (*Swiss Federal Office* dataset).
+In this study we intend to compare model predictions with experimental
+variability with chronic oral rat lowest adverse effect levels (LOAEL) as
+toxicity endpoint. We are using two datasets, one from [@mazzatorta08]
+(*Mazzatorta* dataset) and one from the Swiss Federal Office of TODO (*Swiss
+Federal Office* dataset).
Elena: do you have a reference and the name of the department?
-155 compounds are common in both datasets and we use them as a test set in our investigation. For this test set we will
+155 compounds are common in both datasets and we use
+them as a *test* set in our investigation. For the Mazzatorta and Swiss Federal Office datasets we will
- compare the structural diversity of both datasets
- compare the LOAEL values in both datasets
-- build prediction models based on the Mazzatorta, Swiss Federal Office datasets and a combination of both
+- build prediction models
- predict LOAELs of the training set
- compare predictions with experimental variability
-With this investigation we also want to support the idea of reproducible research, by providing all datasets and programs that have been used to generate this manuscript under a TODO license.
+With this investigation we also want to support the idea of reproducible
+research, by providing all datasets and programs that have been used to
+generate this manuscript under TODO creative/scientific commons? (datasets) and
+GPL (programs) licenses.
-A self-contained docker image with all program dependencies required for the reproduction of these results is available from TODO.
+A self-contained docker image with all program dependencies required for the
+reproduction of these results is available from TODO.
-Source code and datasets for the reproduction of this manuscript can be downloaded from the GitHub repository TODO. The lazar framework [@Maunz2013] is also available under a GPL License from https://github.com/opentox/lazar.
-
-TODO: github tags
+Source code and datasets for the reproduction of this manuscript can be
+downloaded from the GitHub repository TODO. The lazar framework [@Maunz2013] is
+also available under a GPL License from https://github.com/opentox/lazar.
Elena: please check if this publication strategy is ok for the Swiss Federal Office
@@ -75,13 +94,14 @@ Elena + Swiss Federal Office contribution (input)
The original Swiss Federal Office dataset has chronic toxicity data for rats,
mice and multi generation effects. For the purpose of this study only rat LOAEL
-data was used. This leads to the *Swiss Federal Office* dataset with 493 rat LOAEL
-values for 381 unique chemical structures.
+data with oral administration was used. This leads to the *Swiss Federal
+Office* dataset with 493 rat LOAEL values for `r
+length(unique(s$SMILES))` unique chemical structures.
### Preprocessing
Chemical structures (represented as SMILES [@doi:10.1021/ci00057a005]) in both
-datasets were checked for correctness, syntactically incorrect and missing
+datasets were checked for correctness. Syntactically incorrect and missing
SMILES were generated from other identifiers (e.g names, CAS numbers). Unique
smiles from the OpenBabel library [@OBoyle2011] were used for the
identification of duplicated structures.
@@ -98,42 +118,37 @@ Two derived datasets were obtained from the original datasets:
The *test* dataset contains data of compounds that occur in both datasets.
LOAEL values equal at five significant digits were considered as duplicates
originating from the same study/publication and only one instance was kept in
-the test dataset. Exact duplications of LOAEL values were removed, because it
-is very likely that they originate from the same study. The test dataset has
+the test dataset. The test dataset has
375 LOAEL values for 155 unique
chemical structures.
The *training* dataset is the union of the Mazzatorta and the Swiss Federal
Office dataset and it is used to build predictive models. LOAEL duplicates were
-removed, as for the test dataset. The training dataset has `r
-length(c$SMILES)` LOAEL values for 671 unique chemical
-structures.
+removed using the same criteria as for the test dataset. The training dataset
+has 998 LOAEL values for 671 unique
+chemical structures.
Algorithms
----------
-In this study we are using the modular lazar (*la*zy *s*tructure
-*a*ctivity *r*elationships) framework [@Maunz2013] for model
-development and validation.
+In this study we are using the modular lazar (*la*zy *s*tructure *a*ctivity
+*r*elationships) framework [@Maunz2013] for model development and validation.
-lazar follows the following basic workflow: For a given chemical
-structure lazar
+lazar follows the following basic workflow: For a given chemical structure
+lazar
-- searches in a database for similar structures (*neighbors*)
-with experimental data,
-- builds a local QSAR model with these neighbors
-and
-- uses this model to predict the unknown activity of the query
-compound.
+- searches in a database for similar structures (*neighbors*) with experimental
+ data,
+- builds a local QSAR model with these neighbors and
+- uses this model to predict the unknown activity of the query compound.
-This procedure resembles an automated version of *read across*
-predictions in toxicology, in machine learning terms it would be
-classified as a *k-nearest-neighbor* algorithm.
+This procedure resembles an automated version of *read across* predictions in
+toxicology, in machine learning terms it would be classified as a
+*k-nearest-neighbor* algorithm.
-Apart from this basic workflow lazar is completely modular and allows
-the researcher to use any algorithm for similarity searches and
-local QSAR modelling. Within this study we are using the following
-algorithms:
+Apart from this basic workflow lazar is completely modular and allows the
+researcher to use any algorithm for similarity searches and local QSAR
+modelling. Within this study we are using the following algorithms:
### Neighbor identification
@@ -166,89 +181,107 @@ total number of atom environments $A \cup B$ (Jaccard/Tanimoto index, [@eq:jacca
$$ sim = \frac{|A \cap B|}{|A \cup B|} $$ {#eq:jaccard}
-A threshold of $sim < 0.1$ is used for the identification of neighbors for
+A threshold of $sim > 0.1$ is used for the identification of neighbors for
local QSAR models. Compounds with the same structure as the query structure
-are eliminated from the neighbors to obtain an unbiased prediction.
+are eliminated from the neighbors to obtain unbiased predictions in the presence of duplicates.
### Local QSAR models and predictions
-Only similar compounds (*neighbors*) are used for local QSAR models. In this
-investigation we are using a weighted partial least squares regression (PLS)
-algorithm for the prediction of quantitative properties. First all fingerprint
-features with identical values across all neighbors are removed. The reamining
-set of features is used as descriptors for creating a local weighted PLS model
-with atom environments as descriptors and model similarities as weights. The
-`pls` method from the `caret` R package [@Kuhn08] is used for this purpose.
+Only similar compounds (*neighbors*) above the threshold are used for local
+QSAR models. In this investigation we are using a weighted partial least
+squares regression (PLS) algorithm for the prediction of quantitative
+properties. First all fingerprint features with identical values across all
+neighbors are removed. The remaining set of features is used as descriptors
+for creating a local weighted PLS model with atom environments as descriptors
+and model similarities as weights. The `pls` method from the `caret` R package
+[@Kuhn08] is used for this purpose. Models are trained with the default
+`caret` settings, optimizing the number of PLS components by bootstrap
+resampling.
Finally the local PLS model is applied to predict the activity of the query
-compound.
+compound. The RMSE of bootstrapped model predictions is used to construct 95\%
+prediction intervals at 1.96*RMSE.
If PLS modelling or prediction fails, the program resorts to using the weighted
mean of the neighbors LOAEL values, where the contribution of each neighbor is
weighted by its similarity to the query compound.
-default settings for tuning
-
### Applicability domain
-Christoph: TODO
+The applicability domain of lazar models is determined by the structural
+diversity of the training data. If no similar compounds are found in the
+training data no predictions will be generated. If the query compound contains
+substructures that are not covered by training examples a warning is issued.
-Prediction intervals were obtained from the `predict` function.
+Local regression models consider neighbor similarities to the query compound,
+by weighting the contribution of each neighbor by its similarity
+index. The variability of local model predictions is reflected in the
+prediction interval.
### Validation
For the comparison of experimental variability with predictive accuracies we
-are using a test set of compounds that occur in both datasets. The
-*Mazzatorta*, *Swiss Federal Office* and *training* datasets are used as
-training data for read across predictions. In order to obtain unbiased
-predictions *all* information from the test compound is removed from the
-training set prior to predictions. This procedure is hardcoded into the
-prediction algorithm in order to prevent validation errors. Traditional
-10-fold crossvalidation results are provided as additional information for all
-three models.
-
-TODO: treatment of duplicates
+are using a test set of compounds that occur in both datasets. Unbiased read
+across predictions are obtained from the *training* dataset, by removing *all*
+information from the test compound from the training set prior to predictions.
+This procedure is hardcoded into the prediction algorithm in order to prevent
+validation errors. As we have only a single test set no model or parameter
+optimisations were performed in order to avoid overfitting a single dataset.
-Christoph: check if these specifications have changed at submission
+Results from 3 repeated 10-fold crossvalidations with independent training/test
+set splits are provided as additional information to the test set results.
Results
=======
### Dataset comparison
-Elena
-
The main objective of this section is to compare the content of both
databases in terms of structural composition and LOAEL values, to
estimate the experimental variability of LOAEL values and to establish a
baseline for evaluating prediction performance.
-
##### Ches-Mapper analysis
-We applied the visualization tool CheS-Mapper (Chemical Space Mapping and Visualization in 3D,
-http://ches-mapper.org, @Gütlein2012) to compare both datasets. CheS-Mapper can be used to analyze the relationship between the structure of chemical compounds, their physico-chemical properties, and biological or toxic effects. It embeds a dataset into 3D space, such that compounds with similar feature values are close to each other. CheS-Mapper is generic and can be employed with different kinds of features. [@fig:ches-mapper-pc] shows an embedding that is based on physico-chemical (PC) descriptors.
+We applied the visualization tool CheS-Mapper (Chemical Space Mapping and
+Visualization in 3D, http://ches-mapper.org, @Gütlein2012) to compare both
+datasets. CheS-Mapper can be used to analyze the relationship between the
+structure of chemical compounds, their physico-chemical properties, and
+biological or toxic effects. It embeds a dataset into 3D space, such that
+compounds with similar feature values are close to each other. CheS-Mapper is
+generic and can be employed with different kinds of features.
+[@fig:ches-mapper-pc] shows an embedding that is based on physico-chemical (PC)
+descriptors.
-![Compounds from the Mazzatorta and the Swiss Federal Office dataset are highlighted in red and green. Compounds that occur in both datasets are highlighted in magenta. ](figure/pc-small-compounds-highlighted.png){#fig:ches-mapper-pc}
+![Compounds from the Mazzatorta and the Swiss Federal Office dataset are highlighted in red and green. Compounds that occur in both datasets are highlighted in magenta.](figure/pc-small-compounds-highlighted.png){#fig:ches-mapper-pc}
-Martin: explain light colors at bottom of histograms
+Martin: please explain light colors at bottom of histograms
-In this example, CheS-Mapper applied a principal components analysis to map compounds according to their physico-chemical (PC) feature values into 3D space. Both datasets have in general very similar PC feature values. As an exception, the Mazzatorta dataset includes most of the tiny compound structures: we have selected the 78 smallest compounds (with 10 atoms and less, marked with a blue box in the screen-shot) and found that 61 of these compounds occur in the Mazzatorta dataset, whereas only 19 are contained in the Swiss dataset (p-value 3.7E-7).
+In this example, CheS-Mapper applied a principal components analysis to map
+compounds according to their physico-chemical (PC) feature values into 3D
+space. Both datasets have in general very similar PC feature values. As an
+exception, the Mazzatorta dataset includes most of the tiny compound
+structures: we have selected the 78 smallest compounds (with 10 atoms and less,
+marked with a blue box in the screen-shot) and found that 61 of these compounds
+occur in the Mazzatorta dataset, whereas only 19 are contained in the Swiss
+dataset (p-value 3.7E-7).
-This result was confirmed for structural features (fingerprints) including MolPrint2D features that are utilized for model building in this work.
+This result was confirmed for structural features (fingerprints) including
+MolPrint2D features that are utilized for model building in this work.
-In general we concluded that both datasets are very similar, in terms of chemical structures and physico-chemical properties.
+In general we concluded that both datasets are very similar, in terms of
+chemical structures and physico-chemical properties.
##### Distribution of functional groups
In order to confirm the results of CheS-Mapper analysis we have evaluated the
-frequency of functional groups from the OpenBabel FP4
-fingerprint. [@fig:fg] shows the frequency of functional groups
-in
-both datasets. 139 functional groups with a frequency > 25 are depicted, the complete table for all functional groups can be found in the
-data directory of the supplemental material (`data/functional-groups.csv`).
+frequency of functional groups from the OpenBabel FP4 fingerprint. [@fig:fg]
+shows the frequency of functional groups in both datasets. 139
+functional groups with a frequency > 25 are depicted, the complete table for
+all functional groups can be found in the data directory of the supplemental
+material (`data/functional-groups.csv`).
![Frequency of functional groups.](figure/functional-groups.pdf){#fig:fg}
@@ -263,12 +296,22 @@ experimental results within individual datasets and between datasets.
-The Mazzatorta dataset has 567 LOAEL values for 445 unique structures, 93 compounds have multiple measurements with an average variance of 0.19 log10 units [@fig:intra].
+The Mazzatorta dataset has 567 LOAEL values for
+445 unique structures, 93
+compounds have multiple measurements with a mean standard deviation of
+0.32 log10 units (@mazzatorta08, [@fig:intra]).
+
+The Swiss Federal Office dataset has 493 rat LOAEL values for
+381 unique structures, 91 compounds have
+multiple measurements with a mean standard deviation of
+0.29 log10 units.
-The Swiss Federal Office dataset has 493 rat LOAEL values for 381 unique structures, 91 compounds have multiple measurements with a similar variance (average 0.15 log10 units). Variances of both datasets do not show a statistically significant difference with a
-p-value (t-test) of 0.25.
+Standard deviations of both datasets do not show
+a statistically significant difference with a p-value (t-test) of 0.21.
+The combined test set has a mean standard deviation of 0.33
+log10 units.
-![Distribution and variability of LOAEL values in both datasets: Each vertical line represents a compound, dots are individual LOAEL values.](figure/dataset-variability.pdf){#fig:intra}
+![Distribution and variability of LOAEL values in both datasets. Each vertical line represents a compound, dots are individual LOAEL values.](figure/dataset-variability.pdf){#fig:intra}
##### Inter dataset variability
@@ -278,68 +321,111 @@ p-value (t-test) of 0.25.
-[@fig:corr] depicts the correlation between LOAEL values from both datasets. As both datasets contain duplicates we are using medians for the correlation plot and statistics. Please note that the aggregation of duplicated measurements into a single value hides a substantial portion of the real experimental variability.
-Correlation analysis shows a
-significant (p-value < 2.2e-16) correlation between the experimental data in both datasets with r\^2: 0.49, RMSE: 1.41
+[@fig:corr] depicts the correlation between LOAEL values from both datasets. As
+both datasets contain duplicates we are using medians for the correlation plot
+and statistics. Please note that the aggregation of duplicated measurements
+into a single median value hides a substantial portion of the experimental
+variability. Correlation analysis shows a significant (p-value < 2.2e-16)
+correlation between the experimental data in both datasets with r\^2:
+0.49, RMSE: 1.41
### Local QSAR models
-In order to compare the perfomance of in silico models with experimental variability we are using compounds that occur in both datasets as a test set (375 measurements, 155 compounds).
-The Mazzatorta, the Swiss Federal Office dataset and a combined dataset were used as training data for building `lazar` read across models. Predictions for the test set compounds were made after eliminating all information from the test compound from the corresponding training dataset. [@fig:comp] summarizes the results:
-![Comparison of experimental with predicted LOAEL values, each vertical line represents a compound, dots are individual measurements (red) or predictions (green).](figure/test-prediction.pdf){#fig:comp}
+In order to compare the performance of in silico read across models with experimental
+variability we are using compounds that occur in both datasets as a test set
+(375 measurements, 155 compounds).
+`lazar` read across predictions
+were obtained for 155 compounds, 8
+predictions failed, because no similar compounds were found in the training data (i.e. they were not covered by the applicability domain of the training data).
+Experimental data and 95\% prediction intervals did not overlap in 13 cases
+(9\%),
+8 predictions were too high and
+5 predictions too low (after -log10 transformation).
-TODO: nr unpredicted, nr predictions outside of experimental values
+[@fig:comp] shows a comparison of predicted with experimental values:
-Correlation analysis has been perfomed between individual predictions and the median of exprimental data.
-All correlations are statistically highly significant with a p-value < 2.2e-16.
-These results are presented in [@fig:corr] and [@tbl:cv]. Please bear in mind that the aggregation of experimental data into a single value actually hides experimental variability.
+![Comparison of experimental with predicted LOAEL values. Each vertical line represents a compound, dots are individual measurements (red) or predictions (green).](figure/test-prediction.pdf){#fig:comp}
-Training data | $r^2$ | RMSE
+Correlation analysis was performed between individual predictions and the
+median of experimental data. All correlations are statistically highly
+significant with a p-value < 2.2e-16. These results are presented in
+[@fig:corr] and [@tbl:cv]. Please bear in mind that the aggregation of
+experimental data into a single median value hides experimental variability.
+
+Comparison | $r^2$ | RMSE
--------------|---------------------------|-------------------------
-Experimental | 0.49 | 1.41
-Combined | 0.4 | 1.47
+Mazzatorta vs. Swiss | 0.49 | 1.41
+Prediction vs. Test median | 0.4 | 1.47
: Comparison of model predictions with experimental variability. {#tbl:common-pred}
![Correlation of experimental with predicted LOAEL values (test set)](figure/test-correlation.pdf){#fig:corr}
-TODO: repeated CV
-
-Traditional 10-fold cross-validation results are summarised in [@tbl:cv] and [@fig:cv].
-All correlations are statistically highly significant with a p-value < 2.2e-16.
-Training dataset | $r^2$ | RMSE
------------------|-------|------
-Combined | 0.4 | 1.8
-Combined | 0.38 | 1.84
-Combined | 0.4 | 1.81
+For a further assessment of model performance three independent
+10-fold cross-validations were performed. Results are summarised in [@tbl:cv] and [@fig:cv].
+All correlations of predicted with experimental values are statistically highly significant with a p-value < 2.2e-16.
-: 10-fold crossvalidation results {#tbl:cv}
+ $r^2$ | RMSE | Nr. predicted
+-------|------|----------------
+0.4 | 1.8 | 630/671
+0.38 | 1.84 | 631/671
+0.4 | 1.81 | 635/671
+: Results from 3 independent 10-fold crossvalidations {#tbl:cv}
![Correlation of experimental with predicted LOAEL values (10-fold crossvalidation)](figure/crossvalidation.pdf){#fig:cv}
-
Discussion
==========
Elena + Benoit
-- both datasets are structurally similar
-- LOAEL values have similar variability in both datasets
-- the Mazzatorta dataset has a small portion of very toxic compounds (low LOAEL, high -log10(LOAEL))
-- lazar read across predictions fall within the experimental variability of LOAEL values
-- predictions are slightly less accurate at extreme (high/low) LOAEL values, this can be explained by the algorithms used
-- the original Mazzatorta paper has "better" results (R^2 0.54, RMSE 0.7) , but the model is likely to be overfitted (using genetic algorithms for feature selection *prior* to crossvalidation must lead to overfitted models)
-- beware of over-optimisations and the race for "better" validation results
+### Dataset comparison
+
+Our investigations clearly indicate that the Mazzatorta and Swiss Federal Office datasets are very similar in terms of chemical structures and properties and the distribution of experimental LOAEL values. The only minor difference that we have observed is that the Mazzatorta dataset has a larger number of highly toxic compounds [@fig:intra] and a larger amount of small molecules, than the Swiss Federal Office dataset. For this reason we have pooled both datasets into a single training dataset for read across predictions.
+
+[@fig:intra] and [@fig:corr] and [@tbl:common-pred] show however considerable variability in the experimental data.
+High experimental variability
+has an impact on model building and on model validation.
+First it influences model quality by introducing noise into the training data, secondly it influences accuracy estimates because predictions have to be compared against noisy data where "true" experimental values are unknown.
+This will become obvious in the next section, where we compare predictions with experimental data.
+
+### Local QSAR models
+
+[@fig:comp], [@fig:corr], [@tbl:common-pred]
+and the fact that experimental data is covered in
+91\% by the `lazar`
+prediction interval show that `lazar` read across predictions fit well into
+the experimental variability of LOAEL values.
+
+It is tempting to increase the "quality" of predictions by performing parameter or algorithm optimisations, but this may lead to overfitted models, because the training set is known beforehand.
+As prediction accuracies correspond well to experimental accuracies, and the visual inspection of predictions does not show obvious anomalies, we consider our model as a robust method for LOAEL estimations.
+Prediction accuracies that are lower than experimental variability would be a clear sign for a model that is overfitted for a particular test set.
+
+The graphical interface provides intuitive means of inspecting the rationales and data used for read across predictions. In order to show how such an inspection can help to identify problematic predictions
+we present a brief analysis of the two most severe mispredictions:
+
+
+
+The compound with the largest deviation of prediction intervals is (amino-methylsulfanyl-phosphoryl)oxymethane (SMILES COP(=O)(SC)N) with an experimental median of 2.69 and a prediction interval of 0.48 +/- 0.59. In this case the prediction is based on two neighbors with very low similarity (0.1 and 0.13). Such cases can be eliminated by raising the similarity threshold for neighbors, but that could come at the cost of a larger number of unpredicted compounds. The graphical user interface shows for each prediction neighbors and similarities for a critical examination which should make the detection of similar cases rather straightforward.
+
+
+
+The compound with second largest deviation of prediction intervals is
+Endosulfan (SMILES O=S1OCC2C(CO1)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl)
+with an experimental median of 1.91 and a prediction interval of 3.64 +/- 0.45. In this case the prediction is based on 5 neighbors with similarities between 0.33 and 0.4. All of them are polychlorinated compounds, but none of them contains sulfur or is a sulfurous acid ester. Again such problems are easily identified from a visual inspection of neighbors, and we want to stress the importance of inspecting rationales for predictions in the graphical interface before accepting a prediction.
Summary
=======
+- beware of over-optimisations and the race for "better" validation results
+- reproducible research
+
References
==========
diff --git a/loael.pdf b/loael.pdf
index db69185..660f9fc 100644
--- a/loael.pdf
+++ b/loael.pdf
Binary files differ
diff --git a/test-prediction-plot.R b/test-prediction-plot.R
index 4cee05e..db003d3 100644
--- a/test-prediction-plot.R
+++ b/test-prediction-plot.R
@@ -15,7 +15,7 @@ data$LOAEL = -log(data$LOAEL)
data$SMILES <- reorder(data$SMILES,data$LOAEL)
#img <- ggplot(data, aes(SMILES,LOAEL,ymin = min(LOAEL), ymax=max(LOAEL),shape=Source,color=Type))
img <- ggplot(data, aes(SMILES,LOAEL,ymin = min(LOAEL), ymax=max(LOAEL),color=Type))
-img <- img + ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') + theme(axis.text.x = element_blank())
+img <- img + ylab('-log(LOAEL mg/kg_bw/day)') + xlab('Compound') + theme(axis.text.x = element_blank()) + theme(legend.title=element_blank())
img <- img + geom_point()
ggsave(file='figure/test-prediction.pdf', plot=img,width=12, height=8)
diff --git a/test-validation.rb b/test-validation.rb
index 71dc4b3..0bbcc42 100644
--- a/test-validation.rb
+++ b/test-validation.rb
@@ -21,6 +21,6 @@ end
data.sort!{|a,b| a[1] <=> b[1]}
CSV.open(csv_file,"w+") do |csv|
- csv << ["SMILES","LOAEL_measured_median","LOAEL_predicted","Confidence","Dataset"]
+ csv << ["SMILES","LOAEL_measured_median","LOAEL_predicted","RMSE","Dataset"]
data.each{|r| csv << r}
end