summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2017-04-14 12:25:45 +0200
committerChristoph Helma <helma@in-silico.ch>2017-04-14 12:25:45 +0200
commit8feada761cf87575ce037b5b8339691a7e9ae238 (patch)
treebafc5f713fa3c2f951465ab65ac5fd82bbdc83ce
parent6f527daf4875ce2ed864e8a6f4f30e44b4370561 (diff)
inter dataset sd in mg units
-rw-r--r--Makefile11
-rw-r--r--data/all_mg_dup.csv215
-rw-r--r--data/mazzatorta_mg_dup.csv215
-rw-r--r--data/swiss_mg_dup.csv194
-rw-r--r--loael.Rmd72
-rw-r--r--loael.md63
-rw-r--r--loael.pdfbin348755 -> 348856 bytes
-rwxr-xr-xscripts/all_mg_dup.rb35
-rwxr-xr-xscripts/mazzatorta-unique-smiles.rb1
-rwxr-xr-xscripts/mazzatorta_mg_dup.rb24
-rwxr-xr-xscripts/noael_loael2swiss_mg_dup.rb26
11 files changed, 810 insertions, 46 deletions
diff --git a/Makefile b/Makefile
index 1781669..111fd7b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
# Variables
-datasets = data/median-correlation.csv data/test_log10.csv data/training_log10.csv data/mazzatorta_log10.csv data/swiss_log10.csv
+datasets = data/median-correlation.csv data/test_log10.csv data/training_log10.csv data/mazzatorta_log10.csv data/swiss_log10.csv data/swiss_mg_dup.csv data/mazzatorta_mg_dup.csv data/all_mg_dup.csv
crossvalidations = data/training_log10-cv-0.csv data/training_log10-cv-1.csv data/training_log10-cv-2.csv
validations = data/training-test-predictions.csv $(crossvalidations) data/misclassifications.csv
figures = figures/functional-groups.pdf figures/test-prediction.pdf figures/prediction-test-correlation.pdf figures/dataset-variability.pdf figures/median-correlation.pdf figures/crossvalidation0.pdf figures/crossvalidation1.pdf figures/crossvalidation2.pdf
@@ -105,9 +105,18 @@ data/swiss_log10.csv: data/swiss.csv
data/mazzatorta.csv: data/LOAEL_mg_corrected_smiles_mmol.csv
scripts/mazzatorta-unique-smiles.rb data/LOAEL_mg_corrected_smiles_mmol.csv
+data/mazzatorta_mg_dup.csv: data/LOAEL_mg_corrected_smiles_mmol.csv
+ scripts/mazzatorta_mg_dup.rb data/LOAEL_mg_corrected_smiles_mmol.csv
+
data/swiss.csv: data/NOAEL-LOAEL_SMILES_rat_chron.csv
scripts/noael_loael2mmol.rb data/NOAEL-LOAEL_SMILES_rat_chron.csv
+data/swiss_mg_dup.csv: data/NOAEL-LOAEL_SMILES_rat_chron.csv
+ scripts/noael_loael2swiss_mg_dup.rb data/NOAEL-LOAEL_SMILES_rat_chron.csv
+
+data/all_mg_dup.csv: data/NOAEL-LOAEL_SMILES_rat_chron.csv data/LOAEL_mg_corrected_smiles_mmol.csv
+ scripts/all_mg_dup.rb
+
clean:
rm figures/*pdf
cd data && rm `ls -I "*LOAEL*" -I "*functional*" -I "*SMARTS*"`
diff --git a/data/all_mg_dup.csv b/data/all_mg_dup.csv
new file mode 100644
index 0000000..3e938ed
--- /dev/null
+++ b/data/all_mg_dup.csv
@@ -0,0 +1,215 @@
+SMILES,LOAEL
+OCC(C1OC(=O)C(=C1O)O)O,-3.4844
+OCC(C1OC(=O)C(=C1O)O)O,-3.1915
+CC(c1ccccc1)C,-2.6646
+CC(c1ccccc1)C,-2.5198
+CCc1ccccc1,-2.6107
+CCc1ccccc1,-2.4639
+OCCO,-2.3979
+OCCO,-2.6021
+OCCO,-3.301
+OCCO,-2.9638
+OCCO,-3.0
+C=Cc1ccccc1,-1.3222
+C=Cc1ccccc1,-2.4548
+C=Cc1ccccc1,-2.6021
+ClC(C(c1ccc(cc1)Cl)c1ccc(cc1)Cl)(Cl)Cl,-1.5051
+ClC(C(c1ccc(cc1)Cl)c1ccc(cc1)Cl)(Cl)Cl,0.60206
+COCN(c1c(CC)cccc1CC)C(=O)CCl,-1.1761
+COCN(c1c(CC)cccc1CC)C(=O)CCl,-1.1461
+COP(=S)(SCn1nnc2c(c1=O)cccc2)OC,0.4437
+COP(=S)(SCn1nnc2c(c1=O)cccc2)OC,-0.35218
+COP(=S)(SCn1nnc2c(c1=O)cccc2)OC,-0.41162
+O=C(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,-1.3979
+O=C(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,-2.0569
+ClC(C(SN1C(=O)C2C(C1=O)CC=CC2)(Cl)Cl)Cl,-1.0792
+ClC(C(SN1C(=O)C2C(C1=O)CC=CC2)(Cl)Cl)Cl,-1.1761
+CNC(=O)Oc1cccc2c1cccc2,-1.1931
+CNC(=O)Oc1cccc2c1cccc2,-1.7782
+ClC1CC2C(C1Cl)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,0.56864
+ClC1CC2C(C1Cl)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,0.30103
+ClC1CC2C(C1Cl)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,-0.77815
+ClC1CC2C(C1Cl)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,0.60206
+Nc1ccc(cc1)Cl,-0.77815
+Nc1ccc(cc1)Cl,-1.0969
+CCOP(=S)(Oc1ccc2c(c1)oc(=O)c(c2C)Cl)OCC,0.09691
+CCOP(=S)(Oc1ccc2c(c1)oc(=O)c(c2C)Cl)OCC,-0.23045
+COC(=O)c1c(Cl)c(Cl)c(c(c1Cl)Cl)C(=O)OC,-2.699
+COC(=O)c1c(Cl)c(Cl)c(c(c1Cl)Cl)C(=O)OC,-1.0
+OC(=O)C(Cl)(Cl)C,-1.4498
+OC(=O)C(Cl)(Cl)C,-1.699
+ClCCl,-1.699
+ClCCl,-1.7208
+COP(=O)(OC=C(Cl)Cl)OC,-0.36173
+COP(=O)(OC=C(Cl)Cl)OC,-0.33244
+OC(C(Cl)(Cl)Cl)(c1ccc(cc1)Cl)c1ccc(cc1)Cl,-1.301
+OC(C(Cl)(Cl)Cl)(c1ccc(cc1)Cl)c1ccc(cc1)Cl,-0.39794
+ClC1=C(Cl)C2(C(C1(Cl)C1C2C2CC1C1C2O1)(Cl)Cl)Cl,1.301
+ClC1=C(Cl)C2(C(C1(Cl)C1C2C2CC1C1C2O1)(Cl)Cl)Cl,0.90309
+ClC1=C(Cl)C2(C(C1(Cl)C1C2C2CC1C1C2O1)(Cl)Cl)Cl,1.0
+ClC1=C(Cl)C2(C(C1(Cl)C1C2C2CC1C1C2O1)(Cl)Cl)Cl,0.60206
+O=C(NC(=O)c1c(F)cccc1F)Nc1ccc(cc1)Cl,-0.90309
+O=C(NC(=O)c1c(F)cccc1F)Nc1ccc(cc1)Cl,-0.89209
+O=C(NC(=O)c1c(F)cccc1F)Nc1ccc(cc1)Cl,-0.8451
+CNC(=O)CSP(=S)(OC)OC,0.60206
+CNC(=O)CSP(=S)(OC)OC,-0.69897
+c1ccc(cc1)Nc1ccccc1,-1.4914
+c1ccc(cc1)Nc1ccccc1,-1.3979
+c1ccn2c(c1)c1ccccn1CC2,0.23657
+c1ccn2c(c1)c1ccccn1CC2,0.24413
+CCSCCSP(=S)(OCC)OCC,1.0
+CCSCCSP(=S)(OCC)OCC,1.3979
+CCSCCSP(=S)(OCC)OCC,0.65758
+O=C(N(C)C)Nc1ccc(c(c1)Cl)Cl,-0.77815
+O=C(N(C)C)Nc1ccc(c(c1)Cl)Cl,-0.79588
+ClCCP(=O)(O)O,-2.1761
+ClCCP(=O)(O)O,-2.6493
+ClCCP(=O)(O)O,-1.0792
+Cn1cc(c2cccc(c2)C(F)(F)F)c(=O)c(c1)c1ccccc1,-1.3979
+Cn1cc(c2cccc(c2)C(F)(F)F)c(=O)c(c1)c1ccccc1,-1.5119
+CCP(=S)(Sc1ccccc1)OCC,-0.19866
+CCP(=S)(Sc1ccccc1)OCC,-0.69897
+ClC1C(Cl)C(Cl)C(C(C1Cl)Cl)Cl,-0.69897
+ClC1C(Cl)C(Cl)C(C(C1Cl)Cl)Cl,-0.66276
+ClC1C(Cl)C(Cl)C(C(C1Cl)Cl)Cl,-0.6721
+ClC1C(Cl)C(Cl)C(C(C1Cl)Cl)Cl,-0.90309
+ClC1C(Cl)C(Cl)C(C(C1Cl)Cl)Cl,-0.60206
+C=CCOC(c1ccc(cc1Cl)Cl)Cn1cncc1,-1.6021
+C=CCOC(c1ccc(cc1Cl)Cl)Cn1cncc1,-1.1761
+CCOC(=O)CC(C(=O)OCC)SP(=S)(OC)OC,-1.699
+CCOC(=O)CC(C(=O)OCC)SP(=S)(OC)OC,-2.5563
+COc1sc(=O)n(n1)CSP(=S)(OC)OC,-0.30103
+COc1sc(=O)n(n1)CSP(=S)(OC)OC,0.09691
+COc1sc(=O)n(n1)CSP(=S)(OC)OC,-0.09691
+COc1sc(=O)n(n1)CSP(=S)(OC)OC,-0.20412
+CNC(=O)ON=C(SC)C,-1.0
+CNC(=O)ON=C(SC)C,-1.301
+CCOP(=S)(Oc1ccc(cc1)N(=O)=O)OCC,-0.54407
+CCOP(=S)(Oc1ccc(cc1)N(=O)=O)OCC,0.37675
+COP(=S)(SCN1C(=O)c2c(C1=O)cccc2)OC,-1.301
+COP(=S)(SCN1C(=O)c2c(C1=O)cccc2)OC,-0.95424
+CCN(C(=O)C(=C(OP(=O)(OC)OC)C)Cl)CC,-1.0899
+CCN(C(=O)C(=C(OP(=O)(OC)OC)C)Cl)CC,-0.17609
+CCN(c1nc(cc(n1)C)OP(=S)(OC)OC)CC,-0.39794
+CCN(c1nc(cc(n1)C)OP(=S)(OC)OC)CC,-1.1399
+CCC(=O)Nc1ccc(c(c1)Cl)Cl,-1.301
+CCC(=O)Nc1ccc(c(c1)Cl)Cl,-1.8808
+CCCC1COC(O1)(Cn1cncn1)c1ccc(cc1Cl)Cl,-1.3979
+CCCC1COC(O1)(Cn1cncn1)c1ccc(cc1Cl)Cl,-1.9823
+CCNc1nc(NCC)nc(n1)Cl,-0.69897
+CCNc1nc(NCC)nc(n1)Cl,-0.72428
+ClC=C(c1cc(Cl)c(cc1Cl)Cl)OP(=O)(OC)OC,-2.0
+ClC=C(c1cc(Cl)c(cc1Cl)Cl)OP(=O)(OC)OC,-1.699
+CN(C(=S)SSC(=S)N(C)C)C,-1.1761
+CN(C(=S)SSC(=S)N(C)C)C,-1.0607
+CN(C(=S)SSC(=S)N(C)C)C,-0.73799
+CN(C(=S)SSC(=S)N(C)C)C,-1.0792
+CC(N(c1c(cc(cc1N(=O)=O)C(F)(F)F)N(=O)=O)C(C)C)C,-1.6021
+CC(N(c1c(cc(cc1N(=O)=O)C(F)(F)F)N(=O)=O)C(C)C)C,-3.0
+COP(=O)(SC)N,0.045757
+COP(=O)(SC)N,1.0
+COP(=O)(SC)N,0.5376
+COP(=O)(NC(=O)C)SC,-1.5441
+COP(=O)(NC(=O)C)SC,-0.39794
+CCOP(=S)(Oc1cc(C)nc(n1)C(C)C)OCC,-0.17609
+CCOP(=S)(Oc1cc(C)nc(n1)C(C)C)OCC,-0.76343
+NC1CCCCC1,-1.7672
+NC1CCCCC1,-1.7782
+OC(=O)CNCP(=O)(O)O,-3.0
+OC(=O)CNCP(=O)(O)O,-2.4771
+S=C1NCCN1,0.63827
+S=C1NCCN1,0.60206
+S=C1NCCN1,-0.09691
+CON(C(=O)Nc1ccc(c(c1)Cl)Cl)C,-0.79588
+CON(C(=O)Nc1ccc(c(c1)Cl)Cl)C,-0.39794
+CCOP(=S)(Oc1nc(Cl)c(cc1Cl)Cl)OCC,-1.0
+CCOP(=S)(Oc1nc(Cl)c(cc1Cl)Cl)OCC,-1.2388e-09
+CC(OC(=O)Nc1cccc(c1)Cl)C,-2.699
+CC(OC(=O)Nc1cccc(c1)Cl)C,-3.0
+CCCCOCCOCCOCc1cc2OCOc2cc1CCC,-2.3979
+CCCCOCCOCCOCc1cc2OCOc2cc1CCC,-2.0
+N#CC(c1ccc(c(c1)Oc1ccccc1)F)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.0969
+N#CC(c1ccc(c(c1)Oc1ccccc1)F)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-0.87506
+N#CC(c1ccc(c(c1)Oc1ccccc1)F)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-0.77815
+N#CC(c1ccc(c(c1)Oc1ccccc1)F)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.0792
+ClC1C=CC2C1C1(Cl)C(=C(C2(C1(Cl)Cl)Cl)Cl)Cl,0.60206
+ClC1C=CC2C1C1(Cl)C(=C(C2(C1(Cl)Cl)Cl)Cl)Cl,0.42022
+CCOP(=S)(SCSC(C)(C)C)OCC,1.301
+CCOP(=S)(SCSC(C)(C)C)OCC,-0.30103
+CCOP(=S)(SCSC(C)(C)C)OCC,1.2218
+CN(C=Nc1ccc(cc1C)C)C=Nc1ccc(cc1C)C,-1.0086
+CN(C=Nc1ccc(cc1C)C)C=Nc1ccc(cc1C)C,-1.0
+CC(N1C(=O)c2ccccc2NS1(=O)=O)C,-1.6021
+CC(N1C(=O)c2ccccc2NS1(=O)=O)C,-1.5441
+CCSC(CC1CC(=O)C(=C(NOCC=CCl)CC)C(=O)C1)C,-2.0
+CCSC(CC1CC(=O)C(=C(NOCC=CCl)CC)C(=O)C1)C,-1.9345
+CCCN(C(=O)SCC)CCC,-0.95424
+CCCN(C(=O)SCC)CCC,-1.3979
+CSc1ccc(cc1C)OP(=S)(OC)OC,-0.57403
+CSc1ccc(cc1C)OP(=S)(OC)OC,0.14267
+CSc1ccc(cc1C)OP(=S)(OC)OC,0.34679
+c1scc(n1)c1nc2c([nH]1)cccc2,-0.30103
+c1scc(n1)c1nc2c([nH]1)cccc2,-1.6021
+c1scc(n1)c1nc2c([nH]1)cccc2,-1.4771
+Clc1c(Cl)c([N+](=O)[O-])c(c(c1Cl)Cl)Cl,-2.0969
+Clc1c(Cl)c([N+](=O)[O-])c(c(c1Cl)Cl)Cl,-2.1461
+Nc1ncn[nH]1,-0.39794
+Nc1ncn[nH]1,-0.69897
+O=C(NC(=O)c1c(F)cccc1F)Nc1cc(Cl)c(c(c1F)Cl)F,-1.415
+O=C(NC(=O)c1c(F)cccc1F)Nc1cc(Cl)c(c(c1F)Cl)F,-1.3979
+C=CCC1=C(C)C(CC1=O)OC(=O)C1C(C1(C)C)C=C(C)C,-1.8751
+C=CCC1=C(C)C(CC1=O)OC(=O)C1C(C1(C)C)C=C(C)C,-1.3892
+C=CCC1=C(C)C(CC1=O)OC(=O)C1C(C1(C)C)C=C(C)C,-1.1399
+CON=C(c1ccccc1COc1ccccc1C)C(=O)OC,-2.574
+CON=C(c1ccccc1COc1ccccc1C)C(=O)OC,-2.5682
+CC(N(c1c(cc(cc1N(=O)=O)S(=O)(=O)N)N(=O)=O)C(C)C)C,-1.5666
+CC(N(c1c(cc(cc1N(=O)=O)S(=O)(=O)N)N(=O)=O)C(C)C)C,-1.6532
+CCCN(C(=O)n1cncc1)CCOc1c(Cl)cc(cc1Cl)Cl,-0.87506
+CCCN(C(=O)n1cncc1)CCOc1c(Cl)cc(cc1Cl)Cl,-0.70757
+N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)(C)C,-1.2889
+N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)(C)C,-1.3979
+N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.8751
+N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.699
+ClC1C2OC2C2C1C1(Cl)C(=C(C2(C1(Cl)Cl)Cl)Cl)Cl,0.69897
+ClC1C2OC2C2C1C1(Cl)C(=C(C2(C1(Cl)Cl)Cl)Cl)Cl,1.6021
+N#CC(c1ccccc1)(Cn1cncn1)CCc1ccc(cc1)Cl,-1.6021
+N#CC(c1ccccc1)(Cn1cncn1)CCc1ccc(cc1)Cl,-1.4771
+O=C(C1C(C1(C)C)C=C(C(F)(F)F)Cl)OCc1cccc(c1C)c1ccccc1,-0.69897
+O=C(C1C(C1(C)C)C=C(C(F)(F)F)Cl)OCc1cccc(c1C)c1ccccc1,-0.90309
+[O-][N+](=O)c1cc(Cl)c(c(c1)Cl)N,-2.1761
+[O-][N+](=O)c1cc(Cl)c(c(c1)Cl)N,-2.3802
+CC(Oc1cccc(c1)NC(=O)c1ccccc1C(F)(F)F)C,-2.699
+CC(Oc1cccc(c1)NC(=O)c1ccccc1C(F)(F)F)C,-1.9395
+CCCCNC(=O)n1c(NC(=O)OC)nc2c1cccc2,-2.0607
+CCCCNC(=O)n1c(NC(=O)OC)nc2c1cccc2,-2.3979
+ClC(SN1C(=O)c2c(C1=O)cccc2)(Cl)Cl,-2.699
+ClC(SN1C(=O)c2c(C1=O)cccc2)(Cl)Cl,-1.6021
+ClC(SN1C(=O)c2c(C1=O)cccc2)(Cl)Cl,-1.699
+C#CCOS(=O)OC1CCCCC1Oc1ccc(cc1)C(C)(C)C,-2.0
+C#CCOS(=O)OC1CCCCC1Oc1ccc(cc1)C(C)(C)C,-1.2788
+N#Cc1sc2=c(sc1C#N)c(=O)c1c(c2=O)cccc1,-1.0
+N#Cc1sc2=c(sc1C#N)c(=O)c1c(c2=O)cccc1,-0.77815
+CN(C(=O)Oc1nc(nc(c1C)C)N(C)C)C,-1.0969
+CN(C(=O)Oc1nc(nc(c1C)C)N(C)C)C,-1.0899
+CNc1cnn(c(=O)c1Cl)c1cccc(c1)C(F)(F)F,-1.273
+CNc1cnn(c(=O)c1Cl)c1cccc(c1)C(F)(F)F,-1.7097
+Clc1ccccc1c1nnc(nn1)c1ccccc1Cl,-1.301
+Clc1ccccc1c1nnc(nn1)c1ccccc1Cl,-1.238
+CCCCCCCCc1cc(N(=O)=O)c(c(c1)N(=O)=O)OC(=O)C=CC,-1.699
+CCCCCCCCc1cc(N(=O)=O)c(c(c1)N(=O)=O)OC(=O)C=CC,-1.8062
+CCSC(=O)N1CCCCCC1,-1.1706
+CCSC(=O)N1CCCCCC1,-0.30103
+OC(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,-1.3979
+OC(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,-2.0212
+CCOP(=O)(Oc1ccc(c(c1)C)SC)NC(C)C,-0.17609
+CCOP(=O)(Oc1ccc(c(c1)C)SC)NC(C)C,-0.23045
+Cn1ccc(cc1)c1ccn(cc1)C,-0.57403
+Cn1ccc(cc1)c1ccn(cc1)C,-0.40654
+CCCCC(c1ccc(cc1)Cl)(Cn1cncn1)C#N,-0.993
+CCCCC(c1ccc(cc1)Cl)(Cn1cncn1)C#N,-0.99123
+Fc1ccc(cc1)C(=O)CCCN1CCN(CC1)c1ccccn1,-2.0607
+Fc1ccc(cc1)C(=O)CCCN1CCN(CC1)c1ccccn1,-1.4771
+N#CC(c1c(Cl)ccc(c1Cl)n1ncc(=O)[nH]c1=O)c1ccc(cc1)Cl,-1.1761
+N#CC(c1c(Cl)ccc(c1Cl)n1ncc(=O)[nH]c1=O)c1ccc(cc1)Cl,-1.3617
+COC1CC(OC2C(C)C=CC=C3COC4C3(O)C(C=C(C4O)C)C(=O)OC3CC(CC=C2C)OC2(C3)C=CC(C(O2)C(C)C)C)OC(C1OC1CC(OC)C(C(O1)C)NC(=O)C)C,-1.9287e-16
+COC1CC(OC2C(C)C=CC=C3COC4C3(O)C(C=C(C4O)C)C(=O)OC3CC(CC=C2C)OC2(C3)C=CC(C(O2)C(C)C)C)OC(C1OC1CC(OC)C(C(O1)C)NC(=O)C)C,-0.39794
diff --git a/data/mazzatorta_mg_dup.csv b/data/mazzatorta_mg_dup.csv
new file mode 100644
index 0000000..3e938ed
--- /dev/null
+++ b/data/mazzatorta_mg_dup.csv
@@ -0,0 +1,215 @@
+SMILES,LOAEL
+OCC(C1OC(=O)C(=C1O)O)O,-3.4844
+OCC(C1OC(=O)C(=C1O)O)O,-3.1915
+CC(c1ccccc1)C,-2.6646
+CC(c1ccccc1)C,-2.5198
+CCc1ccccc1,-2.6107
+CCc1ccccc1,-2.4639
+OCCO,-2.3979
+OCCO,-2.6021
+OCCO,-3.301
+OCCO,-2.9638
+OCCO,-3.0
+C=Cc1ccccc1,-1.3222
+C=Cc1ccccc1,-2.4548
+C=Cc1ccccc1,-2.6021
+ClC(C(c1ccc(cc1)Cl)c1ccc(cc1)Cl)(Cl)Cl,-1.5051
+ClC(C(c1ccc(cc1)Cl)c1ccc(cc1)Cl)(Cl)Cl,0.60206
+COCN(c1c(CC)cccc1CC)C(=O)CCl,-1.1761
+COCN(c1c(CC)cccc1CC)C(=O)CCl,-1.1461
+COP(=S)(SCn1nnc2c(c1=O)cccc2)OC,0.4437
+COP(=S)(SCn1nnc2c(c1=O)cccc2)OC,-0.35218
+COP(=S)(SCn1nnc2c(c1=O)cccc2)OC,-0.41162
+O=C(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,-1.3979
+O=C(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,-2.0569
+ClC(C(SN1C(=O)C2C(C1=O)CC=CC2)(Cl)Cl)Cl,-1.0792
+ClC(C(SN1C(=O)C2C(C1=O)CC=CC2)(Cl)Cl)Cl,-1.1761
+CNC(=O)Oc1cccc2c1cccc2,-1.1931
+CNC(=O)Oc1cccc2c1cccc2,-1.7782
+ClC1CC2C(C1Cl)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,0.56864
+ClC1CC2C(C1Cl)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,0.30103
+ClC1CC2C(C1Cl)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,-0.77815
+ClC1CC2C(C1Cl)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,0.60206
+Nc1ccc(cc1)Cl,-0.77815
+Nc1ccc(cc1)Cl,-1.0969
+CCOP(=S)(Oc1ccc2c(c1)oc(=O)c(c2C)Cl)OCC,0.09691
+CCOP(=S)(Oc1ccc2c(c1)oc(=O)c(c2C)Cl)OCC,-0.23045
+COC(=O)c1c(Cl)c(Cl)c(c(c1Cl)Cl)C(=O)OC,-2.699
+COC(=O)c1c(Cl)c(Cl)c(c(c1Cl)Cl)C(=O)OC,-1.0
+OC(=O)C(Cl)(Cl)C,-1.4498
+OC(=O)C(Cl)(Cl)C,-1.699
+ClCCl,-1.699
+ClCCl,-1.7208
+COP(=O)(OC=C(Cl)Cl)OC,-0.36173
+COP(=O)(OC=C(Cl)Cl)OC,-0.33244
+OC(C(Cl)(Cl)Cl)(c1ccc(cc1)Cl)c1ccc(cc1)Cl,-1.301
+OC(C(Cl)(Cl)Cl)(c1ccc(cc1)Cl)c1ccc(cc1)Cl,-0.39794
+ClC1=C(Cl)C2(C(C1(Cl)C1C2C2CC1C1C2O1)(Cl)Cl)Cl,1.301
+ClC1=C(Cl)C2(C(C1(Cl)C1C2C2CC1C1C2O1)(Cl)Cl)Cl,0.90309
+ClC1=C(Cl)C2(C(C1(Cl)C1C2C2CC1C1C2O1)(Cl)Cl)Cl,1.0
+ClC1=C(Cl)C2(C(C1(Cl)C1C2C2CC1C1C2O1)(Cl)Cl)Cl,0.60206
+O=C(NC(=O)c1c(F)cccc1F)Nc1ccc(cc1)Cl,-0.90309
+O=C(NC(=O)c1c(F)cccc1F)Nc1ccc(cc1)Cl,-0.89209
+O=C(NC(=O)c1c(F)cccc1F)Nc1ccc(cc1)Cl,-0.8451
+CNC(=O)CSP(=S)(OC)OC,0.60206
+CNC(=O)CSP(=S)(OC)OC,-0.69897
+c1ccc(cc1)Nc1ccccc1,-1.4914
+c1ccc(cc1)Nc1ccccc1,-1.3979
+c1ccn2c(c1)c1ccccn1CC2,0.23657
+c1ccn2c(c1)c1ccccn1CC2,0.24413
+CCSCCSP(=S)(OCC)OCC,1.0
+CCSCCSP(=S)(OCC)OCC,1.3979
+CCSCCSP(=S)(OCC)OCC,0.65758
+O=C(N(C)C)Nc1ccc(c(c1)Cl)Cl,-0.77815
+O=C(N(C)C)Nc1ccc(c(c1)Cl)Cl,-0.79588
+ClCCP(=O)(O)O,-2.1761
+ClCCP(=O)(O)O,-2.6493
+ClCCP(=O)(O)O,-1.0792
+Cn1cc(c2cccc(c2)C(F)(F)F)c(=O)c(c1)c1ccccc1,-1.3979
+Cn1cc(c2cccc(c2)C(F)(F)F)c(=O)c(c1)c1ccccc1,-1.5119
+CCP(=S)(Sc1ccccc1)OCC,-0.19866
+CCP(=S)(Sc1ccccc1)OCC,-0.69897
+ClC1C(Cl)C(Cl)C(C(C1Cl)Cl)Cl,-0.69897
+ClC1C(Cl)C(Cl)C(C(C1Cl)Cl)Cl,-0.66276
+ClC1C(Cl)C(Cl)C(C(C1Cl)Cl)Cl,-0.6721
+ClC1C(Cl)C(Cl)C(C(C1Cl)Cl)Cl,-0.90309
+ClC1C(Cl)C(Cl)C(C(C1Cl)Cl)Cl,-0.60206
+C=CCOC(c1ccc(cc1Cl)Cl)Cn1cncc1,-1.6021
+C=CCOC(c1ccc(cc1Cl)Cl)Cn1cncc1,-1.1761
+CCOC(=O)CC(C(=O)OCC)SP(=S)(OC)OC,-1.699
+CCOC(=O)CC(C(=O)OCC)SP(=S)(OC)OC,-2.5563
+COc1sc(=O)n(n1)CSP(=S)(OC)OC,-0.30103
+COc1sc(=O)n(n1)CSP(=S)(OC)OC,0.09691
+COc1sc(=O)n(n1)CSP(=S)(OC)OC,-0.09691
+COc1sc(=O)n(n1)CSP(=S)(OC)OC,-0.20412
+CNC(=O)ON=C(SC)C,-1.0
+CNC(=O)ON=C(SC)C,-1.301
+CCOP(=S)(Oc1ccc(cc1)N(=O)=O)OCC,-0.54407
+CCOP(=S)(Oc1ccc(cc1)N(=O)=O)OCC,0.37675
+COP(=S)(SCN1C(=O)c2c(C1=O)cccc2)OC,-1.301
+COP(=S)(SCN1C(=O)c2c(C1=O)cccc2)OC,-0.95424
+CCN(C(=O)C(=C(OP(=O)(OC)OC)C)Cl)CC,-1.0899
+CCN(C(=O)C(=C(OP(=O)(OC)OC)C)Cl)CC,-0.17609
+CCN(c1nc(cc(n1)C)OP(=S)(OC)OC)CC,-0.39794
+CCN(c1nc(cc(n1)C)OP(=S)(OC)OC)CC,-1.1399
+CCC(=O)Nc1ccc(c(c1)Cl)Cl,-1.301
+CCC(=O)Nc1ccc(c(c1)Cl)Cl,-1.8808
+CCCC1COC(O1)(Cn1cncn1)c1ccc(cc1Cl)Cl,-1.3979
+CCCC1COC(O1)(Cn1cncn1)c1ccc(cc1Cl)Cl,-1.9823
+CCNc1nc(NCC)nc(n1)Cl,-0.69897
+CCNc1nc(NCC)nc(n1)Cl,-0.72428
+ClC=C(c1cc(Cl)c(cc1Cl)Cl)OP(=O)(OC)OC,-2.0
+ClC=C(c1cc(Cl)c(cc1Cl)Cl)OP(=O)(OC)OC,-1.699
+CN(C(=S)SSC(=S)N(C)C)C,-1.1761
+CN(C(=S)SSC(=S)N(C)C)C,-1.0607
+CN(C(=S)SSC(=S)N(C)C)C,-0.73799
+CN(C(=S)SSC(=S)N(C)C)C,-1.0792
+CC(N(c1c(cc(cc1N(=O)=O)C(F)(F)F)N(=O)=O)C(C)C)C,-1.6021
+CC(N(c1c(cc(cc1N(=O)=O)C(F)(F)F)N(=O)=O)C(C)C)C,-3.0
+COP(=O)(SC)N,0.045757
+COP(=O)(SC)N,1.0
+COP(=O)(SC)N,0.5376
+COP(=O)(NC(=O)C)SC,-1.5441
+COP(=O)(NC(=O)C)SC,-0.39794
+CCOP(=S)(Oc1cc(C)nc(n1)C(C)C)OCC,-0.17609
+CCOP(=S)(Oc1cc(C)nc(n1)C(C)C)OCC,-0.76343
+NC1CCCCC1,-1.7672
+NC1CCCCC1,-1.7782
+OC(=O)CNCP(=O)(O)O,-3.0
+OC(=O)CNCP(=O)(O)O,-2.4771
+S=C1NCCN1,0.63827
+S=C1NCCN1,0.60206
+S=C1NCCN1,-0.09691
+CON(C(=O)Nc1ccc(c(c1)Cl)Cl)C,-0.79588
+CON(C(=O)Nc1ccc(c(c1)Cl)Cl)C,-0.39794
+CCOP(=S)(Oc1nc(Cl)c(cc1Cl)Cl)OCC,-1.0
+CCOP(=S)(Oc1nc(Cl)c(cc1Cl)Cl)OCC,-1.2388e-09
+CC(OC(=O)Nc1cccc(c1)Cl)C,-2.699
+CC(OC(=O)Nc1cccc(c1)Cl)C,-3.0
+CCCCOCCOCCOCc1cc2OCOc2cc1CCC,-2.3979
+CCCCOCCOCCOCc1cc2OCOc2cc1CCC,-2.0
+N#CC(c1ccc(c(c1)Oc1ccccc1)F)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.0969
+N#CC(c1ccc(c(c1)Oc1ccccc1)F)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-0.87506
+N#CC(c1ccc(c(c1)Oc1ccccc1)F)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-0.77815
+N#CC(c1ccc(c(c1)Oc1ccccc1)F)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.0792
+ClC1C=CC2C1C1(Cl)C(=C(C2(C1(Cl)Cl)Cl)Cl)Cl,0.60206
+ClC1C=CC2C1C1(Cl)C(=C(C2(C1(Cl)Cl)Cl)Cl)Cl,0.42022
+CCOP(=S)(SCSC(C)(C)C)OCC,1.301
+CCOP(=S)(SCSC(C)(C)C)OCC,-0.30103
+CCOP(=S)(SCSC(C)(C)C)OCC,1.2218
+CN(C=Nc1ccc(cc1C)C)C=Nc1ccc(cc1C)C,-1.0086
+CN(C=Nc1ccc(cc1C)C)C=Nc1ccc(cc1C)C,-1.0
+CC(N1C(=O)c2ccccc2NS1(=O)=O)C,-1.6021
+CC(N1C(=O)c2ccccc2NS1(=O)=O)C,-1.5441
+CCSC(CC1CC(=O)C(=C(NOCC=CCl)CC)C(=O)C1)C,-2.0
+CCSC(CC1CC(=O)C(=C(NOCC=CCl)CC)C(=O)C1)C,-1.9345
+CCCN(C(=O)SCC)CCC,-0.95424
+CCCN(C(=O)SCC)CCC,-1.3979
+CSc1ccc(cc1C)OP(=S)(OC)OC,-0.57403
+CSc1ccc(cc1C)OP(=S)(OC)OC,0.14267
+CSc1ccc(cc1C)OP(=S)(OC)OC,0.34679
+c1scc(n1)c1nc2c([nH]1)cccc2,-0.30103
+c1scc(n1)c1nc2c([nH]1)cccc2,-1.6021
+c1scc(n1)c1nc2c([nH]1)cccc2,-1.4771
+Clc1c(Cl)c([N+](=O)[O-])c(c(c1Cl)Cl)Cl,-2.0969
+Clc1c(Cl)c([N+](=O)[O-])c(c(c1Cl)Cl)Cl,-2.1461
+Nc1ncn[nH]1,-0.39794
+Nc1ncn[nH]1,-0.69897
+O=C(NC(=O)c1c(F)cccc1F)Nc1cc(Cl)c(c(c1F)Cl)F,-1.415
+O=C(NC(=O)c1c(F)cccc1F)Nc1cc(Cl)c(c(c1F)Cl)F,-1.3979
+C=CCC1=C(C)C(CC1=O)OC(=O)C1C(C1(C)C)C=C(C)C,-1.8751
+C=CCC1=C(C)C(CC1=O)OC(=O)C1C(C1(C)C)C=C(C)C,-1.3892
+C=CCC1=C(C)C(CC1=O)OC(=O)C1C(C1(C)C)C=C(C)C,-1.1399
+CON=C(c1ccccc1COc1ccccc1C)C(=O)OC,-2.574
+CON=C(c1ccccc1COc1ccccc1C)C(=O)OC,-2.5682
+CC(N(c1c(cc(cc1N(=O)=O)S(=O)(=O)N)N(=O)=O)C(C)C)C,-1.5666
+CC(N(c1c(cc(cc1N(=O)=O)S(=O)(=O)N)N(=O)=O)C(C)C)C,-1.6532
+CCCN(C(=O)n1cncc1)CCOc1c(Cl)cc(cc1Cl)Cl,-0.87506
+CCCN(C(=O)n1cncc1)CCOc1c(Cl)cc(cc1Cl)Cl,-0.70757
+N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)(C)C,-1.2889
+N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)(C)C,-1.3979
+N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.8751
+N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.699
+ClC1C2OC2C2C1C1(Cl)C(=C(C2(C1(Cl)Cl)Cl)Cl)Cl,0.69897
+ClC1C2OC2C2C1C1(Cl)C(=C(C2(C1(Cl)Cl)Cl)Cl)Cl,1.6021
+N#CC(c1ccccc1)(Cn1cncn1)CCc1ccc(cc1)Cl,-1.6021
+N#CC(c1ccccc1)(Cn1cncn1)CCc1ccc(cc1)Cl,-1.4771
+O=C(C1C(C1(C)C)C=C(C(F)(F)F)Cl)OCc1cccc(c1C)c1ccccc1,-0.69897
+O=C(C1C(C1(C)C)C=C(C(F)(F)F)Cl)OCc1cccc(c1C)c1ccccc1,-0.90309
+[O-][N+](=O)c1cc(Cl)c(c(c1)Cl)N,-2.1761
+[O-][N+](=O)c1cc(Cl)c(c(c1)Cl)N,-2.3802
+CC(Oc1cccc(c1)NC(=O)c1ccccc1C(F)(F)F)C,-2.699
+CC(Oc1cccc(c1)NC(=O)c1ccccc1C(F)(F)F)C,-1.9395
+CCCCNC(=O)n1c(NC(=O)OC)nc2c1cccc2,-2.0607
+CCCCNC(=O)n1c(NC(=O)OC)nc2c1cccc2,-2.3979
+ClC(SN1C(=O)c2c(C1=O)cccc2)(Cl)Cl,-2.699
+ClC(SN1C(=O)c2c(C1=O)cccc2)(Cl)Cl,-1.6021
+ClC(SN1C(=O)c2c(C1=O)cccc2)(Cl)Cl,-1.699
+C#CCOS(=O)OC1CCCCC1Oc1ccc(cc1)C(C)(C)C,-2.0
+C#CCOS(=O)OC1CCCCC1Oc1ccc(cc1)C(C)(C)C,-1.2788
+N#Cc1sc2=c(sc1C#N)c(=O)c1c(c2=O)cccc1,-1.0
+N#Cc1sc2=c(sc1C#N)c(=O)c1c(c2=O)cccc1,-0.77815
+CN(C(=O)Oc1nc(nc(c1C)C)N(C)C)C,-1.0969
+CN(C(=O)Oc1nc(nc(c1C)C)N(C)C)C,-1.0899
+CNc1cnn(c(=O)c1Cl)c1cccc(c1)C(F)(F)F,-1.273
+CNc1cnn(c(=O)c1Cl)c1cccc(c1)C(F)(F)F,-1.7097
+Clc1ccccc1c1nnc(nn1)c1ccccc1Cl,-1.301
+Clc1ccccc1c1nnc(nn1)c1ccccc1Cl,-1.238
+CCCCCCCCc1cc(N(=O)=O)c(c(c1)N(=O)=O)OC(=O)C=CC,-1.699
+CCCCCCCCc1cc(N(=O)=O)c(c(c1)N(=O)=O)OC(=O)C=CC,-1.8062
+CCSC(=O)N1CCCCCC1,-1.1706
+CCSC(=O)N1CCCCCC1,-0.30103
+OC(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,-1.3979
+OC(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,-2.0212
+CCOP(=O)(Oc1ccc(c(c1)C)SC)NC(C)C,-0.17609
+CCOP(=O)(Oc1ccc(c(c1)C)SC)NC(C)C,-0.23045
+Cn1ccc(cc1)c1ccn(cc1)C,-0.57403
+Cn1ccc(cc1)c1ccn(cc1)C,-0.40654
+CCCCC(c1ccc(cc1)Cl)(Cn1cncn1)C#N,-0.993
+CCCCC(c1ccc(cc1)Cl)(Cn1cncn1)C#N,-0.99123
+Fc1ccc(cc1)C(=O)CCCN1CCN(CC1)c1ccccn1,-2.0607
+Fc1ccc(cc1)C(=O)CCCN1CCN(CC1)c1ccccn1,-1.4771
+N#CC(c1c(Cl)ccc(c1Cl)n1ncc(=O)[nH]c1=O)c1ccc(cc1)Cl,-1.1761
+N#CC(c1c(Cl)ccc(c1Cl)n1ncc(=O)[nH]c1=O)c1ccc(cc1)Cl,-1.3617
+COC1CC(OC2C(C)C=CC=C3COC4C3(O)C(C=C(C4O)C)C(=O)OC3CC(CC=C2C)OC2(C3)C=CC(C(O2)C(C)C)C)OC(C1OC1CC(OC)C(C(O1)C)NC(=O)C)C,-1.9287e-16
+COC1CC(OC2C(C)C=CC=C3COC4C3(O)C(C=C(C4O)C)C(=O)OC3CC(CC=C2C)OC2(C3)C=CC(C(O2)C(C)C)C)OC(C1OC1CC(OC)C(C(O1)C)NC(=O)C)C,-0.39794
diff --git a/data/swiss_mg_dup.csv b/data/swiss_mg_dup.csv
new file mode 100644
index 0000000..3412c2e
--- /dev/null
+++ b/data/swiss_mg_dup.csv
@@ -0,0 +1,194 @@
+SMILES,LOAEL
+OC(=O)COc1ccc(cc1Cl)Cl,-1.7924
+OC(=O)COc1ccc(cc1Cl)Cl,-0.69897
+OC(=O)COc1ccc(cc1Cl)Cl,-1.8751
+CCOCN(c1c(C)cccc1CC)C(=O)CCl,-1.6767
+CCOCN(c1c(C)cccc1CC)C(=O)CCl,-1.8388
+CCOCN(c1c(C)cccc1CC)C(=O)CCl,-1.8254
+Clc1c(ccc(c1N)[N+](=O)[O-])Oc1ccccc1,-0.90309
+Clc1c(ccc(c1N)[N+](=O)[O-])Oc1ccccc1,-1.7924
+COCN(c1c(CC)cccc1CC)C(=O)CCl,-0.39794
+COCN(c1c(CC)cccc1CC)C(=O)CCl,-1.1461
+COCN(c1c(CC)cccc1CC)C(=O)CCl,-2.1004
+OCC(CCl)O,-1.4771
+OCC(CCl)O,-0.041393
+Nc1n[nH]cn1,-0.39794
+Nc1n[nH]cn1,-0.69897
+CCNc1nc(NC(C)C)nc(n1)Cl,-0.49136
+CCNc1nc(NC(C)C)nc(n1)Cl,-1.0
+CCNc1nc(NC(C)C)nc(n1)Cl,-1.3729
+CCOC(=O)CCN(C(C)C)SN(C(=O)Oc1cccc2c1OC(C2)(C)C)C,-1.0414
+CCOC(=O)CCN(C(C)C)SN(C(=O)Oc1cccc2c1OC(C2)(C)C)C,-1.3802
+Fc1ccc(c(c1)c1ccc(c(c1)Cl)Cl)NC(=O)c1cn(nc1C(F)F)C,-1.2405
+Fc1ccc(c(c1)c1ccc(c(c1)Cl)Cl)NC(=O)c1cn(nc1C(F)F)C,-1.0828
+ClC(SN1C(=O)C2C(C1=O)CC=CC2)(Cl)Cl,-2.0
+ClC(SN1C(=O)C2C(C1=O)CC=CC2)(Cl)Cl,-1.9912
+CNC(=O)Oc1cccc2c1OC(C2)(C)C,-0.068186
+CNC(=O)Oc1cccc2c1OC(C2)(C)C,-0.69897
+Nc1cnn(c(=O)c1Cl)c1ccccc1,-1.7782
+Nc1cnn(c(=O)c1Cl)c1ccccc1,-1.699
+ClCC[N+](C)(C)C.[Cl-],-2.1335
+ClCC[N+](C)(C)C.[Cl-],-2.0969
+ClCC[N+](C)(C)C.[Cl-],-1.8751
+N#Cc1c(Cl)c(C#N)c(c(c1Cl)Cl)Cl,-0.51851
+N#Cc1c(Cl)c(C#N)c(c(c1Cl)Cl)Cl,-1.6021
+CCOP(=S)(Oc1nc(Cl)c(cc1Cl)Cl)OCC,-1.0
+CCOP(=S)(Oc1nc(Cl)c(cc1Cl)Cl)OCC,-0.77815
+CCOP(=S)(Oc1nc(Cl)c(cc1Cl)Cl)OCC,-0.0
+CCO/N=C(\C1=C(O)CC(CC1=O)C1CCCSC1)/CCC,-1.4472
+CCO/N=C(\C1=C(O)CC(CC1=O)C1CCCSC1)/CCC,-0.80618
+N#CC(c1ccc(c(c1)Oc1ccccc1)F)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.2833
+N#CC(c1ccc(c(c1)Oc1ccccc1)F)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.3579
+O[Sn](C1CCCCC1)(C1CCCCC1)C1CCCCC1,-0.14301
+O[Sn](C1CCCCC1)(C1CCCCC1)C1CCCCC1,-0.77815
+O[Sn](C1CCCCC1)(C1CCCCC1)C1CCCCC1,-0.0
+CCNC(=O)NC(=O)/C(=N\OC)/C#N,-1.3711
+CCNC(=O)NC(=O)/C(=N\OC)/C#N,-1.4814
+N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.699
+N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.8751
+CN1CN(C)CSC1=S,-0.5563
+CN1CN(C)CSC1=S,-0.72428
+N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Br)Br,-0.39794
+N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Br)Br,-0.73239
+COc1cc(ccc1OC)/C(=C/C(=O)N1CCOCC1)/c1ccc(cc1)Cl,-1.5563
+COc1cc(ccc1OC)/C(=C/C(=O)N1CCOCC1)/c1ccc(cc1)Cl,-1.5315
+c1cc[n+]2c(c1)c1cccc[n+]1CC2,-0.46389
+c1cc[n+]2c(c1)c1cccc[n+]1CC2,-0.59106
+OC(=O)c1ccccc1.CCC(C1OC2(C=CC1C)OC1CC=C(C)C(OC3CC(OC)C(C(O3)C)OC3CC(OC)C(C(O3)C)NC)C(C)C=CC=C3C4(C(C(=O)OC(C2)C1)C=C(C)C(C4OC3)O)O)C,-0.39794
+OC(=O)c1ccccc1.CCC(C1OC2(C=CC1C)OC1CC=C(C)C(OC3CC(OC)C(C(O3)C)OC3CC(OC)C(C(O3)C)NC)C(C)C=CC=C3C4(C(C(=O)OC(C2)C1)C=C(C)C(C4OC3)O)O)C,-0.0
+O=S1OCC2C(CO1)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,-1.0414
+O=S1OCC2C(CO1)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,-0.4624
+O=S1OCC2C(CO1)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,-0.69897
+Fc1ccc(cc1)C1(Cn2cncn2)OC1c1ccccc1Cl,-0.69897
+Fc1ccc(cc1)C1(Cn2cncn2)OC1c1ccccc1Cl,-0.77815
+CCN(c1c(cc(cc1[N+](=O)[O-])C(F)(F)F)[N+](=O)[O-])CC(=C)C,-1.0294
+CCN(c1c(cc(cc1[N+](=O)[O-])C(F)(F)F)[N+](=O)[O-])CC(=C)C,-1.4624
+CCCSP(=O)(SCCC)OCC,0.30103
+CCCSP(=O)(SCCC)OCC,-0.38739
+CCOP(=O)(Oc1ccc(c(c1)C)SC)NC(C)C,-0.23045
+CCOP(=O)(Oc1ccc(c(c1)C)SC)NC(C)C,0.33724
+Clc1ccc(cc1)C(c1ccccc1Cl)(c1cncnc1)O,-0.5563
+Clc1ccc(cc1)C(c1ccccc1Cl)(c1cncnc1)O,-1.243
+Clc1ccc(cc1)C(c1ccccc1Cl)(c1cncnc1)O,-0.30103
+Clc1ccc(cc1)C(c1ccccc1Cl)(c1cncnc1)O,-0.39794
+CSc1ccc(cc1C)OP(=S)(OC)OC,0.69897
+CSc1ccc(cc1C)OP(=S)(OC)OC,0.14267
+Clc1cc(cnc1Nc1c(cc(c(c1[N+](=O)[O-])Cl)C(F)(F)F)[N+](=O)[O-])C(F)(F)F,-0.59106
+Clc1cc(cnc1Nc1c(cc(c(c1[N+](=O)[O-])Cl)C(F)(F)F)[N+](=O)[O-])C(F)(F)F,-0.58206
+O=C(c1cccc(c1C(=O)NC(CS(=O)(=O)C)(C)C)I)Nc1ccc(cc1C)C(C(F)(F)F)(C(F)(F)F)F,-1.8976
+O=C(c1cccc(c1C(=O)NC(CS(=O)(=O)C)(C)C)I)Nc1ccc(cc1C)C(C(F)(F)F)(C(F)(F)F)F,-1.5315
+O=C(NC(=O)c1c(F)cccc1F)Nc1ccc(cc1F)Oc1ccc(cc1Cl)C(F)(F)F,-2.3617
+O=C(NC(=O)c1c(F)cccc1F)Nc1ccc(cc1F)Oc1ccc(cc1Cl)C(F)(F)F,-2.3385
+O=C(N(C)C)Nc1cccc(c1)C(F)(F)F,-1.2348
+O=C(N(C)C)Nc1cccc(c1)C(F)(F)F,-1.4771
+CC(C(c1cncnc1)(c1ccc(cc1)OC(F)(F)F)O)C,-0.99123
+CC(C(c1cncnc1)(c1ccc(cc1)OC(F)(F)F)O)C,-1.6345
+Fc1ccc(cc1)[Si](c1ccc(cc1)F)(Cn1cncn1)C,-1.0
+Fc1ccc(cc1)[Si](c1ccc(cc1)F)(Cn1cncn1)C,-0.69897
+OC(=O)CNCP(=O)(O)O,-2.9731
+OC(=O)CNCP(=O)(O)O,-2.7482
+OC(=O)CNCP(=O)(O)O,-2.4771
+OC(=O)CNCP(=O)(O)O,-3.0842
+C=CCOC(c1ccc(cc1Cl)Cl)Cn1cncc1,-1.2014
+C=CCOC(c1ccc(cc1Cl)Cl)Cn1cncc1,-1.1761
+CCC(c1noc(c1)NC(=O)c1c(OC)cccc1OC)(CC)C,-2.7218
+CCC(c1noc(c1)NC(=O)c1c(OC)cccc1OC)(CC)C,-2.7782
+CO/N=C(\c1ccccc1COc1ccccc1C)/C(=O)OC,-2.8763
+CO/N=C(\c1ccccc1COc1ccccc1C)/C(=O)OC,-2.574
+CON(C(=O)Nc1ccc(c(c1)Cl)Cl)C,-0.79588
+CON(C(=O)Nc1ccc(c(c1)Cl)Cl)C,-1.4949
+CCOC(=O)CC(C(=O)OCC)SP(=S)(OC)OC,-1.4624
+CCOC(=O)CC(C(=O)OCC)SP(=S)(OC)OC,-2.1553
+C[N+]1(C)CCCCC1.[Cl-],-2.4281
+C[N+]1(C)CCCCC1.[Cl-],-2.8351
+CCCCCCC(c1cc(cc(c1OC(=O)/C=C\C)[N+](=O)[O-])[N+](=O)[O-])C,-2.0821
+CCCCCCC(c1cc(cc(c1OC(=O)/C=C\C)[N+](=O)[O-])[N+](=O)[O-])C,-1.8513
+Cc1nnc(c(=O)n1N)c1ccccc1,-1.29
+Cc1nnc(c(=O)n1N)c1ccccc1,-1.8727
+ClCC(=O)N(c1c(C)cccc1C)Cn1cccn1,-1.9395
+ClCC(=O)N(c1c(C)cccc1C)Cn1cccn1,-1.2455
+OC1(Cn2ncnc2)C(CCC1(C)C)Cc1ccc(cc1)Cl,-1.1173
+OC1(Cn2ncnc2)C(CCC1(C)C)Cc1ccc(cc1)Cl,-1.1399
+COc1sc(=O)n(n1)CSP(=S)(OC)OC,0.19382
+COc1sc(=O)n(n1)CSP(=S)(OC)OC,-0.23553
+CBr,-1.0414
+CBr,-1.2304
+CSc1nnc(c(=O)n1N)C(C)(C)C,-1.1139
+CSc1nnc(c(=O)n1N)C(C)(C)C,-1.1584
+CCCCC(c1ccc(cc1)Cl)(Cn1cncn1)C#N,-0.99564
+CCCCC(c1ccc(cc1)Cl)(Cn1cncn1)C#N,-2.0253
+CCN(C(=O)C(Oc1cccc2c1cccc2)C)CC,-1.6772
+CCN(C(=O)C(Oc1cccc2c1cccc2)C)CC,-2.0
+Oc1ccccc1c1ccccc1,-3.0
+Oc1ccccc1c1ccccc1,-2.301
+Oc1ccccc1c1ccccc1,-2.7251
+CCCN(c1c(cc(cc1[N+](=O)[O-])S(=O)(=O)N)[N+](=O)[O-])CCC,-1.1461
+CCCN(c1c(cc(cc1[N+](=O)[O-])S(=O)(=O)N)[N+](=O)[O-])CCC,-1.5563
+CNC(=O)ON=C(C(=O)N(C)C)SC,-0.62221
+CNC(=O)ON=C(C(=O)N(C)C)SC,-0.6902
+C[n+]1ccc(cc1)c1cc[n+](cc1)C,-0.40654
+C[n+]1ccc(cc1)c1cc[n+](cc1)C,-0.94792
+C[n+]1ccc(cc1)c1cc[n+](cc1)C,-0.57403
+CCOP(=S)(Oc1ccc(cc1)[N+](=O)[O-])OCC,-0.39794
+CCOP(=S)(Oc1ccc(cc1)[N+](=O)[O-])OCC,-0.20412
+COP(=S)(Oc1ccc(cc1)[N+](=O)[O-])OC,-0.39794
+COP(=S)(Oc1ccc(cc1)[N+](=O)[O-])OC,0.25964
+COP(=S)(Oc1ccc(cc1)[N+](=O)[O-])OC,0.30103
+CC(CC(c1sccc1NC(=O)c1cn(nc1C(F)(F)F)C)C)C,-1.9191
+CC(CC(c1sccc1NC(=O)c1cn(nc1C(F)(F)F)C)C)C,-2.0
+ClC(=CC1C(C1(C)C)C(=O)OCc1cccc(c1)Oc1ccccc1)Cl,-1.3979
+ClC(=CC1C(C1(C)C)C(=O)OCc1cccc(c1)Oc1ccccc1)Cl,-1.699
+COP(=S)(SCN1C(=O)c2c(C1=O)cccc2)OC,-0.60206
+COP(=S)(SCN1C(=O)c2c(C1=O)cccc2)OC,-0.25527
+OC(=O)c1nc(Cl)c(c(c1Cl)N)Cl,-2.301
+OC(=O)c1nc(Cl)c(c(c1Cl)N)Cl,-2.3979
+CO/C=C(\c1ccccc1COc1cccc(n1)C(F)(F)F)/C(=O)OC,-1.6628
+CO/C=C(\c1ccccc1COc1cccc(n1)C(F)(F)F)/C(=O)OC,-2.2095
+CCCOC(=O)NCCCN(C)C.Cl,-2.8338
+CCCOC(=O)NCCCN(C)C.Cl,-2.1761
+CCCOC(=O)NCCCN(C)C.Cl,-2.0569
+O=C(C(Oc1ccc(cc1)Oc1cnc2c(n1)ccc(c2)Cl)C)OCCON=C(C)C,-0.69897
+O=C(C(Oc1ccc(cc1)Oc1cnc2c(n1)ccc(c2)Cl)C)OCCON=C(C)C,-1.3979
+C#CCOS(=O)OC1CCCCC1Oc1ccc(cc1)C(C)(C)C,-1.1761
+C#CCOS(=O)OC1CCCCC1Oc1ccc(cc1)C(C)(C)C,-1.3838
+C#CCOS(=O)OC1CCCCC1Oc1ccc(cc1)C(C)(C)C,-1.4014
+ClCC(=O)N(c1c(C)cccc1CC)COC(C)C,-2.301
+ClCC(=O)N(c1c(C)cccc1CC)COC(C)C,-1.7993
+S=C1NCCCN1,-0.69897
+S=C1NCCCN1,-0.74819
+Clc1ccccc1CC(C1(Cl)CC1)(Cn1nc[nH]c1=S)O,-1.699
+Clc1ccccc1CC(C1(Cl)CC1)(Cn1nc[nH]c1=S)O,-2.8751
+CCOC(=O)c1cn2nc(cc2nc1C)OP(=S)(OCC)OCC,-1.0414
+CCOC(=O)c1cn2nc(cc2nc1C)OP(=S)(OCC)OCC,-0.60206
+COc1c(OC)cc(c(c1OC)C(=O)c1c(OC)ncc(c1C)Cl)C,-1.6325
+COc1c(OC)cc(c(c1OC)C(=O)c1c(OC)ncc(c1C)Cl)C,-1.5611
+O=C(CC(C)(C)C)OC1=C(C(=O)OC21CCCC2)c1c(C)cc(cc1C)C,-1.2014
+O=C(CC(C)(C)C)OC1=C(C(=O)OC21CCCC2)c1c(C)cc(cc1C)C,-1.1703
+O=C1CCCC(=O)C1C(=O)c1ccc(cc1Cl)S(=O)(=O)C,-1.8573
+O=C1CCCC(=O)C1C(=O)c1ccc(cc1Cl)S(=O)(=O)C,0.39794
+O=C(NC(=O)c1c(F)cccc1F)Nc1cc(Cl)c(c(c1F)Cl)F,-1.3945
+O=C(NC(=O)c1c(F)cccc1F)Nc1cc(Cl)c(c(c1F)Cl)F,-2.0881
+CCNc1nc(Cl)nc(n1)NC(C)(C)C,-0.093422
+CCNc1nc(Cl)nc(n1)NC(C)(C)C,-0.2014
+CCNc1nc(Cl)nc(n1)NC(C)(C)C,-0.23045
+CS/C(=N/OC(=O)N(SN(C(=O)O/N=C(/SC)\C)C)C)/C,-0.69897
+CS/C(=N/OC(=O)N(SN(C(=O)O/N=C(/SC)\C)C)C)/C,-1.0
+CS/C(=N/OC(=O)N(SN(C(=O)O/N=C(/SC)\C)C)C)/C,-1.0792
+COC(=O)NC(=S)Nc1ccccc1NC(=S)NC(=O)OC,-1.5051
+COC(=O)NC(=S)Nc1ccccc1NC(=S)NC(=O)OC,-1.7243
+CN(C(=S)SSC(=S)N(C)C)C,-1.0792
+CN(C(=S)SSC(=S)N(C)C)C,-0.86332
+Cc1ccc(cc1)N(S(=O)(=O)N(C)C)SC(Cl)(Cl)F,-2.0
+Cc1ccc(cc1)N(S(=O)(=O)N(C)C)SC(Cl)(Cl)F,-1.9542
+O=C(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,-2.0569
+O=C(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,-1.3856
+CC(N(C(=O)SCC(=C(Cl)Cl)Cl)C(C)C)C,-1.1139
+CC(N(C(=O)SCC(=C(Cl)Cl)Cl)C(C)C)C,-0.9345
+CCCN(c1c(cc(cc1[N+](=O)[O-])C(F)(F)F)[N+](=O)[O-])CCC,-1.4771
+CCCN(c1c(cc(cc1[N+](=O)[O-])C(F)(F)F)[N+](=O)[O-])CCC,-2.6812
+O=CNC(C(Cl)(Cl)Cl)N1CCN(CC1)C(C(Cl)(Cl)Cl)NC=O,-2.0
+O=CNC(C(Cl)(Cl)Cl)N1CCN(CC1)C(C(Cl)(Cl)Cl)NC=O,-2.2041
+C=CC1(C)OC(=O)N(C1=O)c1cc(Cl)cc(c1)Cl,-0.90309
+C=CC1(C)OC(=O)N(C1=O)c1cc(Cl)cc(c1)Cl,-0.43136
+CN(C(=S)S[Zn]SC(=S)N(C)C)C,-1.1139
+CN(C(=S)S[Zn]SC(=S)N(C)C)C,-0.39794
diff --git a/loael.Rmd b/loael.Rmd
index dda3e38..b01e1e3 100644
--- a/loael.Rmd
+++ b/loael.Rmd
@@ -90,8 +90,9 @@ Elena: please check if this is publication strategy is ok for the Swiss Federal
Materials and Methods
=====================
-The following sections give a high level overview about
-algorithms and datasets used for this study. In order to provide unambiguous references to algorithms and datasets, links to source code and data sources are included in the text.
+The following sections give a high-level overview of the algorithms and datasets
+used for this study. In order to provide unambiguous references to algorithms
+and datasets, links to source code and data sources are included in the text.
Datasets
--------
@@ -253,12 +254,18 @@ weighted by its similarity to the query compound. In this case the prediction is
The applicability domain (AD) of lazar models is determined by the structural
diversity of the training data. If no similar compounds are found in the
-training data no predictions will be generated. Warnings are issued if the similarity threshold has to be lowered from 0.5 to 0.2 in order to enable predictions and if lazar has to resort to weighted average predictions, because local random forests fail. Thus predictions without warnings can be considered as close to the applicability domain and predictions with warnings as more distant from the applicability domain. Quantitative applicability domain information can be obtained from the similarities of individual neighbors.
+training data no predictions will be generated. Warnings are issued if the
+similarity threshold has to be lowered from 0.5 to 0.2 in order to enable
+predictions and if lazar has to resort to weighted average predictions, because
+local random forests fail. Thus predictions without warnings can be considered
+as close to the applicability domain and predictions with warnings as more
+distant from the applicability domain. Quantitative applicability domain
+information can be obtained from the similarities of individual neighbors.
Local regression models consider neighbor similarities to the query compound,
-by weighting the contribution of each neighbor is by its similarity.
-The variability of local model predictions is reflected in the
-95\% prediction interval associated with each prediction.
+by weighting the contribution of each neighbor by its similarity. The
+variability of local model predictions is reflected in the 95\% prediction
+interval associated with each prediction.
### Validation
@@ -325,12 +332,12 @@ This result was confirmed with a visual inspection using the
[CheS-Mapper](http://ches-mapper.org) (Chemical Space Mapping and
Visualization in 3D, @Guetlein2012)
tool.
-CheS-Mapper can be used to analyze the relationship between the
-structure of chemical compounds, their physico-chemical properties, and
-biological or toxic effects. It depicts closely related (similar) compounds in 3D space and can be used with different kinds of features.
-We have investigated structural as well as physico-chemical properties and
-concluded that both datasets are very similar, both in terms of
-chemical structures and physico-chemical properties.
+CheS-Mapper can be used to analyze the relationship between the structure of
+chemical compounds, their physico-chemical properties, and biological or toxic
+effects. It depicts closely related (similar) compounds in 3D space and can be
+used with different kinds of features. We have investigated structural as well
+as physico-chemical properties and concluded that both datasets are very
+similar, both in terms of chemical structures and physico-chemical properties.
The only statistically significant difference between the two datasets is that the Mazzatorta dataset contains more small compounds (61 structures with fewer than 11 atoms) than the Swiss dataset (19 small structures, p-value 3.7E-7).
@@ -358,11 +365,13 @@ MolPrint2D features that are utilized for model building in this work.
### Experimental variability versus prediction uncertainty
-Duplicated LOAEL values can be found in both datasets and there is a
-substantial number of `r length(unique(t$SMILES))` compounds occurring in both
-datasets. These duplicates allow us to estimate the variability of
-experimental results within individual datasets and between datasets.
-Data with *identical* values (at five significant digits) in both datasets were excluded from variability analysis, because it it likely that they originate from the same experiments.
+Duplicated LOAEL values can be found in both datasets and there is
+a substantial number of `r length(unique(t$SMILES))` compounds occurring in
+both datasets. These duplicates allow us to estimate the variability of
+experimental results within individual datasets and between datasets. Data with
+*identical* values (at five significant digits) in both datasets were excluded
+from variability analysis, because it is likely that they originate from the
+same experiments.
##### Intra dataset variability
@@ -385,21 +394,40 @@ c.dup$sd <- ave(c.dup$LOAEL,c.dup$SMILES,FUN=sd)
t$sd <- ave(t$LOAEL,t$SMILES,FUN=sd)
p = t.test(m.dup$sd,s.dup$sd)$p.value
+
+m.mg = read.csv("data/mazzatorta_mg_dup.csv",header=T)
+m.mg$sd <- ave(m.mg$LOAEL,m.mg$SMILES,FUN=sd)
+
+s.mg = read.csv("data/swiss_mg_dup.csv",header=T)
+s.mg$sd <- ave(s.mg$LOAEL,s.mg$SMILES,FUN=sd)
+
+c.mg = read.csv("data/all_mg_dup.csv",header=T)
+c.mg$sd <- ave(c.mg$LOAEL,c.mg$SMILES,FUN=sd)
```
The Mazzatorta dataset has `r length(m$SMILES)` LOAEL values for
`r length(levels(m$SMILES))` unique structures, `r m.dupnr`
-compounds have multiple measurements with a mean standard deviation of
-`r round(mean(10^(-1*m.dup$sd)),2)` mmol/kg_bw/day (`r round(mean(m.dup$sd),2)` log10 units @mazzatorta08, [@fig:intra]).
+compounds have multiple measurements with a mean standard deviation (-log10 transformed values) of
+`r round(mean(m.dup$sd),2)`
+(`r round(mean(10^(-1*m.mg$sd)),2)` mg/kg_bw/day,
+`r round(mean(10^(-1*m.dup$sd)),2)` mmol/kg_bw/day)
+(@mazzatorta08, [@fig:intra]).
The Swiss Federal Office dataset has `r length(s$SMILES)` rat LOAEL values for
`r length(levels(s$SMILES))` unique structures, `r s.dupnr` compounds have
-multiple measurements with a mean standard deviation of
-`r round(mean(10^(-1*s.dup$sd)),2)` mmol/kg_bw/day (`r round(mean(s.dup$sd),2)` log10 units).
+multiple measurements with a mean standard deviation (-log10 transformed values) of
+`r round(mean(s.dup$sd),2)`
+(`r round(mean(10^(-1*s.mg$sd)),2)` mg/kg_bw/day,
+`r round(mean(10^(-1*s.dup$sd)),2)` mmol/kg_bw/day)
+([@fig:intra]).
Standard deviations of both datasets do not show
a statistically significant difference with a p-value (t-test) of `r round(p,2)`.
-The combined test set has a mean standard deviation of `r round(mean(10^(-1*c.dup$sd)),2)` mmol/kg_bw/day (`r round(mean(c.dup$sd),2)` log10 units).
+The combined test set has a mean standard deviation (-log10 transformed values) of
+`r round(mean(c.dup$sd),2)`
+(`r round(mean(10^(-1*c.mg$sd)),2)` mg/kg_bw/day,
+`r round(mean(10^(-1*c.dup$sd)),2)` mmol/kg_bw/day)
+([@fig:intra]).
![Distribution and variability of LOAEL values in both datasets. Each vertical line represents a compound, dots are individual LOAEL values.](figures/dataset-variability.pdf){#fig:intra}
diff --git a/loael.md b/loael.md
index 7202dbc..c2bfb5b 100644
--- a/loael.md
+++ b/loael.md
@@ -82,8 +82,9 @@ Elena: please check if this is publication strategy is ok for the Swiss Federal
Materials and Methods
=====================
-The following sections give a high level overview about
-algorithms and datasets used for this study. In order to provide unambiguous references to algorithms and datasets, links to source code and data sources are included in the text.
+The following sections give a high level overview about algorithms and datasets
+used for this study. In order to provide unambiguous references to algorithms
+and datasets, links to source code and data sources are included in the text.
Datasets
--------
@@ -245,12 +246,18 @@ weighted by its similarity to the query compound. In this case the prediction is
The applicability domain (AD) of lazar models is determined by the structural
diversity of the training data. If no similar compounds are found in the
-training data no predictions will be generated. Warnings are issued if the similarity threshold has to be lowered from 0.5 to 0.2 in order to enable predictions and if lazar has to resort to weighted average predictions, because local random forests fail. Thus predictions without warnings can be considered as close to the applicability domain and predictions with warnings as more distant from the applicability domain. Quantitative applicability domain information can be obtained from the similarities of individual neighbors.
+training data no predictions will be generated. Warnings are issued if the
+similarity threshold has to be lowered from 0.5 to 0.2 in order to enable
+predictions and if lazar has to resort to weighted average predictions, because
+local random forests fail. Thus predictions without warnings can be considered
+as close to the applicability domain and predictions with warnings as more
+distant from the applicability domain. Quantitative applicability domain
+information can be obtained from the similarities of individual neighbors.
Local regression models consider neighbor similarities to the query compound,
-by weighting the contribution of each neighbor is by its similarity.
-The variability of local model predictions is reflected in the
-95\% prediction interval associated with each prediction.
+by weighting the contribution of each neighbor by its similarity. The
+variability of local model predictions is reflected in the 95\% prediction
+interval associated with each prediction.
### Validation
@@ -315,12 +322,12 @@ This result was confirmed with a visual inspection using the
[CheS-Mapper](http://ches-mapper.org) (Chemical Space Mapping and
Visualization in 3D, @Guetlein2012)
tool.
-CheS-Mapper can be used to analyze the relationship between the
-structure of chemical compounds, their physico-chemical properties, and
-biological or toxic effects. It depicts closely related (similar) compounds in 3D space and can be used with different kinds of features.
-We have investigated structural as well as physico-chemical properties and
-concluded that both datasets are very similar, both in terms of
-chemical structures and physico-chemical properties.
+CheS-Mapper can be used to analyze the relationship between the structure of
+chemical compounds, their physico-chemical properties, and biological or toxic
+effects. It depicts closely related (similar) compounds in 3D space and can be
+used with different kinds of features. We have investigated structural as well
+as physico-chemical properties and concluded that both datasets are very
+similar, both in terms of chemical structures and physico-chemical properties.
The only statistically significant difference between the two datasets is that the Mazzatorta dataset contains more small compounds (61 structures with fewer than 11 atoms) than the Swiss dataset (19 small structures, p-value 3.7E-7).
@@ -348,11 +355,13 @@ MolPrint2D features that are utilized for model building in this work.
### Experimental variability versus prediction uncertainty
-Duplicated LOAEL values can be found in both datasets and there is a
-substantial number of 155 compounds occurring in both
-datasets. These duplicates allow us to estimate the variability of
-experimental results within individual datasets and between datasets.
-Data with *identical* values (at five significant digits) in both datasets were excluded from variability analysis, because it it likely that they originate from the same experiments.
+Duplicated LOAEL values can be found in both datasets and there is
+a substantial number of 155 compounds occurring in
+both datasets. These duplicates allow us to estimate the variability of
+experimental results within individual datasets and between datasets. Data with
+*identical* values (at five significant digits) in both datasets were excluded
+from variability analysis, because it is likely that they originate from the
+same experiments.
##### Intra dataset variability
@@ -360,17 +369,27 @@ Data with *identical* values (at five significant digits) in both datasets were
The Mazzatorta dataset has 567 LOAEL values for
445 unique structures, 93
-compounds have multiple measurements with a mean standard deviation of
-0.56 mmol/kg_bw/day (0.32 log10 units @mazzatorta08, [@fig:intra]).
+compounds have multiple measurements with a mean standard deviation (-log10 transformed values) of
+0.32
+(0.56 mg/kg_bw/day,
+0.56 mmol/kg_bw/day)
+(@mazzatorta08, [@fig:intra]).
The Swiss Federal Office dataset has 493 rat LOAEL values for
381 unique structures, 91 compounds have
-multiple measurements with a mean standard deviation of
-0.59 mmol/kg_bw/day (0.29 log10 units).
+multiple measurements with a mean standard deviation (-log10 transformed values) of
+0.29
+(0.57 mg/kg_bw/day,
+0.59 mmol/kg_bw/day)
+([@fig:intra]).
Standard deviations of both datasets do not show
a statistically significant difference with a p-value (t-test) of 0.21.
-The combined test set has a mean standard deviation of 0.55 mmol/kg_bw/day (0.33 log10 units).
+The combined test set has a mean standard deviation (-log10 transformed values) of
+0.33
+(0.56 mg/kg_bw/day,
+0.55 mmol/kg_bw/day)
+([@fig:intra]).
![Distribution and variability of LOAEL values in both datasets. Each vertical line represents a compound, dots are individual LOAEL values.](figures/dataset-variability.pdf){#fig:intra}
diff --git a/loael.pdf b/loael.pdf
index ebf09fd..4aa0543 100644
--- a/loael.pdf
+++ b/loael.pdf
Binary files differ
diff --git a/scripts/all_mg_dup.rb b/scripts/all_mg_dup.rb
new file mode 100755
index 0000000..48323d7
--- /dev/null
+++ b/scripts/all_mg_dup.rb
@@ -0,0 +1,35 @@
+#!/usr/bin/env ruby
+require_relative '../../lazar/lib/lazar'
+include OpenTox
+csv_in = CSV.read("data/NOAEL-LOAEL_SMILES_rat_chron.csv", :encoding => 'windows-1251:utf-8')
+head = csv_in.shift
+data = {}
+csv_in.each do |line|
+ smi = line[11]
+ mg = line[19].to_f
+ unless mg.to_f == 0.0
+ c = Compound.from_smiles smi
+ data[c.smiles] ||= []
+ data[c.smiles] << -Math.log10(mg).signif(5)
+ end
+end
+csv_in = CSV.read("data/LOAEL_mg_corrected_smiles_mmol.csv", :encoding => 'windows-1251:utf-8')
+head = csv_in.shift
+data = {}
+csv_in.each do |line|
+ c = Compound.from_smiles line[0]
+ mmol = line[1].to_f
+ data[c.smiles] ||= []
+ data[c.smiles] << -Math.log10(c.mmol_to_mg(mmol)).signif(5)
+end
+File.open(File.join("data","all_mg_dup.csv"),"w+") do |f|
+ f.puts ["SMILES","LOAEL"].join ","
+ data.each do |smi,values|
+ values.uniq!
+ if values.size > 1
+ values.each do |v|
+ f.puts "#{smi},#{v}"
+ end
+ end
+ end
+end
diff --git a/scripts/mazzatorta-unique-smiles.rb b/scripts/mazzatorta-unique-smiles.rb
index 306c107..4d00fad 100755
--- a/scripts/mazzatorta-unique-smiles.rb
+++ b/scripts/mazzatorta-unique-smiles.rb
@@ -5,7 +5,6 @@ include OpenTox
csv_in = CSV.read(ARGV[0], :encoding => 'windows-1251:utf-8')
head = csv_in.shift
data = []
-data = []
csv_in.each do |line|
c = Compound.from_smiles line[0]
# round to 5 significant digits in order to detect duplicates
diff --git a/scripts/mazzatorta_mg_dup.rb b/scripts/mazzatorta_mg_dup.rb
new file mode 100755
index 0000000..7ca6d79
--- /dev/null
+++ b/scripts/mazzatorta_mg_dup.rb
@@ -0,0 +1,24 @@
+#!/usr/bin/env ruby
+require_relative '../../lazar/lib/lazar'
+include OpenTox
+
+csv_in = CSV.read(ARGV[0], :encoding => 'windows-1251:utf-8')
+head = csv_in.shift
+data = {}
+csv_in.each do |line|
+ c = Compound.from_smiles line[0]
+ mmol = line[1].to_f
+ data[c] ||= []
+ data[c] << -Math.log10(c.mmol_to_mg(mmol)).signif(5)
+end
+File.open(File.join("data","mazzatorta_mg_dup.csv"),"w+") do |f|
+ f.puts ["SMILES","LOAEL"].join ","
+ data.each do |c,values|
+ values.uniq!
+ if values.size > 1
+ values.each do |v|
+ f.puts "#{c.smiles},#{v}"
+ end
+ end
+ end
+end
diff --git a/scripts/noael_loael2swiss_mg_dup.rb b/scripts/noael_loael2swiss_mg_dup.rb
new file mode 100755
index 0000000..2699719
--- /dev/null
+++ b/scripts/noael_loael2swiss_mg_dup.rb
@@ -0,0 +1,26 @@
+#!/usr/bin/env ruby
+require_relative '../../lazar/lib/lazar'
+include OpenTox
+csv_in = CSV.read(ARGV[0], :encoding => 'windows-1251:utf-8')
+head = csv_in.shift
+data = {}
+csv_in.each do |line|
+ smi = line[11]
+ mg = line[19].to_f
+ unless mg.to_f == 0.0
+ c = Compound.from_smiles smi
+ data[c.smiles] ||= []
+ data[c.smiles] << -Math.log10(mg).signif(5)
+ end
+end
+File.open(File.join("data","swiss_mg_dup.csv"),"w+") do |f|
+ f.puts ["SMILES","LOAEL"].join ","
+ data.each do |smi,values|
+ values.uniq!
+ if values.size > 1
+ values.each do |v|
+ f.puts "#{smi},#{v}"
+ end
+ end
+ end
+end