From 8feada761cf87575ce037b5b8339691a7e9ae238 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 14 Apr 2017 12:25:45 +0200 Subject: inter dataset sd in mg units --- Makefile | 11 +- data/all_mg_dup.csv | 215 ++++++++++++++++++++++++++++++++++++ data/mazzatorta_mg_dup.csv | 215 ++++++++++++++++++++++++++++++++++++ data/swiss_mg_dup.csv | 194 ++++++++++++++++++++++++++++++++ loael.Rmd | 72 ++++++++---- loael.md | 63 +++++++---- loael.pdf | Bin 348755 -> 348856 bytes scripts/all_mg_dup.rb | 35 ++++++ scripts/mazzatorta-unique-smiles.rb | 1 - scripts/mazzatorta_mg_dup.rb | 24 ++++ scripts/noael_loael2swiss_mg_dup.rb | 26 +++++ 11 files changed, 810 insertions(+), 46 deletions(-) create mode 100644 data/all_mg_dup.csv create mode 100644 data/mazzatorta_mg_dup.csv create mode 100644 data/swiss_mg_dup.csv create mode 100755 scripts/all_mg_dup.rb create mode 100755 scripts/mazzatorta_mg_dup.rb create mode 100755 scripts/noael_loael2swiss_mg_dup.rb diff --git a/Makefile b/Makefile index 1781669..111fd7b 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Variables -datasets = data/median-correlation.csv data/test_log10.csv data/training_log10.csv data/mazzatorta_log10.csv data/swiss_log10.csv +datasets = data/median-correlation.csv data/test_log10.csv data/training_log10.csv data/mazzatorta_log10.csv data/swiss_log10.csv data/swiss_mg_dup.csv data/mazzatorta_mg_dup.csv data/all_mg_dup.csv crossvalidations = data/training_log10-cv-0.csv data/training_log10-cv-1.csv data/training_log10-cv-2.csv validations = data/training-test-predictions.csv $(crossvalidations) data/misclassifications.csv figures = figures/functional-groups.pdf figures/test-prediction.pdf figures/prediction-test-correlation.pdf figures/dataset-variability.pdf figures/median-correlation.pdf figures/crossvalidation0.pdf figures/crossvalidation1.pdf figures/crossvalidation2.pdf @@ -105,9 +105,18 @@ data/swiss_log10.csv: data/swiss.csv data/mazzatorta.csv: data/LOAEL_mg_corrected_smiles_mmol.csv 
scripts/mazzatorta-unique-smiles.rb data/LOAEL_mg_corrected_smiles_mmol.csv +data/mazzatorta_mg_dup.csv: data/LOAEL_mg_corrected_smiles_mmol.csv + scripts/mazzatorta_mg_dup.rb data/LOAEL_mg_corrected_smiles_mmol.csv + data/swiss.csv: data/NOAEL-LOAEL_SMILES_rat_chron.csv scripts/noael_loael2mmol.rb data/NOAEL-LOAEL_SMILES_rat_chron.csv +data/swiss_mg_dup.csv: data/NOAEL-LOAEL_SMILES_rat_chron.csv + scripts/noael_loael2swiss_mg_dup.rb data/NOAEL-LOAEL_SMILES_rat_chron.csv + +data/all_mg_dup.csv: data/NOAEL-LOAEL_SMILES_rat_chron.csv data/LOAEL_mg_corrected_smiles_mmol.csv + scripts/all_mg_dup.rb + clean: rm figures/*pdf cd data && rm `ls -I "*LOAEL*" -I "*functional*" -I "*SMARTS*"` diff --git a/data/all_mg_dup.csv b/data/all_mg_dup.csv new file mode 100644 index 0000000..3e938ed --- /dev/null +++ b/data/all_mg_dup.csv @@ -0,0 +1,215 @@ +SMILES,LOAEL +OCC(C1OC(=O)C(=C1O)O)O,-3.4844 +OCC(C1OC(=O)C(=C1O)O)O,-3.1915 +CC(c1ccccc1)C,-2.6646 +CC(c1ccccc1)C,-2.5198 +CCc1ccccc1,-2.6107 +CCc1ccccc1,-2.4639 +OCCO,-2.3979 +OCCO,-2.6021 +OCCO,-3.301 +OCCO,-2.9638 +OCCO,-3.0 +C=Cc1ccccc1,-1.3222 +C=Cc1ccccc1,-2.4548 +C=Cc1ccccc1,-2.6021 +ClC(C(c1ccc(cc1)Cl)c1ccc(cc1)Cl)(Cl)Cl,-1.5051 +ClC(C(c1ccc(cc1)Cl)c1ccc(cc1)Cl)(Cl)Cl,0.60206 +COCN(c1c(CC)cccc1CC)C(=O)CCl,-1.1761 +COCN(c1c(CC)cccc1CC)C(=O)CCl,-1.1461 +COP(=S)(SCn1nnc2c(c1=O)cccc2)OC,0.4437 +COP(=S)(SCn1nnc2c(c1=O)cccc2)OC,-0.35218 +COP(=S)(SCn1nnc2c(c1=O)cccc2)OC,-0.41162 +O=C(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,-1.3979 +O=C(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,-2.0569 +ClC(C(SN1C(=O)C2C(C1=O)CC=CC2)(Cl)Cl)Cl,-1.0792 +ClC(C(SN1C(=O)C2C(C1=O)CC=CC2)(Cl)Cl)Cl,-1.1761 +CNC(=O)Oc1cccc2c1cccc2,-1.1931 +CNC(=O)Oc1cccc2c1cccc2,-1.7782 +ClC1CC2C(C1Cl)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,0.56864 +ClC1CC2C(C1Cl)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,0.30103 +ClC1CC2C(C1Cl)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,-0.77815 +ClC1CC2C(C1Cl)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,0.60206 +Nc1ccc(cc1)Cl,-0.77815 +Nc1ccc(cc1)Cl,-1.0969 
+CCOP(=S)(Oc1ccc2c(c1)oc(=O)c(c2C)Cl)OCC,0.09691 +CCOP(=S)(Oc1ccc2c(c1)oc(=O)c(c2C)Cl)OCC,-0.23045 +COC(=O)c1c(Cl)c(Cl)c(c(c1Cl)Cl)C(=O)OC,-2.699 +COC(=O)c1c(Cl)c(Cl)c(c(c1Cl)Cl)C(=O)OC,-1.0 +OC(=O)C(Cl)(Cl)C,-1.4498 +OC(=O)C(Cl)(Cl)C,-1.699 +ClCCl,-1.699 +ClCCl,-1.7208 +COP(=O)(OC=C(Cl)Cl)OC,-0.36173 +COP(=O)(OC=C(Cl)Cl)OC,-0.33244 +OC(C(Cl)(Cl)Cl)(c1ccc(cc1)Cl)c1ccc(cc1)Cl,-1.301 +OC(C(Cl)(Cl)Cl)(c1ccc(cc1)Cl)c1ccc(cc1)Cl,-0.39794 +ClC1=C(Cl)C2(C(C1(Cl)C1C2C2CC1C1C2O1)(Cl)Cl)Cl,1.301 +ClC1=C(Cl)C2(C(C1(Cl)C1C2C2CC1C1C2O1)(Cl)Cl)Cl,0.90309 +ClC1=C(Cl)C2(C(C1(Cl)C1C2C2CC1C1C2O1)(Cl)Cl)Cl,1.0 +ClC1=C(Cl)C2(C(C1(Cl)C1C2C2CC1C1C2O1)(Cl)Cl)Cl,0.60206 +O=C(NC(=O)c1c(F)cccc1F)Nc1ccc(cc1)Cl,-0.90309 +O=C(NC(=O)c1c(F)cccc1F)Nc1ccc(cc1)Cl,-0.89209 +O=C(NC(=O)c1c(F)cccc1F)Nc1ccc(cc1)Cl,-0.8451 +CNC(=O)CSP(=S)(OC)OC,0.60206 +CNC(=O)CSP(=S)(OC)OC,-0.69897 +c1ccc(cc1)Nc1ccccc1,-1.4914 +c1ccc(cc1)Nc1ccccc1,-1.3979 +c1ccn2c(c1)c1ccccn1CC2,0.23657 +c1ccn2c(c1)c1ccccn1CC2,0.24413 +CCSCCSP(=S)(OCC)OCC,1.0 +CCSCCSP(=S)(OCC)OCC,1.3979 +CCSCCSP(=S)(OCC)OCC,0.65758 +O=C(N(C)C)Nc1ccc(c(c1)Cl)Cl,-0.77815 +O=C(N(C)C)Nc1ccc(c(c1)Cl)Cl,-0.79588 +ClCCP(=O)(O)O,-2.1761 +ClCCP(=O)(O)O,-2.6493 +ClCCP(=O)(O)O,-1.0792 +Cn1cc(c2cccc(c2)C(F)(F)F)c(=O)c(c1)c1ccccc1,-1.3979 +Cn1cc(c2cccc(c2)C(F)(F)F)c(=O)c(c1)c1ccccc1,-1.5119 +CCP(=S)(Sc1ccccc1)OCC,-0.19866 +CCP(=S)(Sc1ccccc1)OCC,-0.69897 +ClC1C(Cl)C(Cl)C(C(C1Cl)Cl)Cl,-0.69897 +ClC1C(Cl)C(Cl)C(C(C1Cl)Cl)Cl,-0.66276 +ClC1C(Cl)C(Cl)C(C(C1Cl)Cl)Cl,-0.6721 +ClC1C(Cl)C(Cl)C(C(C1Cl)Cl)Cl,-0.90309 +ClC1C(Cl)C(Cl)C(C(C1Cl)Cl)Cl,-0.60206 +C=CCOC(c1ccc(cc1Cl)Cl)Cn1cncc1,-1.6021 +C=CCOC(c1ccc(cc1Cl)Cl)Cn1cncc1,-1.1761 +CCOC(=O)CC(C(=O)OCC)SP(=S)(OC)OC,-1.699 +CCOC(=O)CC(C(=O)OCC)SP(=S)(OC)OC,-2.5563 +COc1sc(=O)n(n1)CSP(=S)(OC)OC,-0.30103 +COc1sc(=O)n(n1)CSP(=S)(OC)OC,0.09691 +COc1sc(=O)n(n1)CSP(=S)(OC)OC,-0.09691 +COc1sc(=O)n(n1)CSP(=S)(OC)OC,-0.20412 +CNC(=O)ON=C(SC)C,-1.0 +CNC(=O)ON=C(SC)C,-1.301 +CCOP(=S)(Oc1ccc(cc1)N(=O)=O)OCC,-0.54407 
+CCOP(=S)(Oc1ccc(cc1)N(=O)=O)OCC,0.37675 +COP(=S)(SCN1C(=O)c2c(C1=O)cccc2)OC,-1.301 +COP(=S)(SCN1C(=O)c2c(C1=O)cccc2)OC,-0.95424 +CCN(C(=O)C(=C(OP(=O)(OC)OC)C)Cl)CC,-1.0899 +CCN(C(=O)C(=C(OP(=O)(OC)OC)C)Cl)CC,-0.17609 +CCN(c1nc(cc(n1)C)OP(=S)(OC)OC)CC,-0.39794 +CCN(c1nc(cc(n1)C)OP(=S)(OC)OC)CC,-1.1399 +CCC(=O)Nc1ccc(c(c1)Cl)Cl,-1.301 +CCC(=O)Nc1ccc(c(c1)Cl)Cl,-1.8808 +CCCC1COC(O1)(Cn1cncn1)c1ccc(cc1Cl)Cl,-1.3979 +CCCC1COC(O1)(Cn1cncn1)c1ccc(cc1Cl)Cl,-1.9823 +CCNc1nc(NCC)nc(n1)Cl,-0.69897 +CCNc1nc(NCC)nc(n1)Cl,-0.72428 +ClC=C(c1cc(Cl)c(cc1Cl)Cl)OP(=O)(OC)OC,-2.0 +ClC=C(c1cc(Cl)c(cc1Cl)Cl)OP(=O)(OC)OC,-1.699 +CN(C(=S)SSC(=S)N(C)C)C,-1.1761 +CN(C(=S)SSC(=S)N(C)C)C,-1.0607 +CN(C(=S)SSC(=S)N(C)C)C,-0.73799 +CN(C(=S)SSC(=S)N(C)C)C,-1.0792 +CC(N(c1c(cc(cc1N(=O)=O)C(F)(F)F)N(=O)=O)C(C)C)C,-1.6021 +CC(N(c1c(cc(cc1N(=O)=O)C(F)(F)F)N(=O)=O)C(C)C)C,-3.0 +COP(=O)(SC)N,0.045757 +COP(=O)(SC)N,1.0 +COP(=O)(SC)N,0.5376 +COP(=O)(NC(=O)C)SC,-1.5441 +COP(=O)(NC(=O)C)SC,-0.39794 +CCOP(=S)(Oc1cc(C)nc(n1)C(C)C)OCC,-0.17609 +CCOP(=S)(Oc1cc(C)nc(n1)C(C)C)OCC,-0.76343 +NC1CCCCC1,-1.7672 +NC1CCCCC1,-1.7782 +OC(=O)CNCP(=O)(O)O,-3.0 +OC(=O)CNCP(=O)(O)O,-2.4771 +S=C1NCCN1,0.63827 +S=C1NCCN1,0.60206 +S=C1NCCN1,-0.09691 +CON(C(=O)Nc1ccc(c(c1)Cl)Cl)C,-0.79588 +CON(C(=O)Nc1ccc(c(c1)Cl)Cl)C,-0.39794 +CCOP(=S)(Oc1nc(Cl)c(cc1Cl)Cl)OCC,-1.0 +CCOP(=S)(Oc1nc(Cl)c(cc1Cl)Cl)OCC,-1.2388e-09 +CC(OC(=O)Nc1cccc(c1)Cl)C,-2.699 +CC(OC(=O)Nc1cccc(c1)Cl)C,-3.0 +CCCCOCCOCCOCc1cc2OCOc2cc1CCC,-2.3979 +CCCCOCCOCCOCc1cc2OCOc2cc1CCC,-2.0 +N#CC(c1ccc(c(c1)Oc1ccccc1)F)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.0969 +N#CC(c1ccc(c(c1)Oc1ccccc1)F)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-0.87506 +N#CC(c1ccc(c(c1)Oc1ccccc1)F)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-0.77815 +N#CC(c1ccc(c(c1)Oc1ccccc1)F)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.0792 +ClC1C=CC2C1C1(Cl)C(=C(C2(C1(Cl)Cl)Cl)Cl)Cl,0.60206 +ClC1C=CC2C1C1(Cl)C(=C(C2(C1(Cl)Cl)Cl)Cl)Cl,0.42022 +CCOP(=S)(SCSC(C)(C)C)OCC,1.301 +CCOP(=S)(SCSC(C)(C)C)OCC,-0.30103 +CCOP(=S)(SCSC(C)(C)C)OCC,1.2218 
+CN(C=Nc1ccc(cc1C)C)C=Nc1ccc(cc1C)C,-1.0086 +CN(C=Nc1ccc(cc1C)C)C=Nc1ccc(cc1C)C,-1.0 +CC(N1C(=O)c2ccccc2NS1(=O)=O)C,-1.6021 +CC(N1C(=O)c2ccccc2NS1(=O)=O)C,-1.5441 +CCSC(CC1CC(=O)C(=C(NOCC=CCl)CC)C(=O)C1)C,-2.0 +CCSC(CC1CC(=O)C(=C(NOCC=CCl)CC)C(=O)C1)C,-1.9345 +CCCN(C(=O)SCC)CCC,-0.95424 +CCCN(C(=O)SCC)CCC,-1.3979 +CSc1ccc(cc1C)OP(=S)(OC)OC,-0.57403 +CSc1ccc(cc1C)OP(=S)(OC)OC,0.14267 +CSc1ccc(cc1C)OP(=S)(OC)OC,0.34679 +c1scc(n1)c1nc2c([nH]1)cccc2,-0.30103 +c1scc(n1)c1nc2c([nH]1)cccc2,-1.6021 +c1scc(n1)c1nc2c([nH]1)cccc2,-1.4771 +Clc1c(Cl)c([N+](=O)[O-])c(c(c1Cl)Cl)Cl,-2.0969 +Clc1c(Cl)c([N+](=O)[O-])c(c(c1Cl)Cl)Cl,-2.1461 +Nc1ncn[nH]1,-0.39794 +Nc1ncn[nH]1,-0.69897 +O=C(NC(=O)c1c(F)cccc1F)Nc1cc(Cl)c(c(c1F)Cl)F,-1.415 +O=C(NC(=O)c1c(F)cccc1F)Nc1cc(Cl)c(c(c1F)Cl)F,-1.3979 +C=CCC1=C(C)C(CC1=O)OC(=O)C1C(C1(C)C)C=C(C)C,-1.8751 +C=CCC1=C(C)C(CC1=O)OC(=O)C1C(C1(C)C)C=C(C)C,-1.3892 +C=CCC1=C(C)C(CC1=O)OC(=O)C1C(C1(C)C)C=C(C)C,-1.1399 +CON=C(c1ccccc1COc1ccccc1C)C(=O)OC,-2.574 +CON=C(c1ccccc1COc1ccccc1C)C(=O)OC,-2.5682 +CC(N(c1c(cc(cc1N(=O)=O)S(=O)(=O)N)N(=O)=O)C(C)C)C,-1.5666 +CC(N(c1c(cc(cc1N(=O)=O)S(=O)(=O)N)N(=O)=O)C(C)C)C,-1.6532 +CCCN(C(=O)n1cncc1)CCOc1c(Cl)cc(cc1Cl)Cl,-0.87506 +CCCN(C(=O)n1cncc1)CCOc1c(Cl)cc(cc1Cl)Cl,-0.70757 +N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)(C)C,-1.2889 +N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)(C)C,-1.3979 +N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.8751 +N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.699 +ClC1C2OC2C2C1C1(Cl)C(=C(C2(C1(Cl)Cl)Cl)Cl)Cl,0.69897 +ClC1C2OC2C2C1C1(Cl)C(=C(C2(C1(Cl)Cl)Cl)Cl)Cl,1.6021 +N#CC(c1ccccc1)(Cn1cncn1)CCc1ccc(cc1)Cl,-1.6021 +N#CC(c1ccccc1)(Cn1cncn1)CCc1ccc(cc1)Cl,-1.4771 +O=C(C1C(C1(C)C)C=C(C(F)(F)F)Cl)OCc1cccc(c1C)c1ccccc1,-0.69897 +O=C(C1C(C1(C)C)C=C(C(F)(F)F)Cl)OCc1cccc(c1C)c1ccccc1,-0.90309 +[O-][N+](=O)c1cc(Cl)c(c(c1)Cl)N,-2.1761 +[O-][N+](=O)c1cc(Cl)c(c(c1)Cl)N,-2.3802 +CC(Oc1cccc(c1)NC(=O)c1ccccc1C(F)(F)F)C,-2.699 +CC(Oc1cccc(c1)NC(=O)c1ccccc1C(F)(F)F)C,-1.9395 
+CCCCNC(=O)n1c(NC(=O)OC)nc2c1cccc2,-2.0607 +CCCCNC(=O)n1c(NC(=O)OC)nc2c1cccc2,-2.3979 +ClC(SN1C(=O)c2c(C1=O)cccc2)(Cl)Cl,-2.699 +ClC(SN1C(=O)c2c(C1=O)cccc2)(Cl)Cl,-1.6021 +ClC(SN1C(=O)c2c(C1=O)cccc2)(Cl)Cl,-1.699 +C#CCOS(=O)OC1CCCCC1Oc1ccc(cc1)C(C)(C)C,-2.0 +C#CCOS(=O)OC1CCCCC1Oc1ccc(cc1)C(C)(C)C,-1.2788 +N#Cc1sc2=c(sc1C#N)c(=O)c1c(c2=O)cccc1,-1.0 +N#Cc1sc2=c(sc1C#N)c(=O)c1c(c2=O)cccc1,-0.77815 +CN(C(=O)Oc1nc(nc(c1C)C)N(C)C)C,-1.0969 +CN(C(=O)Oc1nc(nc(c1C)C)N(C)C)C,-1.0899 +CNc1cnn(c(=O)c1Cl)c1cccc(c1)C(F)(F)F,-1.273 +CNc1cnn(c(=O)c1Cl)c1cccc(c1)C(F)(F)F,-1.7097 +Clc1ccccc1c1nnc(nn1)c1ccccc1Cl,-1.301 +Clc1ccccc1c1nnc(nn1)c1ccccc1Cl,-1.238 +CCCCCCCCc1cc(N(=O)=O)c(c(c1)N(=O)=O)OC(=O)C=CC,-1.699 +CCCCCCCCc1cc(N(=O)=O)c(c(c1)N(=O)=O)OC(=O)C=CC,-1.8062 +CCSC(=O)N1CCCCCC1,-1.1706 +CCSC(=O)N1CCCCCC1,-0.30103 +OC(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,-1.3979 +OC(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,-2.0212 +CCOP(=O)(Oc1ccc(c(c1)C)SC)NC(C)C,-0.17609 +CCOP(=O)(Oc1ccc(c(c1)C)SC)NC(C)C,-0.23045 +Cn1ccc(cc1)c1ccn(cc1)C,-0.57403 +Cn1ccc(cc1)c1ccn(cc1)C,-0.40654 +CCCCC(c1ccc(cc1)Cl)(Cn1cncn1)C#N,-0.993 +CCCCC(c1ccc(cc1)Cl)(Cn1cncn1)C#N,-0.99123 +Fc1ccc(cc1)C(=O)CCCN1CCN(CC1)c1ccccn1,-2.0607 +Fc1ccc(cc1)C(=O)CCCN1CCN(CC1)c1ccccn1,-1.4771 +N#CC(c1c(Cl)ccc(c1Cl)n1ncc(=O)[nH]c1=O)c1ccc(cc1)Cl,-1.1761 +N#CC(c1c(Cl)ccc(c1Cl)n1ncc(=O)[nH]c1=O)c1ccc(cc1)Cl,-1.3617 +COC1CC(OC2C(C)C=CC=C3COC4C3(O)C(C=C(C4O)C)C(=O)OC3CC(CC=C2C)OC2(C3)C=CC(C(O2)C(C)C)C)OC(C1OC1CC(OC)C(C(O1)C)NC(=O)C)C,-1.9287e-16 +COC1CC(OC2C(C)C=CC=C3COC4C3(O)C(C=C(C4O)C)C(=O)OC3CC(CC=C2C)OC2(C3)C=CC(C(O2)C(C)C)C)OC(C1OC1CC(OC)C(C(O1)C)NC(=O)C)C,-0.39794 diff --git a/data/mazzatorta_mg_dup.csv b/data/mazzatorta_mg_dup.csv new file mode 100644 index 0000000..3e938ed --- /dev/null +++ b/data/mazzatorta_mg_dup.csv @@ -0,0 +1,215 @@ +SMILES,LOAEL +OCC(C1OC(=O)C(=C1O)O)O,-3.4844 +OCC(C1OC(=O)C(=C1O)O)O,-3.1915 +CC(c1ccccc1)C,-2.6646 +CC(c1ccccc1)C,-2.5198 +CCc1ccccc1,-2.6107 +CCc1ccccc1,-2.4639 +OCCO,-2.3979 +OCCO,-2.6021 +OCCO,-3.301 
+OCCO,-2.9638 +OCCO,-3.0 +C=Cc1ccccc1,-1.3222 +C=Cc1ccccc1,-2.4548 +C=Cc1ccccc1,-2.6021 +ClC(C(c1ccc(cc1)Cl)c1ccc(cc1)Cl)(Cl)Cl,-1.5051 +ClC(C(c1ccc(cc1)Cl)c1ccc(cc1)Cl)(Cl)Cl,0.60206 +COCN(c1c(CC)cccc1CC)C(=O)CCl,-1.1761 +COCN(c1c(CC)cccc1CC)C(=O)CCl,-1.1461 +COP(=S)(SCn1nnc2c(c1=O)cccc2)OC,0.4437 +COP(=S)(SCn1nnc2c(c1=O)cccc2)OC,-0.35218 +COP(=S)(SCn1nnc2c(c1=O)cccc2)OC,-0.41162 +O=C(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,-1.3979 +O=C(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,-2.0569 +ClC(C(SN1C(=O)C2C(C1=O)CC=CC2)(Cl)Cl)Cl,-1.0792 +ClC(C(SN1C(=O)C2C(C1=O)CC=CC2)(Cl)Cl)Cl,-1.1761 +CNC(=O)Oc1cccc2c1cccc2,-1.1931 +CNC(=O)Oc1cccc2c1cccc2,-1.7782 +ClC1CC2C(C1Cl)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,0.56864 +ClC1CC2C(C1Cl)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,0.30103 +ClC1CC2C(C1Cl)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,-0.77815 +ClC1CC2C(C1Cl)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,0.60206 +Nc1ccc(cc1)Cl,-0.77815 +Nc1ccc(cc1)Cl,-1.0969 +CCOP(=S)(Oc1ccc2c(c1)oc(=O)c(c2C)Cl)OCC,0.09691 +CCOP(=S)(Oc1ccc2c(c1)oc(=O)c(c2C)Cl)OCC,-0.23045 +COC(=O)c1c(Cl)c(Cl)c(c(c1Cl)Cl)C(=O)OC,-2.699 +COC(=O)c1c(Cl)c(Cl)c(c(c1Cl)Cl)C(=O)OC,-1.0 +OC(=O)C(Cl)(Cl)C,-1.4498 +OC(=O)C(Cl)(Cl)C,-1.699 +ClCCl,-1.699 +ClCCl,-1.7208 +COP(=O)(OC=C(Cl)Cl)OC,-0.36173 +COP(=O)(OC=C(Cl)Cl)OC,-0.33244 +OC(C(Cl)(Cl)Cl)(c1ccc(cc1)Cl)c1ccc(cc1)Cl,-1.301 +OC(C(Cl)(Cl)Cl)(c1ccc(cc1)Cl)c1ccc(cc1)Cl,-0.39794 +ClC1=C(Cl)C2(C(C1(Cl)C1C2C2CC1C1C2O1)(Cl)Cl)Cl,1.301 +ClC1=C(Cl)C2(C(C1(Cl)C1C2C2CC1C1C2O1)(Cl)Cl)Cl,0.90309 +ClC1=C(Cl)C2(C(C1(Cl)C1C2C2CC1C1C2O1)(Cl)Cl)Cl,1.0 +ClC1=C(Cl)C2(C(C1(Cl)C1C2C2CC1C1C2O1)(Cl)Cl)Cl,0.60206 +O=C(NC(=O)c1c(F)cccc1F)Nc1ccc(cc1)Cl,-0.90309 +O=C(NC(=O)c1c(F)cccc1F)Nc1ccc(cc1)Cl,-0.89209 +O=C(NC(=O)c1c(F)cccc1F)Nc1ccc(cc1)Cl,-0.8451 +CNC(=O)CSP(=S)(OC)OC,0.60206 +CNC(=O)CSP(=S)(OC)OC,-0.69897 +c1ccc(cc1)Nc1ccccc1,-1.4914 +c1ccc(cc1)Nc1ccccc1,-1.3979 +c1ccn2c(c1)c1ccccn1CC2,0.23657 +c1ccn2c(c1)c1ccccn1CC2,0.24413 +CCSCCSP(=S)(OCC)OCC,1.0 +CCSCCSP(=S)(OCC)OCC,1.3979 +CCSCCSP(=S)(OCC)OCC,0.65758 
+O=C(N(C)C)Nc1ccc(c(c1)Cl)Cl,-0.77815 +O=C(N(C)C)Nc1ccc(c(c1)Cl)Cl,-0.79588 +ClCCP(=O)(O)O,-2.1761 +ClCCP(=O)(O)O,-2.6493 +ClCCP(=O)(O)O,-1.0792 +Cn1cc(c2cccc(c2)C(F)(F)F)c(=O)c(c1)c1ccccc1,-1.3979 +Cn1cc(c2cccc(c2)C(F)(F)F)c(=O)c(c1)c1ccccc1,-1.5119 +CCP(=S)(Sc1ccccc1)OCC,-0.19866 +CCP(=S)(Sc1ccccc1)OCC,-0.69897 +ClC1C(Cl)C(Cl)C(C(C1Cl)Cl)Cl,-0.69897 +ClC1C(Cl)C(Cl)C(C(C1Cl)Cl)Cl,-0.66276 +ClC1C(Cl)C(Cl)C(C(C1Cl)Cl)Cl,-0.6721 +ClC1C(Cl)C(Cl)C(C(C1Cl)Cl)Cl,-0.90309 +ClC1C(Cl)C(Cl)C(C(C1Cl)Cl)Cl,-0.60206 +C=CCOC(c1ccc(cc1Cl)Cl)Cn1cncc1,-1.6021 +C=CCOC(c1ccc(cc1Cl)Cl)Cn1cncc1,-1.1761 +CCOC(=O)CC(C(=O)OCC)SP(=S)(OC)OC,-1.699 +CCOC(=O)CC(C(=O)OCC)SP(=S)(OC)OC,-2.5563 +COc1sc(=O)n(n1)CSP(=S)(OC)OC,-0.30103 +COc1sc(=O)n(n1)CSP(=S)(OC)OC,0.09691 +COc1sc(=O)n(n1)CSP(=S)(OC)OC,-0.09691 +COc1sc(=O)n(n1)CSP(=S)(OC)OC,-0.20412 +CNC(=O)ON=C(SC)C,-1.0 +CNC(=O)ON=C(SC)C,-1.301 +CCOP(=S)(Oc1ccc(cc1)N(=O)=O)OCC,-0.54407 +CCOP(=S)(Oc1ccc(cc1)N(=O)=O)OCC,0.37675 +COP(=S)(SCN1C(=O)c2c(C1=O)cccc2)OC,-1.301 +COP(=S)(SCN1C(=O)c2c(C1=O)cccc2)OC,-0.95424 +CCN(C(=O)C(=C(OP(=O)(OC)OC)C)Cl)CC,-1.0899 +CCN(C(=O)C(=C(OP(=O)(OC)OC)C)Cl)CC,-0.17609 +CCN(c1nc(cc(n1)C)OP(=S)(OC)OC)CC,-0.39794 +CCN(c1nc(cc(n1)C)OP(=S)(OC)OC)CC,-1.1399 +CCC(=O)Nc1ccc(c(c1)Cl)Cl,-1.301 +CCC(=O)Nc1ccc(c(c1)Cl)Cl,-1.8808 +CCCC1COC(O1)(Cn1cncn1)c1ccc(cc1Cl)Cl,-1.3979 +CCCC1COC(O1)(Cn1cncn1)c1ccc(cc1Cl)Cl,-1.9823 +CCNc1nc(NCC)nc(n1)Cl,-0.69897 +CCNc1nc(NCC)nc(n1)Cl,-0.72428 +ClC=C(c1cc(Cl)c(cc1Cl)Cl)OP(=O)(OC)OC,-2.0 +ClC=C(c1cc(Cl)c(cc1Cl)Cl)OP(=O)(OC)OC,-1.699 +CN(C(=S)SSC(=S)N(C)C)C,-1.1761 +CN(C(=S)SSC(=S)N(C)C)C,-1.0607 +CN(C(=S)SSC(=S)N(C)C)C,-0.73799 +CN(C(=S)SSC(=S)N(C)C)C,-1.0792 +CC(N(c1c(cc(cc1N(=O)=O)C(F)(F)F)N(=O)=O)C(C)C)C,-1.6021 +CC(N(c1c(cc(cc1N(=O)=O)C(F)(F)F)N(=O)=O)C(C)C)C,-3.0 +COP(=O)(SC)N,0.045757 +COP(=O)(SC)N,1.0 +COP(=O)(SC)N,0.5376 +COP(=O)(NC(=O)C)SC,-1.5441 +COP(=O)(NC(=O)C)SC,-0.39794 +CCOP(=S)(Oc1cc(C)nc(n1)C(C)C)OCC,-0.17609 +CCOP(=S)(Oc1cc(C)nc(n1)C(C)C)OCC,-0.76343 +NC1CCCCC1,-1.7672 
+NC1CCCCC1,-1.7782 +OC(=O)CNCP(=O)(O)O,-3.0 +OC(=O)CNCP(=O)(O)O,-2.4771 +S=C1NCCN1,0.63827 +S=C1NCCN1,0.60206 +S=C1NCCN1,-0.09691 +CON(C(=O)Nc1ccc(c(c1)Cl)Cl)C,-0.79588 +CON(C(=O)Nc1ccc(c(c1)Cl)Cl)C,-0.39794 +CCOP(=S)(Oc1nc(Cl)c(cc1Cl)Cl)OCC,-1.0 +CCOP(=S)(Oc1nc(Cl)c(cc1Cl)Cl)OCC,-1.2388e-09 +CC(OC(=O)Nc1cccc(c1)Cl)C,-2.699 +CC(OC(=O)Nc1cccc(c1)Cl)C,-3.0 +CCCCOCCOCCOCc1cc2OCOc2cc1CCC,-2.3979 +CCCCOCCOCCOCc1cc2OCOc2cc1CCC,-2.0 +N#CC(c1ccc(c(c1)Oc1ccccc1)F)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.0969 +N#CC(c1ccc(c(c1)Oc1ccccc1)F)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-0.87506 +N#CC(c1ccc(c(c1)Oc1ccccc1)F)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-0.77815 +N#CC(c1ccc(c(c1)Oc1ccccc1)F)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.0792 +ClC1C=CC2C1C1(Cl)C(=C(C2(C1(Cl)Cl)Cl)Cl)Cl,0.60206 +ClC1C=CC2C1C1(Cl)C(=C(C2(C1(Cl)Cl)Cl)Cl)Cl,0.42022 +CCOP(=S)(SCSC(C)(C)C)OCC,1.301 +CCOP(=S)(SCSC(C)(C)C)OCC,-0.30103 +CCOP(=S)(SCSC(C)(C)C)OCC,1.2218 +CN(C=Nc1ccc(cc1C)C)C=Nc1ccc(cc1C)C,-1.0086 +CN(C=Nc1ccc(cc1C)C)C=Nc1ccc(cc1C)C,-1.0 +CC(N1C(=O)c2ccccc2NS1(=O)=O)C,-1.6021 +CC(N1C(=O)c2ccccc2NS1(=O)=O)C,-1.5441 +CCSC(CC1CC(=O)C(=C(NOCC=CCl)CC)C(=O)C1)C,-2.0 +CCSC(CC1CC(=O)C(=C(NOCC=CCl)CC)C(=O)C1)C,-1.9345 +CCCN(C(=O)SCC)CCC,-0.95424 +CCCN(C(=O)SCC)CCC,-1.3979 +CSc1ccc(cc1C)OP(=S)(OC)OC,-0.57403 +CSc1ccc(cc1C)OP(=S)(OC)OC,0.14267 +CSc1ccc(cc1C)OP(=S)(OC)OC,0.34679 +c1scc(n1)c1nc2c([nH]1)cccc2,-0.30103 +c1scc(n1)c1nc2c([nH]1)cccc2,-1.6021 +c1scc(n1)c1nc2c([nH]1)cccc2,-1.4771 +Clc1c(Cl)c([N+](=O)[O-])c(c(c1Cl)Cl)Cl,-2.0969 +Clc1c(Cl)c([N+](=O)[O-])c(c(c1Cl)Cl)Cl,-2.1461 +Nc1ncn[nH]1,-0.39794 +Nc1ncn[nH]1,-0.69897 +O=C(NC(=O)c1c(F)cccc1F)Nc1cc(Cl)c(c(c1F)Cl)F,-1.415 +O=C(NC(=O)c1c(F)cccc1F)Nc1cc(Cl)c(c(c1F)Cl)F,-1.3979 +C=CCC1=C(C)C(CC1=O)OC(=O)C1C(C1(C)C)C=C(C)C,-1.8751 +C=CCC1=C(C)C(CC1=O)OC(=O)C1C(C1(C)C)C=C(C)C,-1.3892 +C=CCC1=C(C)C(CC1=O)OC(=O)C1C(C1(C)C)C=C(C)C,-1.1399 +CON=C(c1ccccc1COc1ccccc1C)C(=O)OC,-2.574 +CON=C(c1ccccc1COc1ccccc1C)C(=O)OC,-2.5682 +CC(N(c1c(cc(cc1N(=O)=O)S(=O)(=O)N)N(=O)=O)C(C)C)C,-1.5666 
+CC(N(c1c(cc(cc1N(=O)=O)S(=O)(=O)N)N(=O)=O)C(C)C)C,-1.6532 +CCCN(C(=O)n1cncc1)CCOc1c(Cl)cc(cc1Cl)Cl,-0.87506 +CCCN(C(=O)n1cncc1)CCOc1c(Cl)cc(cc1Cl)Cl,-0.70757 +N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)(C)C,-1.2889 +N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)(C)C,-1.3979 +N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.8751 +N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.699 +ClC1C2OC2C2C1C1(Cl)C(=C(C2(C1(Cl)Cl)Cl)Cl)Cl,0.69897 +ClC1C2OC2C2C1C1(Cl)C(=C(C2(C1(Cl)Cl)Cl)Cl)Cl,1.6021 +N#CC(c1ccccc1)(Cn1cncn1)CCc1ccc(cc1)Cl,-1.6021 +N#CC(c1ccccc1)(Cn1cncn1)CCc1ccc(cc1)Cl,-1.4771 +O=C(C1C(C1(C)C)C=C(C(F)(F)F)Cl)OCc1cccc(c1C)c1ccccc1,-0.69897 +O=C(C1C(C1(C)C)C=C(C(F)(F)F)Cl)OCc1cccc(c1C)c1ccccc1,-0.90309 +[O-][N+](=O)c1cc(Cl)c(c(c1)Cl)N,-2.1761 +[O-][N+](=O)c1cc(Cl)c(c(c1)Cl)N,-2.3802 +CC(Oc1cccc(c1)NC(=O)c1ccccc1C(F)(F)F)C,-2.699 +CC(Oc1cccc(c1)NC(=O)c1ccccc1C(F)(F)F)C,-1.9395 +CCCCNC(=O)n1c(NC(=O)OC)nc2c1cccc2,-2.0607 +CCCCNC(=O)n1c(NC(=O)OC)nc2c1cccc2,-2.3979 +ClC(SN1C(=O)c2c(C1=O)cccc2)(Cl)Cl,-2.699 +ClC(SN1C(=O)c2c(C1=O)cccc2)(Cl)Cl,-1.6021 +ClC(SN1C(=O)c2c(C1=O)cccc2)(Cl)Cl,-1.699 +C#CCOS(=O)OC1CCCCC1Oc1ccc(cc1)C(C)(C)C,-2.0 +C#CCOS(=O)OC1CCCCC1Oc1ccc(cc1)C(C)(C)C,-1.2788 +N#Cc1sc2=c(sc1C#N)c(=O)c1c(c2=O)cccc1,-1.0 +N#Cc1sc2=c(sc1C#N)c(=O)c1c(c2=O)cccc1,-0.77815 +CN(C(=O)Oc1nc(nc(c1C)C)N(C)C)C,-1.0969 +CN(C(=O)Oc1nc(nc(c1C)C)N(C)C)C,-1.0899 +CNc1cnn(c(=O)c1Cl)c1cccc(c1)C(F)(F)F,-1.273 +CNc1cnn(c(=O)c1Cl)c1cccc(c1)C(F)(F)F,-1.7097 +Clc1ccccc1c1nnc(nn1)c1ccccc1Cl,-1.301 +Clc1ccccc1c1nnc(nn1)c1ccccc1Cl,-1.238 +CCCCCCCCc1cc(N(=O)=O)c(c(c1)N(=O)=O)OC(=O)C=CC,-1.699 +CCCCCCCCc1cc(N(=O)=O)c(c(c1)N(=O)=O)OC(=O)C=CC,-1.8062 +CCSC(=O)N1CCCCCC1,-1.1706 +CCSC(=O)N1CCCCCC1,-0.30103 +OC(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,-1.3979 +OC(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,-2.0212 +CCOP(=O)(Oc1ccc(c(c1)C)SC)NC(C)C,-0.17609 +CCOP(=O)(Oc1ccc(c(c1)C)SC)NC(C)C,-0.23045 +Cn1ccc(cc1)c1ccn(cc1)C,-0.57403 +Cn1ccc(cc1)c1ccn(cc1)C,-0.40654 +CCCCC(c1ccc(cc1)Cl)(Cn1cncn1)C#N,-0.993 
+CCCCC(c1ccc(cc1)Cl)(Cn1cncn1)C#N,-0.99123 +Fc1ccc(cc1)C(=O)CCCN1CCN(CC1)c1ccccn1,-2.0607 +Fc1ccc(cc1)C(=O)CCCN1CCN(CC1)c1ccccn1,-1.4771 +N#CC(c1c(Cl)ccc(c1Cl)n1ncc(=O)[nH]c1=O)c1ccc(cc1)Cl,-1.1761 +N#CC(c1c(Cl)ccc(c1Cl)n1ncc(=O)[nH]c1=O)c1ccc(cc1)Cl,-1.3617 +COC1CC(OC2C(C)C=CC=C3COC4C3(O)C(C=C(C4O)C)C(=O)OC3CC(CC=C2C)OC2(C3)C=CC(C(O2)C(C)C)C)OC(C1OC1CC(OC)C(C(O1)C)NC(=O)C)C,-1.9287e-16 +COC1CC(OC2C(C)C=CC=C3COC4C3(O)C(C=C(C4O)C)C(=O)OC3CC(CC=C2C)OC2(C3)C=CC(C(O2)C(C)C)C)OC(C1OC1CC(OC)C(C(O1)C)NC(=O)C)C,-0.39794 diff --git a/data/swiss_mg_dup.csv b/data/swiss_mg_dup.csv new file mode 100644 index 0000000..3412c2e --- /dev/null +++ b/data/swiss_mg_dup.csv @@ -0,0 +1,194 @@ +SMILES,LOAEL +OC(=O)COc1ccc(cc1Cl)Cl,-1.7924 +OC(=O)COc1ccc(cc1Cl)Cl,-0.69897 +OC(=O)COc1ccc(cc1Cl)Cl,-1.8751 +CCOCN(c1c(C)cccc1CC)C(=O)CCl,-1.6767 +CCOCN(c1c(C)cccc1CC)C(=O)CCl,-1.8388 +CCOCN(c1c(C)cccc1CC)C(=O)CCl,-1.8254 +Clc1c(ccc(c1N)[N+](=O)[O-])Oc1ccccc1,-0.90309 +Clc1c(ccc(c1N)[N+](=O)[O-])Oc1ccccc1,-1.7924 +COCN(c1c(CC)cccc1CC)C(=O)CCl,-0.39794 +COCN(c1c(CC)cccc1CC)C(=O)CCl,-1.1461 +COCN(c1c(CC)cccc1CC)C(=O)CCl,-2.1004 +OCC(CCl)O,-1.4771 +OCC(CCl)O,-0.041393 +Nc1n[nH]cn1,-0.39794 +Nc1n[nH]cn1,-0.69897 +CCNc1nc(NC(C)C)nc(n1)Cl,-0.49136 +CCNc1nc(NC(C)C)nc(n1)Cl,-1.0 +CCNc1nc(NC(C)C)nc(n1)Cl,-1.3729 +CCOC(=O)CCN(C(C)C)SN(C(=O)Oc1cccc2c1OC(C2)(C)C)C,-1.0414 +CCOC(=O)CCN(C(C)C)SN(C(=O)Oc1cccc2c1OC(C2)(C)C)C,-1.3802 +Fc1ccc(c(c1)c1ccc(c(c1)Cl)Cl)NC(=O)c1cn(nc1C(F)F)C,-1.2405 +Fc1ccc(c(c1)c1ccc(c(c1)Cl)Cl)NC(=O)c1cn(nc1C(F)F)C,-1.0828 +ClC(SN1C(=O)C2C(C1=O)CC=CC2)(Cl)Cl,-2.0 +ClC(SN1C(=O)C2C(C1=O)CC=CC2)(Cl)Cl,-1.9912 +CNC(=O)Oc1cccc2c1OC(C2)(C)C,-0.068186 +CNC(=O)Oc1cccc2c1OC(C2)(C)C,-0.69897 +Nc1cnn(c(=O)c1Cl)c1ccccc1,-1.7782 +Nc1cnn(c(=O)c1Cl)c1ccccc1,-1.699 +ClCC[N+](C)(C)C.[Cl-],-2.1335 +ClCC[N+](C)(C)C.[Cl-],-2.0969 +ClCC[N+](C)(C)C.[Cl-],-1.8751 +N#Cc1c(Cl)c(C#N)c(c(c1Cl)Cl)Cl,-0.51851 +N#Cc1c(Cl)c(C#N)c(c(c1Cl)Cl)Cl,-1.6021 +CCOP(=S)(Oc1nc(Cl)c(cc1Cl)Cl)OCC,-1.0 
+CCOP(=S)(Oc1nc(Cl)c(cc1Cl)Cl)OCC,-0.77815 +CCOP(=S)(Oc1nc(Cl)c(cc1Cl)Cl)OCC,-0.0 +CCO/N=C(\C1=C(O)CC(CC1=O)C1CCCSC1)/CCC,-1.4472 +CCO/N=C(\C1=C(O)CC(CC1=O)C1CCCSC1)/CCC,-0.80618 +N#CC(c1ccc(c(c1)Oc1ccccc1)F)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.2833 +N#CC(c1ccc(c(c1)Oc1ccccc1)F)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.3579 +O[Sn](C1CCCCC1)(C1CCCCC1)C1CCCCC1,-0.14301 +O[Sn](C1CCCCC1)(C1CCCCC1)C1CCCCC1,-0.77815 +O[Sn](C1CCCCC1)(C1CCCCC1)C1CCCCC1,-0.0 +CCNC(=O)NC(=O)/C(=N\OC)/C#N,-1.3711 +CCNC(=O)NC(=O)/C(=N\OC)/C#N,-1.4814 +N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.699 +N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,-1.8751 +CN1CN(C)CSC1=S,-0.5563 +CN1CN(C)CSC1=S,-0.72428 +N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Br)Br,-0.39794 +N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Br)Br,-0.73239 +COc1cc(ccc1OC)/C(=C/C(=O)N1CCOCC1)/c1ccc(cc1)Cl,-1.5563 +COc1cc(ccc1OC)/C(=C/C(=O)N1CCOCC1)/c1ccc(cc1)Cl,-1.5315 +c1cc[n+]2c(c1)c1cccc[n+]1CC2,-0.46389 +c1cc[n+]2c(c1)c1cccc[n+]1CC2,-0.59106 +OC(=O)c1ccccc1.CCC(C1OC2(C=CC1C)OC1CC=C(C)C(OC3CC(OC)C(C(O3)C)OC3CC(OC)C(C(O3)C)NC)C(C)C=CC=C3C4(C(C(=O)OC(C2)C1)C=C(C)C(C4OC3)O)O)C,-0.39794 +OC(=O)c1ccccc1.CCC(C1OC2(C=CC1C)OC1CC=C(C)C(OC3CC(OC)C(C(O3)C)OC3CC(OC)C(C(O3)C)NC)C(C)C=CC=C3C4(C(C(=O)OC(C2)C1)C=C(C)C(C4OC3)O)O)C,-0.0 +O=S1OCC2C(CO1)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,-1.0414 +O=S1OCC2C(CO1)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,-0.4624 +O=S1OCC2C(CO1)C1(C(C2(Cl)C(=C1Cl)Cl)(Cl)Cl)Cl,-0.69897 +Fc1ccc(cc1)C1(Cn2cncn2)OC1c1ccccc1Cl,-0.69897 +Fc1ccc(cc1)C1(Cn2cncn2)OC1c1ccccc1Cl,-0.77815 +CCN(c1c(cc(cc1[N+](=O)[O-])C(F)(F)F)[N+](=O)[O-])CC(=C)C,-1.0294 +CCN(c1c(cc(cc1[N+](=O)[O-])C(F)(F)F)[N+](=O)[O-])CC(=C)C,-1.4624 +CCCSP(=O)(SCCC)OCC,0.30103 +CCCSP(=O)(SCCC)OCC,-0.38739 +CCOP(=O)(Oc1ccc(c(c1)C)SC)NC(C)C,-0.23045 +CCOP(=O)(Oc1ccc(c(c1)C)SC)NC(C)C,0.33724 +Clc1ccc(cc1)C(c1ccccc1Cl)(c1cncnc1)O,-0.5563 +Clc1ccc(cc1)C(c1ccccc1Cl)(c1cncnc1)O,-1.243 +Clc1ccc(cc1)C(c1ccccc1Cl)(c1cncnc1)O,-0.30103 +Clc1ccc(cc1)C(c1ccccc1Cl)(c1cncnc1)O,-0.39794 
+CSc1ccc(cc1C)OP(=S)(OC)OC,0.69897 +CSc1ccc(cc1C)OP(=S)(OC)OC,0.14267 +Clc1cc(cnc1Nc1c(cc(c(c1[N+](=O)[O-])Cl)C(F)(F)F)[N+](=O)[O-])C(F)(F)F,-0.59106 +Clc1cc(cnc1Nc1c(cc(c(c1[N+](=O)[O-])Cl)C(F)(F)F)[N+](=O)[O-])C(F)(F)F,-0.58206 +O=C(c1cccc(c1C(=O)NC(CS(=O)(=O)C)(C)C)I)Nc1ccc(cc1C)C(C(F)(F)F)(C(F)(F)F)F,-1.8976 +O=C(c1cccc(c1C(=O)NC(CS(=O)(=O)C)(C)C)I)Nc1ccc(cc1C)C(C(F)(F)F)(C(F)(F)F)F,-1.5315 +O=C(NC(=O)c1c(F)cccc1F)Nc1ccc(cc1F)Oc1ccc(cc1Cl)C(F)(F)F,-2.3617 +O=C(NC(=O)c1c(F)cccc1F)Nc1ccc(cc1F)Oc1ccc(cc1Cl)C(F)(F)F,-2.3385 +O=C(N(C)C)Nc1cccc(c1)C(F)(F)F,-1.2348 +O=C(N(C)C)Nc1cccc(c1)C(F)(F)F,-1.4771 +CC(C(c1cncnc1)(c1ccc(cc1)OC(F)(F)F)O)C,-0.99123 +CC(C(c1cncnc1)(c1ccc(cc1)OC(F)(F)F)O)C,-1.6345 +Fc1ccc(cc1)[Si](c1ccc(cc1)F)(Cn1cncn1)C,-1.0 +Fc1ccc(cc1)[Si](c1ccc(cc1)F)(Cn1cncn1)C,-0.69897 +OC(=O)CNCP(=O)(O)O,-2.9731 +OC(=O)CNCP(=O)(O)O,-2.7482 +OC(=O)CNCP(=O)(O)O,-2.4771 +OC(=O)CNCP(=O)(O)O,-3.0842 +C=CCOC(c1ccc(cc1Cl)Cl)Cn1cncc1,-1.2014 +C=CCOC(c1ccc(cc1Cl)Cl)Cn1cncc1,-1.1761 +CCC(c1noc(c1)NC(=O)c1c(OC)cccc1OC)(CC)C,-2.7218 +CCC(c1noc(c1)NC(=O)c1c(OC)cccc1OC)(CC)C,-2.7782 +CO/N=C(\c1ccccc1COc1ccccc1C)/C(=O)OC,-2.8763 +CO/N=C(\c1ccccc1COc1ccccc1C)/C(=O)OC,-2.574 +CON(C(=O)Nc1ccc(c(c1)Cl)Cl)C,-0.79588 +CON(C(=O)Nc1ccc(c(c1)Cl)Cl)C,-1.4949 +CCOC(=O)CC(C(=O)OCC)SP(=S)(OC)OC,-1.4624 +CCOC(=O)CC(C(=O)OCC)SP(=S)(OC)OC,-2.1553 +C[N+]1(C)CCCCC1.[Cl-],-2.4281 +C[N+]1(C)CCCCC1.[Cl-],-2.8351 +CCCCCCC(c1cc(cc(c1OC(=O)/C=C\C)[N+](=O)[O-])[N+](=O)[O-])C,-2.0821 +CCCCCCC(c1cc(cc(c1OC(=O)/C=C\C)[N+](=O)[O-])[N+](=O)[O-])C,-1.8513 +Cc1nnc(c(=O)n1N)c1ccccc1,-1.29 +Cc1nnc(c(=O)n1N)c1ccccc1,-1.8727 +ClCC(=O)N(c1c(C)cccc1C)Cn1cccn1,-1.9395 +ClCC(=O)N(c1c(C)cccc1C)Cn1cccn1,-1.2455 +OC1(Cn2ncnc2)C(CCC1(C)C)Cc1ccc(cc1)Cl,-1.1173 +OC1(Cn2ncnc2)C(CCC1(C)C)Cc1ccc(cc1)Cl,-1.1399 +COc1sc(=O)n(n1)CSP(=S)(OC)OC,0.19382 +COc1sc(=O)n(n1)CSP(=S)(OC)OC,-0.23553 +CBr,-1.0414 +CBr,-1.2304 +CSc1nnc(c(=O)n1N)C(C)(C)C,-1.1139 +CSc1nnc(c(=O)n1N)C(C)(C)C,-1.1584 +CCCCC(c1ccc(cc1)Cl)(Cn1cncn1)C#N,-0.99564 
+CCCCC(c1ccc(cc1)Cl)(Cn1cncn1)C#N,-2.0253 +CCN(C(=O)C(Oc1cccc2c1cccc2)C)CC,-1.6772 +CCN(C(=O)C(Oc1cccc2c1cccc2)C)CC,-2.0 +Oc1ccccc1c1ccccc1,-3.0 +Oc1ccccc1c1ccccc1,-2.301 +Oc1ccccc1c1ccccc1,-2.7251 +CCCN(c1c(cc(cc1[N+](=O)[O-])S(=O)(=O)N)[N+](=O)[O-])CCC,-1.1461 +CCCN(c1c(cc(cc1[N+](=O)[O-])S(=O)(=O)N)[N+](=O)[O-])CCC,-1.5563 +CNC(=O)ON=C(C(=O)N(C)C)SC,-0.62221 +CNC(=O)ON=C(C(=O)N(C)C)SC,-0.6902 +C[n+]1ccc(cc1)c1cc[n+](cc1)C,-0.40654 +C[n+]1ccc(cc1)c1cc[n+](cc1)C,-0.94792 +C[n+]1ccc(cc1)c1cc[n+](cc1)C,-0.57403 +CCOP(=S)(Oc1ccc(cc1)[N+](=O)[O-])OCC,-0.39794 +CCOP(=S)(Oc1ccc(cc1)[N+](=O)[O-])OCC,-0.20412 +COP(=S)(Oc1ccc(cc1)[N+](=O)[O-])OC,-0.39794 +COP(=S)(Oc1ccc(cc1)[N+](=O)[O-])OC,0.25964 +COP(=S)(Oc1ccc(cc1)[N+](=O)[O-])OC,0.30103 +CC(CC(c1sccc1NC(=O)c1cn(nc1C(F)(F)F)C)C)C,-1.9191 +CC(CC(c1sccc1NC(=O)c1cn(nc1C(F)(F)F)C)C)C,-2.0 +ClC(=CC1C(C1(C)C)C(=O)OCc1cccc(c1)Oc1ccccc1)Cl,-1.3979 +ClC(=CC1C(C1(C)C)C(=O)OCc1cccc(c1)Oc1ccccc1)Cl,-1.699 +COP(=S)(SCN1C(=O)c2c(C1=O)cccc2)OC,-0.60206 +COP(=S)(SCN1C(=O)c2c(C1=O)cccc2)OC,-0.25527 +OC(=O)c1nc(Cl)c(c(c1Cl)N)Cl,-2.301 +OC(=O)c1nc(Cl)c(c(c1Cl)N)Cl,-2.3979 +CO/C=C(\c1ccccc1COc1cccc(n1)C(F)(F)F)/C(=O)OC,-1.6628 +CO/C=C(\c1ccccc1COc1cccc(n1)C(F)(F)F)/C(=O)OC,-2.2095 +CCCOC(=O)NCCCN(C)C.Cl,-2.8338 +CCCOC(=O)NCCCN(C)C.Cl,-2.1761 +CCCOC(=O)NCCCN(C)C.Cl,-2.0569 +O=C(C(Oc1ccc(cc1)Oc1cnc2c(n1)ccc(c2)Cl)C)OCCON=C(C)C,-0.69897 +O=C(C(Oc1ccc(cc1)Oc1cnc2c(n1)ccc(c2)Cl)C)OCCON=C(C)C,-1.3979 +C#CCOS(=O)OC1CCCCC1Oc1ccc(cc1)C(C)(C)C,-1.1761 +C#CCOS(=O)OC1CCCCC1Oc1ccc(cc1)C(C)(C)C,-1.3838 +C#CCOS(=O)OC1CCCCC1Oc1ccc(cc1)C(C)(C)C,-1.4014 +ClCC(=O)N(c1c(C)cccc1CC)COC(C)C,-2.301 +ClCC(=O)N(c1c(C)cccc1CC)COC(C)C,-1.7993 +S=C1NCCCN1,-0.69897 +S=C1NCCCN1,-0.74819 +Clc1ccccc1CC(C1(Cl)CC1)(Cn1nc[nH]c1=S)O,-1.699 +Clc1ccccc1CC(C1(Cl)CC1)(Cn1nc[nH]c1=S)O,-2.8751 +CCOC(=O)c1cn2nc(cc2nc1C)OP(=S)(OCC)OCC,-1.0414 +CCOC(=O)c1cn2nc(cc2nc1C)OP(=S)(OCC)OCC,-0.60206 +COc1c(OC)cc(c(c1OC)C(=O)c1c(OC)ncc(c1C)Cl)C,-1.6325 
+COc1c(OC)cc(c(c1OC)C(=O)c1c(OC)ncc(c1C)Cl)C,-1.5611 +O=C(CC(C)(C)C)OC1=C(C(=O)OC21CCCC2)c1c(C)cc(cc1C)C,-1.2014 +O=C(CC(C)(C)C)OC1=C(C(=O)OC21CCCC2)c1c(C)cc(cc1C)C,-1.1703 +O=C1CCCC(=O)C1C(=O)c1ccc(cc1Cl)S(=O)(=O)C,-1.8573 +O=C1CCCC(=O)C1C(=O)c1ccc(cc1Cl)S(=O)(=O)C,0.39794 +O=C(NC(=O)c1c(F)cccc1F)Nc1cc(Cl)c(c(c1F)Cl)F,-1.3945 +O=C(NC(=O)c1c(F)cccc1F)Nc1cc(Cl)c(c(c1F)Cl)F,-2.0881 +CCNc1nc(Cl)nc(n1)NC(C)(C)C,-0.093422 +CCNc1nc(Cl)nc(n1)NC(C)(C)C,-0.2014 +CCNc1nc(Cl)nc(n1)NC(C)(C)C,-0.23045 +CS/C(=N/OC(=O)N(SN(C(=O)O/N=C(/SC)\C)C)C)/C,-0.69897 +CS/C(=N/OC(=O)N(SN(C(=O)O/N=C(/SC)\C)C)C)/C,-1.0 +CS/C(=N/OC(=O)N(SN(C(=O)O/N=C(/SC)\C)C)C)/C,-1.0792 +COC(=O)NC(=S)Nc1ccccc1NC(=S)NC(=O)OC,-1.5051 +COC(=O)NC(=S)Nc1ccccc1NC(=S)NC(=O)OC,-1.7243 +CN(C(=S)SSC(=S)N(C)C)C,-1.0792 +CN(C(=S)SSC(=S)N(C)C)C,-0.86332 +Cc1ccc(cc1)N(S(=O)(=O)N(C)C)SC(Cl)(Cl)F,-2.0 +Cc1ccc(cc1)N(S(=O)(=O)N(C)C)SC(Cl)(Cl)F,-1.9542 +O=C(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,-2.0569 +O=C(C(C)(C)C)C(n1ncnc1)Oc1ccc(cc1)Cl,-1.3856 +CC(N(C(=O)SCC(=C(Cl)Cl)Cl)C(C)C)C,-1.1139 +CC(N(C(=O)SCC(=C(Cl)Cl)Cl)C(C)C)C,-0.9345 +CCCN(c1c(cc(cc1[N+](=O)[O-])C(F)(F)F)[N+](=O)[O-])CCC,-1.4771 +CCCN(c1c(cc(cc1[N+](=O)[O-])C(F)(F)F)[N+](=O)[O-])CCC,-2.6812 +O=CNC(C(Cl)(Cl)Cl)N1CCN(CC1)C(C(Cl)(Cl)Cl)NC=O,-2.0 +O=CNC(C(Cl)(Cl)Cl)N1CCN(CC1)C(C(Cl)(Cl)Cl)NC=O,-2.2041 +C=CC1(C)OC(=O)N(C1=O)c1cc(Cl)cc(c1)Cl,-0.90309 +C=CC1(C)OC(=O)N(C1=O)c1cc(Cl)cc(c1)Cl,-0.43136 +CN(C(=S)S[Zn]SC(=S)N(C)C)C,-1.1139 +CN(C(=S)S[Zn]SC(=S)N(C)C)C,-0.39794 diff --git a/loael.Rmd b/loael.Rmd index dda3e38..b01e1e3 100644 --- a/loael.Rmd +++ b/loael.Rmd @@ -90,8 +90,9 @@ Elena: please check if this is publication strategy is ok for the Swiss Federal Materials and Methods ===================== -The following sections give a high level overview about -algorithms and datasets used for this study. In order to provide unambiguous references to algorithms and datasets, links to source code and data sources are included in the text. 
+The following sections give a high level overview about algorithms and datasets +used for this study. In order to provide unambiguous references to algorithms +and datasets, links to source code and data sources are included in the text. Datasets -------- @@ -253,12 +254,18 @@ weighted by its similarity to the query compound. In this case the prediction is The applicability domain (AD) of lazar models is determined by the structural diversity of the training data. If no similar compounds are found in the -training data no predictions will be generated. Warnings are issued if the similarity threshold has to be lowered from 0.5 to 0.2 in order to enable predictions and if lazar has to resort to weighted average predictions, because local random forests fail. Thus predictions without warnings can be considered as close to the applicability domain and predictions with warnings as more distant from the applicability domain. Quantitative applicability domain information can be obtained from the similarities of individual neighbors. +training data no predictions will be generated. Warnings are issued if the +similarity threshold has to be lowered from 0.5 to 0.2 in order to enable +predictions and if lazar has to resort to weighted average predictions, because +local random forests fail. Thus predictions without warnings can be considered +as close to the applicability domain and predictions with warnings as more +distant from the applicability domain. Quantitative applicability domain +information can be obtained from the similarities of individual neighbors. Local regression models consider neighbor similarities to the query compound, -by weighting the contribution of each neighbor is by its similarity. -The variability of local model predictions is reflected in the -95\% prediction interval associated with each prediction. +by weighting the contribution of each neighbor by its similarity.
The +variability of local model predictions is reflected in the 95\% prediction +interval associated with each prediction. ### Validation @@ -325,12 +332,12 @@ This result was confirmed with a visual inspection using the [CheS-Mapper](http://ches-mapper.org) (Chemical Space Mapping and Visualization in 3D, @Guetlein2012) tool. -CheS-Mapper can be used to analyze the relationship between the -structure of chemical compounds, their physico-chemical properties, and -biological or toxic effects. It depicts closely related (similar) compounds in 3D space and can be used with different kinds of features. -We have investigated structural as well as physico-chemical properties and -concluded that both datasets are very similar, both in terms of -chemical structures and physico-chemical properties. +CheS-Mapper can be used to analyze the relationship between the structure of +chemical compounds, their physico-chemical properties, and biological or toxic +effects. It depicts closely related (similar) compounds in 3D space and can be +used with different kinds of features. We have investigated structural as well +as physico-chemical properties and concluded that both datasets are very +similar, both in terms of chemical structures and physico-chemical properties. The only statistically significant difference between both datasets, is that the Mazzatorta dataset contains more small compounds (61 structures with less than 11 atoms) than the Swiss dataset (19 small structures, p-value 3.7E-7). @@ -358,11 +365,13 @@ MolPrint2D features that are utilized for model building in this work. ### Experimental variability versus prediction uncertainty -Duplicated LOAEL values can be found in both datasets and there is a -substantial number of `r length(unique(t$SMILES))` compounds occurring in both -datasets. These duplicates allow us to estimate the variability of -experimental results within individual datasets and between datasets. 
-Data with *identical* values (at five significant digits) in both datasets were excluded from variability analysis, because it it likely that they originate from the same experiments. +Duplicated LOAEL values can be found in both datasets and there is +a substantial number of `r length(unique(t$SMILES))` compounds occurring in +both datasets. These duplicates allow us to estimate the variability of +experimental results within individual datasets and between datasets. Data with +*identical* values (at five significant digits) in both datasets were excluded +from variability analysis, because it is likely that they originate from the +same experiments. ##### Intra dataset variability @@ -385,21 +394,40 @@ c.dup$sd <- ave(c.dup$LOAEL,c.dup$SMILES,FUN=sd) t$sd <- ave(t$LOAEL,t$SMILES,FUN=sd) p = t.test(m.dup$sd,s.dup$sd)$p.value + +m.mg = read.csv("data/mazzatorta_mg_dup.csv",header=T) +m.mg$sd <- ave(m.mg$LOAEL,m.mg$SMILES,FUN=sd) + +s.mg = read.csv("data/swiss_mg_dup.csv",header=T) +s.mg$sd <- ave(s.mg$LOAEL,s.mg$SMILES,FUN=sd) + +c.mg = read.csv("data/all_mg_dup.csv",header=T) +c.mg$sd <- ave(c.mg$LOAEL,c.mg$SMILES,FUN=sd) ``` The Mazzatorta dataset has `r length(m$SMILES)` LOAEL values for `r length(levels(m$SMILES))` unique structures, `r m.dupnr` -compounds have multiple measurements with a mean standard deviation of -`r round(mean(10^(-1*m.dup$sd)),2)` mmol/kg_bw/day (`r round(mean(m.dup$sd),2)` log10 units @mazzatorta08, [@fig:intra]). +compounds have multiple measurements with a mean standard deviation (-log10 transformed values) of +`r round(mean(m.dup$sd),2)` +(`r round(mean(10^(-1*m.mg$sd)),2)` mg/kg_bw/day, +`r round(mean(10^(-1*m.dup$sd)),2)` mmol/kg_bw/day) +(@mazzatorta08, [@fig:intra]).
The Swiss Federal Office dataset has `r length(s$SMILES)` rat LOAEL values for `r length(levels(s$SMILES))` unique structures, `r s.dupnr` compounds have -multiple measurements with a mean standard deviation of -`r round(mean(10^(-1*s.dup$sd)),2)` mmol/kg_bw/day (`r round(mean(s.dup$sd),2)` log10 units). +multiple measurements with a mean standard deviation (-log10 transformed values) of +`r round(mean(s.dup$sd),2)` +(`r round(mean(10^(-1*s.mg$sd)),2)` mg/kg_bw/day, +`r round(mean(10^(-1*s.dup$sd)),2)` mmol/kg_bw/day) +([@fig:intra]). Standard deviations of both datasets do not show a statistically significant difference with a p-value (t-test) of `r round(p,2)`. -The combined test set has a mean standard deviation of `r round(mean(10^(-1*c.dup$sd)),2)` mmol/kg_bw/day (`r round(mean(c.dup$sd),2)` log10 units). +The combined test set has a mean standard deviation (-log10 transformed values) of +`r round(mean(c.dup$sd),2)` +(`r round(mean(10^(-1*c.mg$sd)),2)` mg/kg_bw/day, +`r round(mean(10^(-1*c.dup$sd)),2)` mmol/kg_bw/day) +([@fig:intra]). ![Distribution and variability of LOAEL values in both datasets. Each vertical line represents a compound, dots are individual LOAEL values.](figures/dataset-variability.pdf){#fig:intra} diff --git a/loael.md b/loael.md index 7202dbc..c2bfb5b 100644 --- a/loael.md +++ b/loael.md @@ -82,8 +82,9 @@ Elena: please check if this is publication strategy is ok for the Swiss Federal Materials and Methods ===================== -The following sections give a high level overview about -algorithms and datasets used for this study. In order to provide unambiguous references to algorithms and datasets, links to source code and data sources are included in the text. +The following sections give a high level overview about algorithms and datasets +used for this study. In order to provide unambiguous references to algorithms +and datasets, links to source code and data sources are included in the text. 
Datasets -------- @@ -245,12 +246,18 @@ weighted by its similarity to the query compound. In this case the prediction is The applicability domain (AD) of lazar models is determined by the structural diversity of the training data. If no similar compounds are found in the -training data no predictions will be generated. Warnings are issued if the similarity threshold has to be lowered from 0.5 to 0.2 in order to enable predictions and if lazar has to resort to weighted average predictions, because local random forests fail. Thus predictions without warnings can be considered as close to the applicability domain and predictions with warnings as more distant from the applicability domain. Quantitative applicability domain information can be obtained from the similarities of individual neighbors. +training data no predictions will be generated. Warnings are issued if the +similarity threshold has to be lowered from 0.5 to 0.2 in order to enable +predictions and if lazar has to resort to weighted average predictions, because +local random forests fail. Thus predictions without warnings can be considered +as close to the applicability domain and predictions with warnings as more +distant from the applicability domain. Quantitative applicability domain +information can be obtained from the similarities of individual neighbors. Local regression models consider neighbor similarities to the query compound, -by weighting the contribution of each neighbor is by its similarity. -The variability of local model predictions is reflected in the -95\% prediction interval associated with each prediction. +by weighting the contribution of each neighbor by its similarity. The +variability of local model predictions is reflected in the 95\% prediction +interval associated with each prediction.
### Validation @@ -315,12 +322,12 @@ This result was confirmed with a visual inspection using the [CheS-Mapper](http://ches-mapper.org) (Chemical Space Mapping and Visualization in 3D, @Guetlein2012) tool. -CheS-Mapper can be used to analyze the relationship between the -structure of chemical compounds, their physico-chemical properties, and -biological or toxic effects. It depicts closely related (similar) compounds in 3D space and can be used with different kinds of features. -We have investigated structural as well as physico-chemical properties and -concluded that both datasets are very similar, both in terms of -chemical structures and physico-chemical properties. +CheS-Mapper can be used to analyze the relationship between the structure of +chemical compounds, their physico-chemical properties, and biological or toxic +effects. It depicts closely related (similar) compounds in 3D space and can be +used with different kinds of features. We have investigated structural as well +as physico-chemical properties and concluded that both datasets are very +similar, both in terms of chemical structures and physico-chemical properties. The only statistically significant difference between both datasets, is that the Mazzatorta dataset contains more small compounds (61 structures with less than 11 atoms) than the Swiss dataset (19 small structures, p-value 3.7E-7). @@ -348,11 +355,13 @@ MolPrint2D features that are utilized for model building in this work. ### Experimental variability versus prediction uncertainty -Duplicated LOAEL values can be found in both datasets and there is a -substantial number of 155 compounds occurring in both -datasets. These duplicates allow us to estimate the variability of -experimental results within individual datasets and between datasets. -Data with *identical* values (at five significant digits) in both datasets were excluded from variability analysis, because it it likely that they originate from the same experiments. 
+Duplicated LOAEL values can be found in both datasets and there is +a substantial number of 155 compounds occurring in +both datasets. These duplicates allow us to estimate the variability of +experimental results within individual datasets and between datasets. Data with +*identical* values (at five significant digits) in both datasets were excluded +from variability analysis, because it is likely that they originate from the +same experiments. ##### Intra dataset variability @@ -360,17 +369,27 @@ Data with *identical* values (at five significant digits) in both datasets were The Mazzatorta dataset has 567 LOAEL values for 445 unique structures, 93 -compounds have multiple measurements with a mean standard deviation of -0.56 mmol/kg_bw/day (0.32 log10 units @mazzatorta08, [@fig:intra]). +compounds have multiple measurements with a mean standard deviation (-log10 transformed values) of +0.32 +(0.56 mg/kg_bw/day, +0.56 mmol/kg_bw/day) +(@mazzatorta08, [@fig:intra]). The Swiss Federal Office dataset has 493 rat LOAEL values for 381 unique structures, 91 compounds have -multiple measurements with a mean standard deviation of -0.59 mmol/kg_bw/day (0.29 log10 units). +multiple measurements with a mean standard deviation (-log10 transformed values) of +0.29 +(0.57 mg/kg_bw/day, +0.59 mmol/kg_bw/day) +([@fig:intra]). Standard deviations of both datasets do not show a statistically significant difference with a p-value (t-test) of 0.21. -The combined test set has a mean standard deviation of 0.55 mmol/kg_bw/day (0.33 log10 units). +The combined test set has a mean standard deviation (-log10 transformed values) of +0.33 +(0.56 mg/kg_bw/day, +0.55 mmol/kg_bw/day) +([@fig:intra]). ![Distribution and variability of LOAEL values in both datasets.
Each vertical line represents a compound, dots are individual LOAEL values.](figures/dataset-variability.pdf){#fig:intra}
diff --git a/loael.pdf b/loael.pdf
index ebf09fd..4aa0543 100644
Binary files a/loael.pdf and b/loael.pdf differ
diff --git a/scripts/all_mg_dup.rb b/scripts/all_mg_dup.rb
new file mode 100755
index 0000000..48323d7
--- /dev/null
+++ b/scripts/all_mg_dup.rb
@@ -0,0 +1,43 @@
+#!/usr/bin/env ruby
+# Collects -log10 transformed LOAEL values from both input files and writes
+# compounds with more than one distinct value to data/all_mg_dup.csv.
+require_relative '../../lazar/lib/lazar'
+include OpenTox
+# First input: SMILES is taken from column 11, the dose from column 19
+# (presumably already in mg units -- TODO confirm against the CSV header).
+csv_in = CSV.read("data/NOAEL-LOAEL_SMILES_rat_chron.csv", :encoding => 'windows-1251:utf-8')
+head = csv_in.shift
+data = {}
+csv_in.each do |line|
+  smi = line[11]
+  mg = line[19].to_f
+  # to_f gives 0.0 for empty or non-numeric cells; skip those rows
+  unless mg.to_f == 0.0
+    c = Compound.from_smiles smi
+    data[c.smiles] ||= []
+    data[c.smiles] << -Math.log10(mg).signif(5)
+  end
+end
+# Second input: SMILES in column 0, dose in column 1 (presumably mmol, passed through mmol_to_mg).
+csv_in = CSV.read("data/LOAEL_mg_corrected_smiles_mmol.csv", :encoding => 'windows-1251:utf-8')
+head = csv_in.shift
+# Keep accumulating into the same hash: resetting data here would drop every
+# value read from the first file and leave only the second dataset in the output.
+csv_in.each do |line|
+  c = Compound.from_smiles line[0]
+  mmol = line[1].to_f
+  data[c.smiles] ||= []
+  data[c.smiles] << -Math.log10(c.mmol_to_mg(mmol)).signif(5)
+end
+File.open(File.join("data","all_mg_dup.csv"),"w+") do |f|
+  f.puts ["SMILES","LOAEL"].join ","
+  data.each do |smi,values|
+    # drop repeated values; only compounds with differing values are written
+    values.uniq!
+    if values.size > 1
+      values.each do |v|
+        f.puts "#{smi},#{v}"
+      end
+    end
+  end
+end
diff --git a/scripts/mazzatorta-unique-smiles.rb b/scripts/mazzatorta-unique-smiles.rb
index 306c107..4d00fad 100755
--- a/scripts/mazzatorta-unique-smiles.rb
+++ b/scripts/mazzatorta-unique-smiles.rb
@@ -5,7 +5,6 @@ include OpenTox
 csv_in = CSV.read(ARGV[0], :encoding => 'windows-1251:utf-8')
 head = csv_in.shift
 data = []
-data = []
 csv_in.each do |line|
   c = Compound.from_smiles line[0]
   # round to 5 significant digits in order to detect duplicates
diff --git a/scripts/mazzatorta_mg_dup.rb b/scripts/mazzatorta_mg_dup.rb
new file mode 100755
index 0000000..7ca6d79
--- /dev/null
+++ b/scripts/mazzatorta_mg_dup.rb
@@ -0,0 +1,26 @@
+#!/usr/bin/env ruby
+# Reads the CSV given as ARGV[0] (SMILES in column 0, dose in column 1) and writes
+# compounds with more than one distinct -log10 value to data/mazzatorta_mg_dup.csv.
+require_relative '../../lazar/lib/lazar'
+include OpenTox
+
+csv_in = CSV.read(ARGV[0], :encoding => 'windows-1251:utf-8')
+head = csv_in.shift
+data = {}
+csv_in.each do |line|
+  c = Compound.from_smiles line[0]
+  mmol = line[1].to_f
+  data[c] ||= []
+  data[c] << -Math.log10(c.mmol_to_mg(mmol)).signif(5)
+end
+File.open(File.join("data","mazzatorta_mg_dup.csv"),"w+") do |f|
+  f.puts ["SMILES","LOAEL"].join ","
+  data.each do |c,values|
+    values.uniq!
+    if values.size > 1
+      values.each do |v|
+        f.puts "#{c.smiles},#{v}"
+      end
+    end
+  end
+end
diff --git a/scripts/noael_loael2swiss_mg_dup.rb b/scripts/noael_loael2swiss_mg_dup.rb
new file mode 100755
index 0000000..2699719
--- /dev/null
+++ b/scripts/noael_loael2swiss_mg_dup.rb
@@ -0,0 +1,28 @@
+#!/usr/bin/env ruby
+# Reads the CSV given as ARGV[0] (SMILES in column 11, dose in column 19) and writes
+# compounds with more than one distinct -log10 value to data/swiss_mg_dup.csv.
+require_relative '../../lazar/lib/lazar'
+include OpenTox
+csv_in = CSV.read(ARGV[0], :encoding => 'windows-1251:utf-8')
+head = csv_in.shift
+data = {}
+csv_in.each do |line|
+  smi = line[11]
+  mg = line[19].to_f
+  unless mg.to_f == 0.0
+    c = Compound.from_smiles smi
+    data[c.smiles] ||= []
+    data[c.smiles] << -Math.log10(mg).signif(5)
+  end
+end
+File.open(File.join("data","swiss_mg_dup.csv"),"w+") do |f|
+  f.puts ["SMILES","LOAEL"].join ","
+  data.each do |smi,values|
+    values.uniq!
+    if values.size > 1
+      values.each do |v|
+        f.puts "#{smi},#{v}"
+      end
+    end
+  end
+end
-- 
cgit v1.2.3