From 455da06aa6459da0d25b286ca6cb866ff64c4c34 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 20 Jun 2019 22:01:50 +0200 Subject: separate csv serialisations for batch predictions and training data, repeated measurements in mutagenicity dataset fixed, daphnia import fixed, CENTRAL_MONGO_IP removed --- data/Mutagenicity-Salmonella_typhimurium.csv | 96 ++++++++++++++++++---------- ext/lazar/extconf.rb | 2 +- ext/lazar/rinstall.R | 22 ++++--- lib/dataset.rb | 75 +++++++++++++++------- lib/download.rb | 3 +- lib/feature.rb | 11 ++-- lib/lazar.rb | 9 ++- lib/model.rb | 8 +-- lib/validation-statistics.rb | 4 +- test/classification-model.rb | 10 ++- test/dataset.rb | 18 +++++- test/regression-model.rb | 17 ++++- 12 files changed, 184 insertions(+), 91 deletions(-) diff --git a/data/Mutagenicity-Salmonella_typhimurium.csv b/data/Mutagenicity-Salmonella_typhimurium.csv index 0694a94..331de54 100644 --- a/data/Mutagenicity-Salmonella_typhimurium.csv +++ b/data/Mutagenicity-Salmonella_typhimurium.csv @@ -696,7 +696,8 @@ Clc1cc(Cl)c(c(c1)S(=O)c1cc(Cl)cc(c1O)Cl)O,mutagenic O=C1C=C(C(=O)C(=C1)C1=CC(=O)C=C(C1=O)C(C)(C)C)C(C)(C)C,mutagenic OC(=O)Cc1ccccc1Cl,non-mutagenic C=CCCC(C=O)CC,non-mutagenic -Nc1cnn(c(=O)c1Cl)c1ccccc1,non-mutagenic mutagenic +Nc1cnn(c(=O)c1Cl)c1ccccc1,non-mutagenic +Nc1cnn(c(=O)c1Cl)c1ccccc1,mutagenic COc1cccc2c1C(=O)c1c(C2=O)c(O)c2c(c1O)C(OC1CC(N)C(C(O1)C)O)CC(C2)(O)C(=O)C,mutagenic c1ccc2c(-c3ccccc3C3C2N3)c1,mutagenic c1ccc2c(c1)cc1c3c2[C@H]2O[C@H]2c3cc2c1cccc2,mutagenic @@ -856,7 +857,8 @@ O[C@@H]1[C@H](O)[C@@H](O)CO[C@H]1N(c1ccc(cc1)[N+](=O)[O-])N=O,mutagenic OC(=O)C(CC(=O)c1cccc(c1N)O)N,non-mutagenic C=O,mutagenic O=NN1CC[C@H](C1)O,mutagenic -Oc1ccccc1c1ccccc1,non-mutagenic mutagenic +Oc1ccccc1c1ccccc1,non-mutagenic +Oc1ccccc1c1ccccc1,mutagenic CC(=O)C1=C(O)C2N(C1=O)C(C1C2c2c[nH]c3c2c(C1)ccc3)(C)C,mutagenic [O-][N+](=O)C1=Cc2c3c1cccc3cc1c2c2ccccc2cc1,mutagenic c1cc2ccc3c4c2c(c1)ccc4nc1c3cccc1,mutagenic @@ -1501,7 +1503,8 @@ 
CCCCOC(=O)c1ccc(cc1)O,non-mutagenic OC1C=Cc2c(C1O)ccc1c2cc2ccccc2c1[N+](=O)[O-],mutagenic O/N=C(/c1ccccc1)\N,mutagenic Clc1cc(N)c(c(c1)C(=O)O)Cl,mutagenic -CCOP(=O)(O/C(=C\Cl)/c1ccc(cc1Cl)Cl)OCC,mutagenic non-mutagenic +CCOP(=O)(O/C(=C\Cl)/c1ccc(cc1Cl)Cl)OCC,mutagenic +CCOP(=O)(O/C(=C\Cl)/c1ccc(cc1Cl)Cl)OCC,non-mutagenic Nc1ccc(cc1)N=Nc1cccc(c1N)N,mutagenic CC1=C[C@]2(O[C@@H](C1)[C@@H](/C=C/[C@H]1CC[C@]3(O1)CC[C@@H]1[C@@H](O3)[C@H](O)C(=C)[C@H](O1)[C@H](C[C@@H]([C@H]1O[C@@]3(CCCCO3)CC[C@H]1C)C)O)C)O[C@@H](CC[C@@H]2O)C[C@](C(=O)O)(O)C,non-mutagenic OC(=O)C(Oc1ccc(cc1)[C@@H]1CC1(Cl)Cl)(C)C,non-mutagenic @@ -1532,7 +1535,8 @@ ClCC(Cl)(Cl)Cl,non-mutagenic O=C(c1csc(c1)[N+](=O)[O-])Nc1ccccc1[N+](=O)[O-],mutagenic CC(=C)C=O,mutagenic c1ccc2c(c1)cc1c(c2)c2Oc2c2c1cccc2,mutagenic -Clc1cc(Cl)cc(c1Oc1ccc(cc1)[N+](=O)[O-])Cl,mutagenic non-mutagenic +Clc1cc(Cl)cc(c1Oc1ccc(cc1)[N+](=O)[O-])Cl,mutagenic +Clc1cc(Cl)cc(c1Oc1ccc(cc1)[N+](=O)[O-])Cl,non-mutagenic CC[C@H](c1ccccc1O)C,non-mutagenic c1ccc(cc1)Cc1ccccc1OCC1CO1,mutagenic CC[n+]1c2ccccc2nc2c1cccc2,mutagenic @@ -1708,7 +1712,8 @@ Oc1ccc2c(c1N=Nc1ccc(cc1)S(=O)(=O)O)c(cc(c2)S(=O)(=O)O)S(=O)(=O)O,non-mutagenic CCCCCC(=O)OC1(CCC2C1(C)CCC1C2CCC2=CC(=O)CCC12)C(=O)C,non-mutagenic O=NN1CCC[C@@H](C1)O,mutagenic ClCC(=O)c1ccc(cc1Cl)Cl,mutagenic -[O-][N+](=O)c1ccc(cc1)CNc1[nH]cnc2-c1ncn2,mutagenic non-mutagenic +[O-][N+](=O)c1ccc(cc1)CNc1[nH]cnc2-c1ncn2,mutagenic +[O-][N+](=O)c1ccc(cc1)CNc1[nH]cnc2-c1ncn2,non-mutagenic Nc1ccc2c3c1-c1ccccc1-c3ccc2,mutagenic CC(=O)OCc1ccc(cc1)N=Nc1ccc(cc1)COC(=O)C,mutagenic BrCC(C(OP(=O)(OC(C(CBr)Br)C)OC(C(CBr)Br)C)C)Br,mutagenic @@ -1757,7 +1762,8 @@ NCCN,mutagenic Nc1sc2c(n1)C1CCCNC1CC2,non-mutagenic O[C@@H]([C@@H](C(=O)O)O)C(=O)O,non-mutagenic OC[C@H]1O[C@@H](Oc2cc(O)cc3c2c(=O)c2c(o3)c(O)ccc2O)[C@@H]([C@H]([C@@H]1O)O)O,mutagenic -c1scc(n1)c1nc2c([nH]1)cccc2,mutagenic non-mutagenic +c1scc(n1)c1nc2c([nH]1)cccc2,mutagenic +c1scc(n1)c1nc2c([nH]1)cccc2,non-mutagenic CCCCOc1ccc(cc1)CC(=O)NO,mutagenic 
[O-][N+](=O)c1nc2c([nH]1)cccc2,mutagenic Nc1ccc2c(n1)n1cccc(c1n2)C,mutagenic @@ -2102,7 +2108,8 @@ N#Cc1cc(I)c(c(c1)[N+](=O)[O-])O,non-mutagenic OCCOc1ccccc1,non-mutagenic CC(OC(=O)COc1ccc(cc1Cl)Cl)C,non-mutagenic CCCCCCOC(=O)c1ccccc1,non-mutagenic -COC(=O)c1cc(ccc1[N+](=O)[O-])Oc1ccc(cc1Cl)Cl,mutagenic non-mutagenic +COC(=O)c1cc(ccc1[N+](=O)[O-])Oc1ccc(cc1Cl)Cl,mutagenic +COC(=O)c1cc(ccc1[N+](=O)[O-])Oc1ccc(cc1Cl)Cl,non-mutagenic Nc1cc(Cl)c(c(c1)Cl)N,mutagenic CNC(=O)/C=C(/OP(=O)(OC)OC)\C,mutagenic COC(=C1C(=NC(=C([C@@H]1c1cccc(c1)[N+](=O)[O-])C(=O)OC/C=C/c1ccccc1)C)C)O,non-mutagenic @@ -2538,7 +2545,8 @@ C=CC(=O)NC(CC(=O)C)(C)C,non-mutagenic CCOP(=S)(SCn1c(=O)oc2c1ccc(c2)Cl)OCC,non-mutagenic CCOP(=O)(SCCN(C(C)C)C(C)C)C,non-mutagenic COc1cc(CNC(=O)C(Br)C)ccc1O,mutagenic -CN1CN(C)CSC1=S,mutagenic non-mutagenic +CN1CN(C)CSC1=S,mutagenic +CN1CN(C)CSC1=S,non-mutagenic [O-][N+](=O)c1cccc(c1C)N=[N+](c1cccc(c1C)[N+](=O)[O-])[O-],non-mutagenic OC(C(Cl)(Cl)Cl)(c1ccc(cc1)Cl)c1ccc(cc1)Cl,non-mutagenic C=CCN(CC=C)N=O,mutagenic @@ -2680,7 +2688,8 @@ ONc1ccc(cc1C(C)(C)C)c1ccccc1,mutagenic O/N=C/c1ccccn1,non-mutagenic ClCCCCN(COC(=O)C)N=O,mutagenic COc1cc2O[C@@H]3[C@H](c2c2c1c1[C@@H](O)C[C@@H](c1c(=O)o2)O)C=CO3,mutagenic -CNC(=O)Oc1cccc2c1cccc2,mutagenic non-mutagenic +CNC(=O)Oc1cccc2c1cccc2,mutagenic +CNC(=O)Oc1cccc2c1cccc2,non-mutagenic Nc1cc(C)c(c(c1)[N+](=O)[O-])N,mutagenic ClCc1ccc2c3c1ccc1c3c(cc2)cc2c1cccc2,non-mutagenic CC(=O)Nc1nc(NC(=O)C)nc(n1)c1ccc(o1)[N+](=O)[O-],mutagenic @@ -2722,7 +2731,8 @@ Cc1ccc(c(c1)[N+](=O)[O-])C,mutagenic O=C1C=CC(=O)C=C1c1ccccc1,non-mutagenic O=C(c1ccccc1)N(OC(=O)C)OCc1ccc(cc1)C(C)(C)C,mutagenic CC(=O)Nc1scc(n1)c1scc(c1)[N+](=O)[O-],mutagenic -CCCN(c1c(cc(cc1[N+](=O)[O-])C(F)(F)F)[N+](=O)[O-])CCC,mutagenic non-mutagenic +CCCN(c1c(cc(cc1[N+](=O)[O-])C(F)(F)F)[N+](=O)[O-])CCC,mutagenic +CCCN(c1c(cc(cc1[N+](=O)[O-])C(F)(F)F)[N+](=O)[O-])CCC,non-mutagenic OC(=O)c1ccco1,non-mutagenic [O-][N+](=O)OOC(=O)C,mutagenic 
Cc1ccccc1N=Nc1c(O)ccc2c1cccc2,mutagenic @@ -3440,7 +3450,8 @@ CCc1ccccc1[N+](=O)[O-],non-mutagenic ClC(c1ccccc1)C(=O)Cl,mutagenic OC(=O)Cc1ccc(cc1)N,mutagenic CCCSC(=O)Cl,mutagenic -[O-][N+](=O)NC(=N)NC,mutagenic non-mutagenic +[O-][N+](=O)NC(=N)NC,mutagenic +[O-][N+](=O)NC(=N)NC,non-mutagenic Oc1ccc2c(c1N=Nc1ccccc1)ccc(c2)S(=O)(=O)O,non-mutagenic COC(C1=C(N2CC2)C(=O)C(=C(C1=O)N1CC1)C)COC(=O)N,mutagenic COCC12OOC2(C)c2c(O1)cccc2,mutagenic @@ -3634,7 +3645,8 @@ O=NN1CCCCCCCCCCCC1,mutagenic Nc1cc([N+](=O)[O-])c(c(c1C)C)N,mutagenic O=NN(Cc1ccc(cc1)C)C,non-mutagenic O=Nc1cc(ccc1C)[N+](=O)[O-],mutagenic -Cc1cccc(c1N)C,mutagenic non-mutagenic +Cc1cccc(c1N)C,mutagenic +Cc1cccc(c1N)C,non-mutagenic [O-][N+](=O)c1ccc(c(c1)C)N,mutagenic Clc1ccc(cc1)C(=O)c1ccc(cc1)OC(C(=O)O)(C)C,non-mutagenic CC(=O)Nc1ccc(cc1)Oc1ccc(cc1)N,mutagenic @@ -4066,7 +4078,8 @@ Nc1ccc2c(c1)cns2,mutagenic Sc1nc2c(s1)cccc2,non-mutagenic N#CCC[C@](C#N)(CBr)Br,non-mutagenic COc1ccc(cc1)N=[N+](c1ccc(cc1)OC)[O-],mutagenic -COP(=S)(Oc1ccc(c(c1)C)[N+](=O)[O-])OC,mutagenic non-mutagenic +COP(=S)(Oc1ccc(c(c1)C)[N+](=O)[O-])OC,mutagenic +COP(=S)(Oc1ccc(c(c1)C)[N+](=O)[O-])OC,non-mutagenic CCOc1ccc(cc1)[N+](=O)[O-],mutagenic NCCCC[C@@H](C(=O)N1CCC[C@H]1C(=O)O)N[C@H](C(=O)O)CCc1ccccc1,non-mutagenic Clc1cc(N)c(cc1c1cc(Cl)c(cc1Cl)N)Cl,mutagenic @@ -4737,7 +4750,8 @@ CCCCOC(=O)c1ccccc1C(=O)OC1CCCCC1,non-mutagenic CCc1[nH]c2c(n1)c1c(cc2)ccc2c1cc(O)cc2,mutagenic ClC(=O)c1ccccc1C(=O)Cl,mutagenic CCc1cccc2c1nccc2,mutagenic -O=C(N(C)C)Nc1ccc(c(c1)Cl)Cl,mutagenic non-mutagenic +O=C(N(C)C)Nc1ccc(c(c1)Cl)Cl,mutagenic +O=C(N(C)C)Nc1ccc(c(c1)Cl)Cl,non-mutagenic Nc1ccc(c(c1)Cl)C,mutagenic [O-][N+](=O)c1ccc2c3c1cccc3CC2,mutagenic Nc1ccc2c(c1)nc1c(c2)ccc(c1)N,mutagenic @@ -4753,7 +4767,8 @@ Oc1ccc(c(c1)C)Cl,non-mutagenic CCCCN(CC(=O)CC)N=O,mutagenic Oc1ccc(cc1)c1ccc(cc1)O,non-mutagenic c1ccc(cc1)c1ccccc1OCC1CO1,mutagenic -COc1c(C/C=C(/CCC(=O)O)\C)c(O)c2c(c1C)COC2=O,mutagenic non-mutagenic 
+COc1c(C/C=C(/CCC(=O)O)\C)c(O)c2c(c1C)COC2=O,mutagenic +COc1c(C/C=C(/CCC(=O)O)\C)c(O)c2c(c1C)COC2=O,non-mutagenic OCc1ccc(cc1)Br,non-mutagenic OCCN(c1ccc(cc1)N=Nc1cccnc1)CCO,non-mutagenic ClCCSCC(C(=O)NCC(=O)OC)NC(=O)CCC(C(=O)O)N,mutagenic @@ -5364,7 +5379,8 @@ Clc1ccc(c(c1)Cl)S(=O)(=O)n1ncc(c(c1=O)Cl)Cl,non-mutagenic OC(COc1ccc(cc1)NC(=O)C)CNC(C)C,non-mutagenic O=C(C(=C)C)OCC(COC(=O)C(=C)C)(C)C,non-mutagenic OCC1OC(OC23C=C(C)C4(C(C3C(=O)C(C2)(C)C)(C)O)CC4)C(C(C1OC(=O)/C=C/c1ccc(cc1)O)O)OC(=O)C,mutagenic -CO/C(=C\C(=O)O)/C(=O)C(=C)C,mutagenic non-mutagenic +CO/C(=C\C(=O)O)/C(=O)C(=C)C,mutagenic +CO/C(=C\C(=O)O)/C(=O)C(=C)C,non-mutagenic O=C1CCc2c1c1c(cc2)ccc2c1cccc2,non-mutagenic CCCCOCCCC,non-mutagenic CCNC(=N)N([N+](=O)[O-])N=O,mutagenic @@ -5798,7 +5814,8 @@ C=CCOC(=O)c1ccccc1C(=O)OCC=C,non-mutagenic CCC(=O)Nc1ccc(c(c1)Cl)Cl,non-mutagenic Cc1cccc2c1c1ccc3c(c1cc2)cccc3,mutagenic CC(=O)Nc1scc(n1)/C=C\c1ccc(o1)[N+](=O)[O-],mutagenic -NC(=O)Cc1cccc2c1cccc2,mutagenic non-mutagenic +NC(=O)Cc1cccc2c1cccc2,mutagenic +NC(=O)Cc1cccc2c1cccc2,non-mutagenic [O-][N+](=O)c1ccc(cc1)n1cnc2c1ncnc2N,mutagenic Cc1cc2n(C)c(nc2c2c1nccn2)N,mutagenic O=C(N(C)C)Nc1ccc(c(c1)Cl)C,non-mutagenic @@ -6199,7 +6216,8 @@ Cc1nc(C)cc(c1)c1cc2c(cc1F)n1c(n2C2CC2)cc(=O)n(c1=O)O,mutagenic ClCCN(c1ccc(cc1)c1[nH]c2c(n1)cc(cc2)CCCCCCc1nc2c([nH]1)ccc(c2)N1CCN(CC1)C)CCCl,non-mutagenic Cn1c(N)nc2c1cc1ncccc1n2,non-mutagenic COC(=O)C[C@H]1[C@@]2(C)[C@H](OC3C2=C(C)[C@@H](C3)c2cocc2)[C@H]2C3[C@]1(C)C(=O)C=C[C@@]3(C)C(=O)O2,non-mutagenic -N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,non-mutagenic mutagenic +N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,non-mutagenic +N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,mutagenic OC1CC=Cc2c1cc1ccc3c4c1c2ccc4ccc3,mutagenic COc1ccc(cc1)C(C=C)O,non-mutagenic BrCC(=O)N(Cc1ccccc1)C,non-mutagenic @@ -6423,7 +6441,8 @@ O[C@@H]1C=Cc2c([C@@H]1O)c1cc3ccc(c4c3c(c1cc2)CC4)C,mutagenic CC(=O)Nc1cccc2c1ncc(c2)F,mutagenic Nc1ccc(cc1)/C=C/c1cccc(c1)N,mutagenic 
CC(OC(=O)/C=C/c1ccc(o1)[N+](=O)[O-])C,mutagenic -O=C1C=C(O)c2c(C1=O)cccc2,mutagenic non-mutagenic +O=C1C=C(O)c2c(C1=O)cccc2,mutagenic +O=C1C=C(O)c2c(C1=O)cccc2,non-mutagenic Cl/C=C\C[N+]12CN3CN(C2)CN(C1)C3,mutagenic [O-][N+](=O)c1cc(ccc1C)C(=O)O,mutagenic ClCCN(c1ccc(cc1)CC(=O)O[C@H]1CC[C@]2([C@H](C1)CC[C@@H]1[C@@H]2CC[C@]2([C@H]1CCC(=O)N2)C)C)CCCl,mutagenic @@ -7270,7 +7289,7 @@ OC[C@H]1O[C@@H](O[C@@]23C=C(C)C4([C@]([C@@H]3C(=O)C(C2)(C)C)(C)O)CC4)[C@@H]([C@H Oc1ccc(cc1)/N=N/c1ccc(cc1)c1ccc(cc1)/N=N/c1c(N)c2c(cc1S(=O)(=O)O)cc(c(c2O)/N=N/c1ccccc1)S(=O)(=O)O.[Na+].[Na+],mutagenic N[C@@H]1CCC[C@H](C1)N,non-mutagenic O=C1c2ccccc2N/C/1=C\1/Nc2c(C1=O)cccc2,mutagenic -C1CCC(CC1)N1[C@@H]2[C@H]1c1ccccc1-c1c2cccc1,mutagenic +C1CCC(CC1)[N@@]1[C@@H]2[C@H]1c1ccccc1-c1c2cccc1,mutagenic Cl[C@@H](C=C)CCl,non-mutagenic COc1ccccc1N.Cl,mutagenic BrC[C@H]1CN(C)[C@H]2[C@H](C1)c1cccc3c1c(C2)c[nH]3,mutagenic @@ -7343,7 +7362,7 @@ O=CCC=O,mutagenic Cc1nsc(c1)N.Cl,mutagenic CCCCN1[C@@H]2[C@H]1c1ccccc1-c1c2cccc1,mutagenic O[C@@H]1CC[C@]2([C@@H](C1)CC[C@@H]1[C@@H]2CC[C@]2([C@H]1CC[C@@H]2[C@@H](CCC(=O)O)C)C)C,non-mutagenic -CCOC(=O)O[C@H](c1ccnc2c1cc(OC)cc2)[C@@H]1C[C@@H]2CCN1C[C@@H]2C=C,non-mutagenic +CCOC(=O)O[C@H](c1ccnc2c1cc(OC)cc2)[C@@H]1C[C@@H]2CC[N@]1C[C@@H]2C=C,non-mutagenic OS(=O)(=O)O.OC[C@@H]1O[C@@H](O[C@H]([C@@H](C(=O)N[C@@H]([C@@H]([C@H](C(=O)N[C@@H](C(=O)NCCc2scc(n2)c2ncc(s2)C(=O)NCCC[S+](C)C)[C@@H](O)C)C)O)C)NC(=O)c2nc(nc(c2C)N)[C@H](CC(=O)N)NC[C@@H](C(=O)N)N)c2nc[nH]c2)[C@H]([C@@H]([C@@H]1O)O)O[C@@H]1O[C@H](CO)[C@H]([C@H]([C@@H]1O)OC(=O)N)O.OC[C@@H]1O[C@@H](O[C@H]([C@H](C(=O)N[C@@H]([C@@H]([C@@H](C(=O)N[C@@H](C(=O)NCCc2scc(n2)c2ncc(s2)C(=O)NCCC[S+](C)C)[C@@H](O)C)C)O)C)NC(=O)c2nc(nc(c2C)N)[C@H](CC(=O)N)NC[C@@H](C(=O)N)N)c2nc[nH]c2)[C@H]([C@@H]([C@H]1O)O)O[C@@H]1O[C@H](CO)[C@H]([C@H]([C@@H]1O)OC(=O)N)O,mutagenic O[C@@H]1C[C@@]2(C)[C@H]([C@]([C@@H]1O)(C)C(=O)O)CC[C@@]1([C@@H]2CC=C2[C@]1(C)CC[C@]1([C@H]2CC(C)(C)CC1)C(=O)O)C,non-mutagenic 
CN(CCN(c1ccccn1)Cc1cscc1)C.Cl,non-mutagenic @@ -7435,7 +7454,7 @@ Clc1ccc(cc1)O[C@@H](C(=O)C(C)(C)C)n1cncc1,non-mutagenic OC[C@H]1O[C@H](C[C@H]1O)n1cc(CC)c(=O)[nH]c1=O,non-mutagenic OC(=O)[C@H](c1ccc(cc1)Oc1nccs1)C,non-mutagenic CCNCC#CC(OC(=O)[C@@](c1ccccc1)(C1CCCCC1)O)(C)C.Cl,non-mutagenic -C=C[C@H]1CN2CC[C@H]1C[C@H]2[C@@H](c1ccnc2c1cc(OC)cc2)O.Cl.Cl,non-mutagenic +C=C[C@H]1C[N@@]2CC[C@H]1C[C@H]2[C@@H](c1ccnc2c1cc(OC)cc2)O.Cl.Cl,non-mutagenic OCCNc1ccc(cc1)/N=N/c1ccc(cc1)NCCO,mutagenic CC(=C[C@H]1[C@@H](C1(C)C)C(=O)OCc1cccc(c1)Oc1ccccc1)C,mutagenic CC[C@H](OS(=O)(=O)C)C,mutagenic @@ -7631,6 +7650,7 @@ CC(=O)OCc1ccccc1/N=N/c1ccc(cc1)N(C)C,mutagenic COc1nsc2c1cccc2OC[C@H]1CO1,mutagenic CNNC,non-mutagenic [N-]=[N+]=Nc1ccc(cc1)Nc1c2ccccc2nc2c1cccc2,mutagenic +N#C[C@@H]1COCC[N@]1[C@H]1C[C@@H](O[C@H]([C@H]1O)C)O[C@H]1C[C@@](O)(Cc2c1c(O)c1c(c2O)C(=O)c2c(C1=O)c(OC)ccc2)C(=O)CO,non-mutagenic C[C@@H](c1ccccc1)N(C)C,non-mutagenic OC(=O)c1cn2[C@@H](C)COc3c2c(c1=O)cc(c3C1(N)CC1)F,non-mutagenic CCc1cccc2c1[nH]c1c2CCO[C@@]1(C)CC,non-mutagenic @@ -7722,6 +7742,7 @@ Sc1ncnc2c1[nH]cn2.O,mutagenic SCCC(=O)N1[C@@H](CS[C@H]1c1ccccc1O)C(=O)O,non-mutagenic COc1cc(ccc1OC)C[C@H]1CO1,mutagenic CCCCOc1ccc2c(n1)c(NCCCNCCCl)c1c(n2)cc(cc1)Cl.Cl.Cl.O,mutagenic +CC(CCC[C@H]([C@@H]1CC[C@@H]2[C@]1(C)CC[C@H]1[C@H]2C[C@H]2[C@@]3([C@]1(C)CC[C@@H](C3)OC(=O)C)[N@@]2N1C(=O)c2c(C1=O)cccc2)C)C,mutagenic OS(=O)(=O)OCc1c2ccccc2c2c3c1ccc1c3c(cc2)ccc1.[Na+],mutagenic C[n+]1c2ccccc2cc2c1cccc2N.Cl,mutagenic CCCn1cc2c3c1cccc3[C@@H]1[C@@H](C2)N(C#N)C[C@@H](C1)C,mutagenic @@ -7843,7 +7864,7 @@ O=[P@@]1(OCc2c(O1)cccc2)Oc1ccccc1,mutagenic COC(=O)C1=C(CC)[C@@H](OC1=O)C,mutagenic Fc1ccc(cc1)Cn1c(nc2c1cccc2)N1CCC(CC1)N(c1nccc(=O)[nH]1)C,non-mutagenic Oc1ccc(cc1)[C@@H]1CC(=O)c2c(O1)cc(cc2O)O,non-mutagenic -C=C[C@H]1CN2CC[C@H]1C[C@@H]2[C@H](c1ccnc2c1cc(OC)cc2)O,non-mutagenic +C=C[C@H]1C[N@@]2CC[C@H]1C[C@@H]2[C@H](c1ccnc2c1cc(OC)cc2)O,non-mutagenic Nc1cc(N)c(cc1/N=N/c1ccccc1C)C,mutagenic 
CC[S@@](=O)CCSP(=O)(OC)OC,mutagenic c1cc2[C@@H]3O[C@@H]3c3c2c(c1)c1cc2ccccc2cc1c3,mutagenic @@ -7913,13 +7934,14 @@ CNC(=O)Oc1ccc(cc1)c1ccccc1,mutagenic C[C@@H]1CCC[C@@H](N1CCC[C@](c1ccccn1)(c1ccccc1)O)C.Cl,non-mutagenic BrC[C@H](CO[P@@](=O)(OC[C@@H](CBr)Br)O)Br,mutagenic O=Nc1ccc2c(c1)cccc2,mutagenic +OC[C@H]1O[C@@H](O[C@H]2CC[C@]3(C(=CC[C@@H]4[C@@H]3CC[C@]3([C@H]4C[C@@H]4[C@@H]3[C@H](C)[C@@H]3[N@@]4C[C@H](CC3)C)C)C2)C)[C@H]([C@H]([C@H]1O)O[C@@H]1O[C@@H](CO)[C@H]([C@H]([C@H]1O)O)O)O[C@@H]1O[C@@H](C)[C@@H]([C@H]([C@H]1O)O)O,non-mutagenic c1cc2ccc3c4c2c(c1)ccc4ccc3,mutagenic NCC(=O)O.Cl,non-mutagenic COc1cc(/N=N/c2ccccc2)ccc1N,mutagenic C=CCOc1ccccc1OC[C@@H](CNC(C)C)O,non-mutagenic C/C(=N\O)/C(=O)C,non-mutagenic c1ccc2c(-c3ccccc3[C@@H]3[C@H]2N3)c1,mutagenic -COc1cc2c(cc1OC)N1[C@@H]3[C@@]42CCN2[C@H]4C[C@@H]4[C@H]3[C@H](CC1=O)OCC=C4C2,non-mutagenic +COc1cc2c(cc1OC)N1[C@@H]3[C@@]42CC[N@@]2[C@H]4C[C@@H]4[C@H]3[C@H](CC1=O)OCC=C4C2,non-mutagenic OCc1cc(ccc1O)C(=O)CN(C(C)(C)C)Cc1ccccc1.Cl,mutagenic Oc1cc(O)c2c(c1)oc(c(c2=O)O)c1ccc(c(c1)O)O.O.O,mutagenic ClCc1cccc(c1)/N=N/c1ccc(cc1)N(C)C,mutagenic @@ -7941,7 +7963,7 @@ O=C1CN(CCCN2CC(=O)NC(=O)C2)CC(=O)N1,non-mutagenic CC(c1ccc(c2-c(c1)c(C)cc2S(=O)(=O)O)C)C.[Na+],non-mutagenic ClC[C@@]12[C@H](Cl)[C@H]([C@H](C2(CCl)CCl)CC1(Cl)Cl)Cl,non-mutagenic OC(=O)COc1ccc(cc1Cl)Cl.CC(N)C,non-mutagenic -C=C[C@@H]1CN2CC[C@H]1C[C@H]2[C@@H](c1ccnc2c1cc(OC)cc2)O.Br,non-mutagenic +C=C[C@@H]1C[N@@]2CC[C@H]1C[C@H]2[C@@H](c1ccnc2c1cc(OC)cc2)O.Br,non-mutagenic O[C@H]1[C@H]2O[C@H]2c2c([C@@H]1O)ccc1c2cc2ccccc2c1C,mutagenic Nc1cc(N)c(cc1/N=N/c1ccc(cc1)c1ccc(cc1)/N=N/c1ccc(c(c1)C(=O)O)O)/N=N/c1ccc(cc1)S(=O)(=O)O,mutagenic COc1ccc2c(c1)[nH]c1c2CCN=C1C.Cl.O.O,non-mutagenic @@ -7972,11 +7994,12 @@ C[C@H](C(=O)O)Oc1cc(Cl)c(cc1Cl)Cl,non-mutagenic [N-]=[N+]=Nc1ccc2c(c1)nc1c(c2Nc2ccc(cc2OC)NS(=O)(=O)C)cccc1,mutagenic O[C@@H]1[C@@H](O)[C@@H](O[C@@H]1n1ccc(=N)[nH]c1=O)COP(=O)(O)O,non-mutagenic O=CC1=C[C@@]2(O)CC(C[C@H]2[C@]2([C@]1(C=O)C2)C)(C)C,mutagenic 
-C/C=C\1/CC(=C)[C@](O)(CO)C(=O)OCC2=CCN3[C@H]2[C@H](OC1=O)CC3,mutagenic +C/C=C\1/CC(=C)[C@](O)(CO)C(=O)OCC2=CC[N@@]3[C@H]2[C@H](OC1=O)CC3,mutagenic CC(=O)O[C@@H]1C(=O)O[C@H]2[C@H]1OC(=O)[C@@H]2OC(=O)C,non-mutagenic CC(=O)N/N=C/c1c[n+]([O-])c2c([n+]1[O-])cccc2,mutagenic OC(=O)CN(CC(=O)O)CCN(CC(=O)O)CC(=O)O.[Na+].[Na+].[Na+].[Na+],non-mutagenic O=C1NC(=O)[C@](N1)(c1ccc(cc1)O)c1ccccc1,non-mutagenic +C1C[N@@]2CC[N@]1CC2,non-mutagenic BrC[C@H]([C@H](O[P@](=O)(O[C@@H]([C@@H](CBr)Br)C)O[C@@H]([C@@H](CBr)Br)C)C)Br,mutagenic Nc1ccc2c(c1)cc1c(c2)cccc1,mutagenic Br/C=C(\c1ccc(cc1Cl)Cl)/OP(=O)(OC)OC,non-mutagenic @@ -7987,6 +8010,7 @@ ClCC=CCCl,mutagenic [O-][N+](=O)c1ccc2c(c1)ccc1c2ccc(c1)[N+](=O)[O-],mutagenic CCC(c1cc(N2Nc3c(N2)cccc3)c(c(c1)C(CC)(C)C)O)(C)C,non-mutagenic C[C@]1(OC1)c1ccc(cc1)c1ccccc1,mutagenic +OC[C@H]1O[C@@H](O[C@H]2CC[C@]3(C(=CC[C@H]4[C@H]3CC[C@]3([C@H]4C[C@@H]4[C@@H]3[C@H](C)[C@H]3[N@]4C[C@@H](CC3)C)C)C2)C)[C@@H]([C@H]([C@@H]1O[C@H]1O[C@@H](C)[C@@H]([C@H]([C@H]1O[C@H]1O[C@@H](C)[C@@H]([C@H]([C@H]1O)O)O)O)O)O)O,non-mutagenic O[C@@H]1C[C@@]23[C@](C1)(O3)C=CC=C2,non-mutagenic COc1ccccc1C[C@H]1CO1,mutagenic ClCC(=O)O[C@H](P(=O)(OC)OC)C(Cl)(Cl)Cl,non-mutagenic @@ -8041,6 +8065,7 @@ CCCCCCCCCCCC(=O)OCCS(=O)(=O)O.[Na+],non-mutagenic C[C@@H](C(C)(C)C)O[P@@](=O)(Cl)C,non-mutagenic OC(=O)/C=C\C(=O)O.COc1ccc(cc1)CN(c1ccccn1)CCN(C)C,non-mutagenic Oc1ccc(cc1)/C=N/n1nnc2c(c1=O)[nH]c1c2cccc1,mutagenic +C1[N@@]2C[N@@]3C[N@]1C[N@](C2)C3,mutagenic CC(=O)[C@@H]1C(=O)C=C2[C@](C1=O)(C)c1c(O)c(C)c(c(c1O2)C(=O)C)O,non-mutagenic O=C1C(=O)[C@]2(C([C@@H]1CC2)(C)C)C,non-mutagenic CCC/C=C/C(=O)O[C@@H]1C(C)(C)C[C@@H]2[C@]1(O)C=C(C=O)[C@@]13[C@@]2(C1)C(=O)O[C@@H]3O,mutagenic @@ -8065,7 +8090,7 @@ CN(c1ccc(cc1)/C(=C\1/C=CC(=[N+](C)C)C=C1)/c1c2ccc(cc2cc(c1O)S(=O)(=O)O)S(=O)(=O) Cc1cc(ccc1/N=N/c1ccc2c(c1O)c(N)c(cc2S(=O)(=O)O)S(=O)(=O)O)c1ccc(c(c1)C)/N=N/c1ccc2c(c1O)c(N)c(cc2S(=O)(=O)O)S(=O)(=O)O,mutagenic Nc1cc(cc2c1c(O)c(c(c2)S(=O)(=O)O)/N=N/c1ccccc1)S(=O)(=O)O,non-mutagenic 
Cc1ccc(c(c1)C)/N=N/c1c2ccc(cc2cc(c1O)S(=O)(=O)O)S(=O)(=O)O,mutagenic -Cl/C=C/C[N@+]12CN3CN(C2)CN(C1)C3,mutagenic +Cl/C=C/C[N@+]12C[N@]3C[N@@](C2)C[N@@](C1)C3,mutagenic ClCCN(CCCl)CCC[C@H](Nc1c2cc(OC)ccc2nc2c1ccc(c2)Cl)C,mutagenic Oc1ccc(cc1)/N=N/c1ccc(cc1)c1ccc(cc1)/N=N/c1c(O)c2c(cc1S(=O)(=O)O)cc(c(c2N)/N=N/c1ccc(cc1)[N+](=O)[O-])S(=O)(=O)O,mutagenic ClCCCN(C)C,mutagenic @@ -8077,7 +8102,7 @@ Cc1cc(ccc1/N=N/c1c(O)c2c(N)cc(cc2cc1S(=O)(=O)O)S(=O)(=O)O)c1ccc(c(c1)C)/N=N/c1c( [N-]=[N+]=CC(=O)OC[C@@H](C(=O)O)N,mutagenic O=c1[nH]ncc2c1cccc2,non-mutagenic CC(CCC[C@H]([C@@H]1CC[C@@H]2[C@]1(C)CC[C@H]1[C@H]2C[C@H]2[C@@]3([C@]1(C)CC[C@@H](C3)O)O2)C)C,mutagenic -C/C=C\1/C[C@@H](C)[C@@](C)(O)C(=O)OCC2=CCN3[C@H]2[C@@H](OC1=O)CC3,non-mutagenic +C/C=C\1/C[C@@H](C)[C@@](C)(O)C(=O)OCC2=CC[N@@]3[C@H]2[C@@H](OC1=O)CC3,non-mutagenic Nc1[nH]c(=O)c2c(n1)[nH]nn2,non-mutagenic CCCCCCCCCCCCCCCC(=O)OC[C@@H]([C@@H]1OC(=O)C(=C1O)O)O,non-mutagenic ClCCN(c1ccc(cc1)C[C@@H](C(=O)O)N)CCCl,mutagenic @@ -8089,9 +8114,11 @@ CC(CC(=O)O[C@H]1C[C@@]2(OC(=O)C)[C@@H](C=C1C)O[C@@H]1[C@]3([C@@]2(C)[C@H](OC(=O) CCCCCC[C@@H]([C@H]1C2=C(C[C@@H](CC3=C([C@H]1O)C(=O)OC3=O)[C@H]([C@@H]1CC=CC(=O)O1)O)C(=O)OC2=O)O,non-mutagenic CC(=O)OC[C@@]12CCC(=C[C@H]1O[C@@H]1[C@@]3([C@@]2(C)[C@@H](OC(=O)C)[C@@H]1O)CO3)C,non-mutagenic OC[C@@]12[C@H](C=C(C(=O)[C@@H]1O)C)O[C@@H]1[C@@]3([C@@]2(C)[C@H](O)[C@H]1O)OC3,non-mutagenic +COc1ccc2c(c1)[nH]c1c2CC[N@@]2[C@@H]1C[C@H]1[C@H](C2)C[C@H]([C@@H]([C@H]1C(=O)OC)OC)OC(=O)/C=C/c1cc(OC)c(c(c1)OC)OC,non-mutagenic ClC[C@@H]1[C@H](CCl)[C@@]2(C([C@@]1(Cl)C(=C2Cl)Cl)(Cl)Cl)Cl,non-mutagenic Clc1ccc2c(c1)C(=NC=C1N2C=NN1)c1ccccc1,non-mutagenic C=C[C@@H]1C[C@@H]2C[C@H]1C=C2,non-mutagenic +O=C1O[C@H]2CC[N@]3[C@@H]2C(=CC3)COC(=O)[C@]([C@]([C@H]1C)(C)O)(C)O,non-mutagenic O=C(O[C@@H]1C[C@@](O)(C[C@H]([C@H]1O)O)C(=O)O)/C=C/c1ccc(c(c1)O)O,non-mutagenic Brc1cccc2c1cc1ccc3c(c1c2)cccc3,mutagenic 
COc1cc(cc(c1O)OC)[C@@H]1[C@H]2C(=O)OC[C@@H]2[C@@H](c2c1cc1OCOc1c2)O[C@@H]1O[C@@H]2CO[C@H](O[C@H]2[C@@H]([C@H]1O)O)C,non-mutagenic @@ -8107,6 +8134,7 @@ CN[C@H]1CCc2c(-c3c1cc(=O)c(cc3)OC)c(OC)c(c(c2)OC)OC,non-mutagenic BrC[C@H]([C@H]([C@@H]([C@@H](CBr)O)O)O)O,mutagenic Oc1ccc2c(c1)CC[C@@H]1[C@H]2CC[C@]2([C@H]1C[C@H]([C@@H]2O)O)C,non-mutagenic Sc1ncnc2c1[nH]cn2,mutagenic +COc1ccc2c(c1)[nH]c1c2CC[N@@]2[C@@H]1C[C@H]1[C@H](C2)C[C@H]([C@@H]([C@H]1C(=O)OC)OC)OC(=O)c1cc(OC)c(c(c1)OC)OC,non-mutagenic OC[C@H]1O[C@H](C[C@H]1O)n1cc(C)c(=O)[nH]c1=O,non-mutagenic Fc1c[nH]c(=O)[nH]c1=O,non-mutagenic C[C@@H]1CC[C@@]2(OC1)O[C@H]1[C@H]([C@@H]2C)[C@@]2([C@@H](C1)[C@@H]1CC=C3[C@]([C@H]1CC2)(C)CC[C@@H](C3)O)C,non-mutagenic @@ -8135,7 +8163,7 @@ C#C[C@@]1(O)CC[C@@H]2[C@]1(C)CC[C@H]1[C@H]2CCC2=CC(=O)CC[C@H]12,non-mutagenic CO[C@H]([C@H]1Cc2cc3cc(O[C@H]4C[C@H](O[C@H]5C[C@H](O)[C@H]([C@@H](O5)C)OC)[C@@H]([C@@H](O4)C)OC(=O)C)c(c(c3c(c2C(=O)[C@H]1O[C@H]1C[C@H](O[C@@H]2C[C@H](O[C@@H]3O[C@@H](C)[C@H]([C@@](C3)(C)O)OC(=O)C)[C@@H]([C@@H](O2)C)O)[C@@H]([C@@H](O1)C)O)O)O)C)C(=O)[C@@H]([C@@H](O)C)O,non-mutagenic O=C1CC[C@]2(C(=C1)[C@@H](C)C[C@@H]1[C@@H]2CC[C@]2([C@H]1CC[C@]2(OC(=O)C)C(=O)C)C)C,non-mutagenic OC[C@@H]1C[C@H]([C@@H](O1)n1cnc2c1ncnc2N)O,mutagenic -O[C@H]1CC[C@]2(C(=CC[C@@H]3[C@@H]2CC[C@]2([C@@H]3C[C@@H]3[C@@H]2[C@H](C)[C@@H]2N3C[C@H](CC2)C)C)C1)C,non-mutagenic +O[C@H]1CC[C@]2(C(=CC[C@@H]3[C@@H]2CC[C@]2([C@@H]3C[C@@H]3[C@@H]2[C@H](C)[C@@H]2[N@@]3C[C@H](CC2)C)C)C1)C,non-mutagenic O[C@@H]1CC[C@]2([C@@H](C1)C[C@H]([C@@H]1[C@@H]2C[C@H](O)[C@]2([C@H]1CC[C@@H]2[C@@H](CCC(=O)O)C)C)O)C,non-mutagenic C/C/1=C\CCC(=C)C2C(CC1)C(C2)(C)C,non-mutagenic OC[C@H]([C@H]1OC(=O)C(=C1O)O)O,non-mutagenic @@ -8163,7 +8191,7 @@ OC(=O)CN(CC(=O)O)CCN(CC(=O)O)CC(=O)O.[Na+].[Na+],non-mutagenic NC(=N)c1ccc(cc1)OCCCCCOc1ccc(cc1)C(=N)N.OCCS(=O)(=O)O.OCCS(=O)(=O)O,non-mutagenic Nc1ccccc1.Cl,non-mutagenic OC(=O)CC[C@@H](C(=O)O)N.[Na+],non-mutagenic 
-OS(=O)(=O)O.COc1cc2N(C)[C@H]3[C@@]4(c2cc1[C@]1(C[C@H]2CN(CCc5c1[nH]c1c5cccc1)C[C@](C2)(O)CC)C(=O)OC)CCN1[C@H]4[C@@]([C@H]([C@]3(O)C(=O)OC)OC(=O)C)(CC)C=CC1,non-mutagenic +OS(=O)(=O)O.COc1cc2N(C)[C@H]3[C@@]4(c2cc1[C@]1(C[C@H]2C[N@@](CCc5c1[nH]c1c5cccc1)C[C@](C2)(O)CC)C(=O)OC)CC[N@@]1[C@H]4[C@@]([C@H]([C@]3(O)C(=O)OC)OC(=O)C)(CC)C=CC1,non-mutagenic OC(=O)O.[Na+],non-mutagenic N/N=c/1\sc2c(n1C)cccc2.Cl,mutagenic COc1c2N(C)[C@@H]3[C@](c2cc(c1OC)Cl)(O)[C@H]([C@@]12N3C(=O)[C@@](C)(SS1)N(C2=O)C)O,non-mutagenic @@ -8178,7 +8206,7 @@ OC(=O)C1=NN(C(=O)[C@H]1/N=N/c1ccc(cc1)S(=O)(=O)O)c1ccc(cc1)S(=O)(=O)O.[Na+].[Na+ CSCC[C@@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N)Cc1ccccc1)CC(=O)O)NC(=O)[C@H](Cc1c[nH]c2c1cccc2)NC(=O)OCc1ccccc1,non-mutagenic COc1cc(ccc1N)c1ccc(c(c1)OC)N.Cl.Cl,mutagenic CN([C@H]1C(=C(C(=O)N)C(=O)[C@]2([C@H]1[C@H](O)[C@H]1C(=C2O)C(=O)c2c([C@]1(C)O)cccc2O)O)O)C.Cl,non-mutagenic -OS(=O)(=O)O.O=CN1c2cc(OC)c(cc2[C@]23[C@H]1[C@@](O)(C(=O)OC)[C@H](OC(=O)C)[C@]1([C@@H]3N(CC2)CC=C1)CC)[C@]1(C[C@H]2CN(CCc3c1[nH]c1c3cccc1)C[C@](C2)(O)CC)C(=O)OC,non-mutagenic +OS(=O)(=O)O.O=CN1c2cc(OC)c(cc2[C@]23[C@H]1[C@@](O)(C(=O)OC)[C@H](OC(=O)C)[C@]1([C@@H]3[N@@](CC2)CC=C1)CC)[C@]1(C[C@H]2C[N@@](CCc3c1[nH]c1c3cccc1)C[C@](C2)(O)CC)C(=O)OC,non-mutagenic COc1cccc2c1C(=O)c1c(C2=O)c(O)c2c(c1O)[C@@H](O[C@H]1C[C@H](N)[C@H]([C@@H](O1)C)O)C[C@](C2)(O)C(=O)C,mutagenic CCNc1nc(NC(C)C)[nH]c(=O)n1,non-mutagenic O[C@@H]1[C@@H]2C[C@@]34[C@@H]5[C@H]1[C@@]1(C(=C2O)C(=O)c2c(C1=O)c(O)c(cc2O)C)C[C@@H]([C@H]5O)C(=C3C(=O)c1c(C4=O)c(O)c(cc1O)C)O,non-mutagenic @@ -8201,7 +8229,7 @@ NC(=N)C(/N=N/C(C(=N)N)(C)C)(C)C.Cl.Cl,mutagenic ClCC[N+](CCCl)(C)[O-].Cl,mutagenic N/N=C/1\N=NC=C2[C@@H]1C=CC=C2.Cl,mutagenic CCNC(=O)CC[C@@H](C(=O)O)N,non-mutagenic -CC[C@H]1CN2CCc3c([C@H]2C[C@H]1C[C@H]1NCCc2c1cc(OC)c(c2)OC)cc(c(c3)OC)OC.Cl.Cl,non-mutagenic +CC[C@H]1C[N@]2CCc3c([C@H]2C[C@H]1C[C@H]1NCCc2c1cc(OC)c(c2)OC)cc(c(c3)OC)OC.Cl.Cl,non-mutagenic [O-][N+](=O)c1ccc(s1)NC(=O)NCCCl,mutagenic 
CCN(CCCN(C1Cc2c(C1)cccc2)c1ccccc1)CC.Cl,non-mutagenic O[C@H]1[C@H](O)[C@H](O[C@H]1n1ccc(=O)[nH]c1=O)COP(=O)(O)O.[Na+].[Na+],non-mutagenic @@ -8220,10 +8248,10 @@ OC[C@H]1O[C@H]([C@@H]([C@@H]1O)O)n1cnc2c1ncnc2NCc1ccc(cc1)[N+](=O)[O-],non-mutag OC(=O)[C@](Cc1ccc(c(c1)O)O)(N)C.O,non-mutagenic OC[C@H]1O[C@H](C[C@H]1O)n1cc(C=O)c(=O)[nH]c1=O,non-mutagenic O[C@@H]1[C@H](O)[C@H](O[C@H]1n1cnc2c1nc[nH]c2=O)COP(=O)(O)O.[Na+].[Na+],non-mutagenic -C/C=C/1\CC(=C)[C@@](C)(O)C(=O)OCC2=CCN3[C@H]2[C@@H](OC1=O)CC3,mutagenic +C/C=C/1\CC(=C)[C@@](C)(O)C(=O)OCC2=CC[N@@]3[C@H]2[C@@H](OC1=O)CC3,mutagenic NCC(=O)Nc1ccccc1.Cl,mutagenic O=c1[nH]cnc2c1cccc2,non-mutagenic -CCC1=C(C[C@H]2NCCc3c2cc(OC)c(c3)OC)C[C@@H]2N(C1)CCc1c2cc(c(c1)OC)OC,non-mutagenic +CCC1=C(C[C@H]2NCCc3c2cc(OC)c(c3)OC)C[C@@H]2[N@@](C1)CCc1c2cc(c(c1)OC)OC,non-mutagenic OP(=O)(O)O.OP(=O)(O)O.CCN(CCC[C@H](Nc1ccnc2c1ccc(c2)Cl)C)CC,mutagenic CC(C1=CC2=CC[C@H]3[C@]([C@H]2CC1)(C)CCC[C@@]3(C)C(=O)O)C,non-mutagenic Oc1ccc2c(c1)Oc1c(C32OC(=O)c2c3cccc2)ccc(c1)O.[Na+].[Na+],non-mutagenic diff --git a/ext/lazar/extconf.rb b/ext/lazar/extconf.rb index 7cd85e0..aa031e5 100644 --- a/ext/lazar/extconf.rb +++ b/ext/lazar/extconf.rb @@ -10,7 +10,7 @@ programs.each do |program| abort "Please install #{program} on your system." unless find_executable program end -abort "Please install Rserve on your system. Execute 'install.packages('Rserve')' in a R console running as root ('sudo R')." unless `R CMD Rserve --version`.match(/^Rserve/) +abort "Please install the latest Rserve version on your system (the CRAN version is outdated). Execute 'install.packages('Rserve',,'http://www.rforge.net/')' in a R console running as root ('sudo R')." 
unless `R CMD Rserve --version`.match(/^Rserve v1.8/) # install R packages r_dir = File.join main_dir, "R" diff --git a/ext/lazar/rinstall.R b/ext/lazar/rinstall.R index 98e612d..17c2e61 100644 --- a/ext/lazar/rinstall.R +++ b/ext/lazar/rinstall.R @@ -1,12 +1,14 @@ libdir = commandArgs(trailingOnly=TRUE)[1] repo = "https://stat.ethz.ch/CRAN/" -#install.packages("Rserve",lib=libdir,repos=repo,dependencies=TRUE) -install.packages("stringi",lib=libdir,repos=repo,dependencies=TRUE); -install.packages("iterators",lib=libdir,repos=repo,dependencies=TRUE); -install.packages("foreach",lib=libdir,repos=repo,dependencies=TRUE); -install.packages("gridExtra",lib=libdir,repos=repo,dependencies=TRUE); -install.packages("ggplot2",lib=libdir,repos=repo,dependencies=TRUE); -install.packages("pls",lib=libdir,repos=repo,dependencies=TRUE); -install.packages("randomForest",lib=libdir,repos=repo,dependencies=TRUE); -install.packages("caret",lib=libdir,repos=repo,dependencies=TRUE); -install.packages("doMC",lib=libdir,repos=repo,dependencies=TRUE); +#install.packages("Rserve",lib=libdir,repos=) +# dependencies=TRUE installs unnecessary Suggests packages. The default, NA, means c("Depends", "Imports", "LinkingTo"). 
+install.packages("caret",lib=libdir,repos=repo); +install.packages("stringi",lib=libdir,repos=repo); +install.packages("iterators",lib=libdir,repos=repo); +install.packages("labeling",lib=libdir,repos=repo); +install.packages("foreach",lib=libdir,repos=repo); +install.packages("gridExtra",lib=libdir,repos=repo); +install.packages("ggplot2",lib=libdir,repos=repo); +install.packages("pls",lib=libdir,repos=repo); +install.packages("randomForest",lib=libdir,repos=repo); +install.packages("doMC",lib=libdir,repos=repo); diff --git a/lib/dataset.rb b/lib/dataset.rb index df17569..596c53c 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -96,8 +96,14 @@ module OpenTox # Get nominal and numeric prediction features # @return [Array] - def prediction_features - features.select{|f| f._type.match("Prediction")} + def prediction_feature + features.select{|f| f._type.match(/Prediction$/)}.first + end + + # Get supporting nominal and numeric prediction features (class probabilities, prediction interval) + # @return [Array] + def prediction_supporting_features + features.select{|f| f.is_a?(LazarPredictionProbability) or f.is_a?(LazarPredictionInterval)} end # Get nominal and numeric merged features @@ -259,7 +265,7 @@ module OpenTox feature_names = table.shift.collect{|f| f.strip} raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size - if feature_names[0] =~ /ID/i # check ID column + if feature_names[0] !~ /SMILES|InChI/i # check ID column original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => feature_names.shift) else original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => "LineID") @@ -343,30 +349,52 @@ module OpenTox # Serialisation - # Convert dataset to csv format + # Convert lazar prediction dataset to csv format # @return [String] - def to_csv #inchi=false - CSV.generate() do |csv| - - compound = substances.first.is_a? 
Compound - f = features - original_id_features - original_smiles_features - warnings_features - header = original_id_features.collect{|f| "ID "+Dataset.find(f.dataset_id).name} - header += original_smiles_features.collect{|f| "SMILES "+Dataset.find(f.dataset_id).name} if compound - compound ? header << "Canonical SMILES" : header << "Name" - header += f.collect{|f| f.name} - header += warnings_features.collect{|f| "Warnings "+Dataset.find(f.dataset_id).name} - csv << header - - substances.each do |substance| - row = original_id_features.collect{|f| values(substance,f).join(" ")} - row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound - compound ? row << substance.smiles : row << substance.name - row += f.collect{|f| values(substance,f).join(" ")} - row += warnings_features.collect{|f| values(substance,f).uniq.join(" ")} + def to_prediction_csv + + compound = substances.first.is_a? Compound + header = ["ID"] + header << "Original SMILES" if compound + compound ? header << "Canonical SMILES" : header << "Name" + header << "Prediction" if prediction_feature + header << "Confidence" if confidence_feature + header += prediction_supporting_features.collect{|f| f.name} + header << "Measurements" + csv = [header] + + substances.each do |substance| + row = original_id_features.collect{|f| values(substance,f).join(" ")} + row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound + compound ? 
row << substance.smiles : row << substance.name + row << values(substance,prediction_feature).join(" ") + row << values(substance,confidence_feature).join(" ") + row += prediction_supporting_features.collect{|f| values(substance,f).join(" ")} + row << values(substance,bioactivity_features[0]).join(" ") + csv << row + end + csv.collect{|r| r.join(",")}.join("\n") + end + + # Convert dataset into csv formatted training data + # @return [String] + def to_training_csv + + p features + p bioactivity_features + header = ["Canonical SMILES"] + header << bioactivity_features[0].name + csv = [header] + + substances.each do |substance| + nr_activities = values(substance,bioactivity_features.first).size + (0..nr_activities-1).each do |n| # new row for each value + row = [substance.smiles] + row << values(substance,bioactivity_features[0])[n] csv << row end - end + csv.collect{|r| r.join(",")}.join("\n") end # Convert dataset to SDF format @@ -396,7 +424,6 @@ module OpenTox predictions = {} substances.each do |s| predictions[s] ||= {} - prediction_feature = prediction_features.first predictions[s][:value] = values(s,prediction_feature).first #predictions[s][:warnings] = [] #warnings_features.each { |w| predictions[s][:warnings] += values(s,w) } diff --git a/lib/download.rb b/lib/download.rb index f17d060..2546dc4 100644 --- a/lib/download.rb +++ b/lib/download.rb @@ -122,7 +122,6 @@ module OpenTox # Combine mutagenicity data from Kazius, Hansen and EFSA and download into the data folder def self.mutagenicity $logger.debug "Mutagenicity" - # TODO add download/conversion programs to lazar dependencies hansen_url = "http://doc.ml.tu-berlin.de/toxbenchmark/Mutagenicity_N6512.csv" kazius_url = "http://cheminformatics.org/datasets/bursi/cas_4337.zip" efsa_url = "https://data.europa.eu/euodp/data/storage/f/2017-07-19T142131/GENOTOX data and dictionary.xls" @@ -185,7 +184,7 @@ module OpenTox map = {"mutagen" => "mutagenic", "nonmutagen" => "non-mutagenic"} dataset = Dataset.merge 
datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,nil,map], keep_original_features: false, remove_duplicates: true dataset.merged_features.first.name = "Mutagenicity" - File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv"),"w+"){|f| f.puts dataset.to_csv} + File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv"),"w+"){|f| f.puts dataset.to_training_csv} meta = { :species => "Salmonella typhimurium", :endpoint => "Mutagenicity", diff --git a/lib/feature.rb b/lib/feature.rb index 72c26d7..296a174 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -18,6 +18,9 @@ module OpenTox # Confidence class Confidence < Feature field :dataset_id, type: BSON::ObjectId + def name + "Confidence" + end end # Categorical variables @@ -66,13 +69,13 @@ module OpenTox field :model_id, type: BSON::ObjectId field :training_feature_id, type: BSON::ObjectId def name - "#{self[:name]} Prediction" + "Prediction: #{self[:name]}" end end class LazarPredictionProbability < NominalLazarPrediction def name - "probability(#{self[:name]})" + "Probability: #{self[:name]}" end end @@ -81,13 +84,13 @@ module OpenTox field :model_id, type: BSON::ObjectId field :training_feature_id, type: BSON::ObjectId def name - "#{self[:name]} Prediction" + "Prediction: #{self[:name]}" end end class LazarPredictionInterval < NumericLazarPrediction def name - "prediction_interval_#{self[:name]}" + "#{self[:name].capitalize} prediction interval" end end diff --git a/lib/lazar.rb b/lib/lazar.rb index 2a3f749..e77de9d 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -17,19 +17,22 @@ raise "Incorrect lazar environment variable LAZAR_ENV '#{ENV["LAZAR_ENV"]}', ple ENV["MONGOID_ENV"] = ENV["LAZAR_ENV"] ENV["RACK_ENV"] = ENV["LAZAR_ENV"] # should set sinatra environment +# CH: this interferes with /etc/hosts on my machine # search for a central mongo database in use # 
http://opentox.github.io/installation/2017/03/07/use-central-mongodb-in-docker-environment -CENTRAL_MONGO_IP = `grep -oP '^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}(?=.*mongodb)' /etc/hosts`.chomp +# CENTRAL_MONGO_IP = `grep -oP '^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}(?=.*mongodb)' /etc/hosts`.chomp Mongoid.load_configuration({ :clients => { :default => { :database => ENV["LAZAR_ENV"], - :hosts => (CENTRAL_MONGO_IP.blank? ? ["localhost:27017"] : ["#{CENTRAL_MONGO_IP}:27017"]), + #:hosts => (CENTRAL_MONGO_IP.blank? ? ["localhost:27017"] : ["#{CENTRAL_MONGO_IP}:27017"]), + :hosts => ["localhost:27017"] } } }) Mongoid.raise_not_found_error = false # return nil if no document is found -$mongo = Mongo::Client.new("mongodb://#{(CENTRAL_MONGO_IP.blank? ? "127.0.0.1" : CENTRAL_MONGO_IP)}:27017/#{ENV['LAZAR_ENV']}") +#$mongo = Mongo::Client.new("mongodb://#{(CENTRAL_MONGO_IP.blank? ? "127.0.0.1" : CENTRAL_MONGO_IP)}:27017/#{ENV['LAZAR_ENV']}") +$mongo = Mongo::Client.new("mongodb://127.0.0.1:27017/#{ENV['LAZAR_ENV']}") $gridfs = $mongo.database.fs # Logger setup diff --git a/lib/model.rb b/lib/model.rb index cbfefe3..05cd113 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -286,14 +286,14 @@ module OpenTox end if threshold == algorithms[:similarity][:min].first if prediction[:warnings].empty? - prediction[:confidence] = "High (close to bioassay results)" + prediction[:confidence] = "Similar to bioassay results" return prediction else # try again with a lower threshold prediction[:warnings] << "Lowering similarity threshold to #{algorithms[:similarity][:min].last}." predict_substance substance, algorithms[:similarity][:min].last, prediction end elsif threshold < algorithms[:similarity][:min].first - prediction[:confidence] = "Low (lower than bioassay results)" + prediction[:confidence] = "Lower than bioassay results" return prediction end end @@ -348,9 +348,9 @@ module OpenTox end elsif prediction_feature.is_a? 
NumericBioActivity f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id) - prediction_interval = {} + prediction_interval = [] ["lower","upper"].each do |v| - prediction_interval[v] = LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id) + prediction_interval << LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id) end end diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 8a8970e..d603294 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -18,7 +18,7 @@ module OpenTox if pred[:value] == v confusion_matrix[:all][i][i] += 1 self.nr_predictions[:all] += 1 - if pred[:confidence].match(/High/i) + if pred[:confidence].match(/Similar/i) confusion_matrix[:confidence_high][i][i] += 1 self.nr_predictions[:confidence_high] += 1 elsif pred[:confidence].match(/Low/i) @@ -32,7 +32,7 @@ module OpenTox if pred[:value] == v confusion_matrix[:all][i][(i+1)%2] += 1 self.nr_predictions[:all] += 1 - if pred[:confidence].match(/High/i) + if pred[:confidence].match(/Similar/i) confusion_matrix[:confidence_high][i][(i+1)%2] += 1 self.nr_predictions[:confidence_high] += 1 elsif pred[:confidence].match(/Low/i) diff --git a/test/classification-model.rb b/test/classification-model.rb index 79ccb98..c41b211 100644 --- a/test/classification-model.rb +++ b/test/classification-model.rb @@ -84,13 +84,19 @@ class ClassificationModelTest < MiniTest::Test assert_kind_of Dataset, result assert_equal 7, result.features.size assert_equal 85, result.compounds.size - prediction_feature = result.prediction_features.first + prediction_feature = result.prediction_feature assert_equal ["carcinogenic"], result.values(result.compounds[1], prediction_feature) assert_equal ["non-carcinogenic"], 
result.values(result.compounds[5], prediction_feature) assert_nil result.predictions[result.compounds.first][:value] assert_equal "carcinogenic", result.predictions[result.compounds[1]][:value] assert_equal 0.27, result.predictions[result.compounds[1]][:probabilities]["non-carcinogenic"].round(2) - assert_match /High/i, result.predictions[result.compounds[1]][:confidence] + assert_match /Similar/i, result.predictions[result.compounds[1]][:confidence] + csv = result.to_prediction_csv + rows = csv.split("\n") + assert_equal "ID,Original SMILES,Canonical SMILES,Prediction,Confidence,Probability: carcinogenic,Probability: non-carcinogenic,Measurements", rows[0] + items = rows[2].split(",") + assert_equal "carcinogenic", items[3] + assert_equal 0.27, items[6].to_f.round(2) # probabilities end def test_carcinogenicity_rf_classification diff --git a/test/dataset.rb b/test/dataset.rb index 8e230e0..b978512 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -137,7 +137,6 @@ class DatasetTest < MiniTest::Test d = Dataset.from_csv_file File.join(DATA_DIR,"batch_prediction_#{type}_small.csv") assert_equal Dataset, d.class refute_nil d.id - dataset = Dataset.find d.id assert_equal 3, d.compounds.size end end @@ -175,10 +174,16 @@ class DatasetTest < MiniTest::Test datasets = [hansen,efsa,kazius] map = {"mutagen" => "mutagenic", "nonmutagen" => "non-mutagenic"} dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,nil,map], keep_original_features: true, remove_duplicates: true - assert_equal 8281, dataset.compounds.size - assert_equal 9, dataset.features.size + csv = dataset.to_training_csv + rows = csv.split("\n") + header = rows.shift + assert_equal "Canonical SMILES,Mutagenicity",header + values = rows.collect{|r| r.split(",")[1]}.uniq + assert_equal 2, values.size + assert_equal 8290, dataset.compounds.size c = Compound.from_smiles("C/C=C/C=O") assert_equal ["mutagenic"], 
dataset.values(c,dataset.merged_features.first) + assert_equal 9, dataset.features.size end # serialisation @@ -203,6 +208,13 @@ class DatasetTest < MiniTest::Test end # special cases/details + + def test_daphnia_import + d = Dataset.from_csv_file File.join(File.dirname(__FILE__),"..","data", "Acute_toxicity-Daphnia_magna.csv") + assert 3, d.features.size + assert 546, d.compounds.size + puts d.to_training_csv + end def test_dataset_accessors d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv" diff --git a/test/regression-model.rb b/test/regression-model.rb index 7f667dc..3b41171 100644 --- a/test/regression-model.rb +++ b/test/regression-model.rb @@ -173,13 +173,26 @@ class LazarRegressionTest < MiniTest::Test model = Model::Lazar.create training_dataset: training_dataset result = model.predict training_dataset assert_kind_of Dataset, result - assert_equal 6, result.features.size + assert_equal 8, result.features.size assert_equal 88, result.compounds.size assert_equal [1.95], result.values(result.compounds.first, result.bioactivity_features[0]).collect{|v| v.round(2)} assert_equal [1.37], result.values(result.compounds[6], result.bioactivity_features[0]).collect{|v| v.round(2)} - assert_equal [1.79], result.values(result.compounds[6], result.prediction_features[0]).collect{|v| v.round(2)} + assert_equal [1.79], result.values(result.compounds[6], result.prediction_feature).collect{|v| v.round(2)} assert_equal [1.84,1.73], result.values(result.compounds[7], result.bioactivity_features[0]).collect{|v| v.round(2)} assert_match /Low/i, result.predictions[result.compounds[6]][:confidence] + csv = result.to_prediction_csv + rows = csv.split("\n") + assert_equal "ID,Original SMILES,Canonical SMILES,Prediction,Confidence,Lower prediction interval,Upper prediction interval,Measurements", rows[0] + items = rows[3].split(",") + # prediction and measurement within prediction interval + prediction = items[3].to_f + pi_low = items[5].to_f + pi_hi = items[6].to_f +
measurement = items[7].to_f + [prediction,measurement].each do |v| + assert(v > pi_low) + assert(v < pi_hi) + end end end -- cgit v1.2.3