separate csv serialisations for batch predictions and training data, repeated measure...
author Christoph Helma <helma@in-silico.ch>
Thu, 20 Jun 2019 20:01:50 +0000 (22:01 +0200)
committer Christoph Helma <helma@in-silico.ch>
Thu, 20 Jun 2019 20:01:50 +0000 (22:01 +0200)
12 files changed:
data/Mutagenicity-Salmonella_typhimurium.csv
ext/lazar/extconf.rb
ext/lazar/rinstall.R
lib/dataset.rb
lib/download.rb
lib/feature.rb
lib/lazar.rb
lib/model.rb
lib/validation-statistics.rb
test/classification-model.rb
test/dataset.rb
test/regression-model.rb

index 0694a94..331de54 100644 (file)
@@ -696,7 +696,8 @@ Clc1cc(Cl)c(c(c1)S(=O)c1cc(Cl)cc(c1O)Cl)O,mutagenic
 O=C1C=C(C(=O)C(=C1)C1=CC(=O)C=C(C1=O)C(C)(C)C)C(C)(C)C,mutagenic
 OC(=O)Cc1ccccc1Cl,non-mutagenic
 C=CCCC(C=O)CC,non-mutagenic
-Nc1cnn(c(=O)c1Cl)c1ccccc1,non-mutagenic mutagenic
+Nc1cnn(c(=O)c1Cl)c1ccccc1,non-mutagenic
+Nc1cnn(c(=O)c1Cl)c1ccccc1,mutagenic
 COc1cccc2c1C(=O)c1c(C2=O)c(O)c2c(c1O)C(OC1CC(N)C(C(O1)C)O)CC(C2)(O)C(=O)C,mutagenic
 c1ccc2c(-c3ccccc3C3C2N3)c1,mutagenic
 c1ccc2c(c1)cc1c3c2[C@H]2O[C@H]2c3cc2c1cccc2,mutagenic
@@ -856,7 +857,8 @@ O[C@@H]1[C@H](O)[C@@H](O)CO[C@H]1N(c1ccc(cc1)[N+](=O)[O-])N=O,mutagenic
 OC(=O)C(CC(=O)c1cccc(c1N)O)N,non-mutagenic
 C=O,mutagenic
 O=NN1CC[C@H](C1)O,mutagenic
-Oc1ccccc1c1ccccc1,non-mutagenic mutagenic
+Oc1ccccc1c1ccccc1,non-mutagenic
+Oc1ccccc1c1ccccc1,mutagenic
 CC(=O)C1=C(O)C2N(C1=O)C(C1C2c2c[nH]c3c2c(C1)ccc3)(C)C,mutagenic
 [O-][N+](=O)C1=Cc2c3c1cccc3cc1c2c2ccccc2cc1,mutagenic
 c1cc2ccc3c4c2c(c1)ccc4nc1c3cccc1,mutagenic
@@ -1501,7 +1503,8 @@ CCCCOC(=O)c1ccc(cc1)O,non-mutagenic
 OC1C=Cc2c(C1O)ccc1c2cc2ccccc2c1[N+](=O)[O-],mutagenic
 O/N=C(/c1ccccc1)\N,mutagenic
 Clc1cc(N)c(c(c1)C(=O)O)Cl,mutagenic
-CCOP(=O)(O/C(=C\Cl)/c1ccc(cc1Cl)Cl)OCC,mutagenic non-mutagenic
+CCOP(=O)(O/C(=C\Cl)/c1ccc(cc1Cl)Cl)OCC,mutagenic
+CCOP(=O)(O/C(=C\Cl)/c1ccc(cc1Cl)Cl)OCC,non-mutagenic
 Nc1ccc(cc1)N=Nc1cccc(c1N)N,mutagenic
 CC1=C[C@]2(O[C@@H](C1)[C@@H](/C=C/[C@H]1CC[C@]3(O1)CC[C@@H]1[C@@H](O3)[C@H](O)C(=C)[C@H](O1)[C@H](C[C@@H]([C@H]1O[C@@]3(CCCCO3)CC[C@H]1C)C)O)C)O[C@@H](CC[C@@H]2O)C[C@](C(=O)O)(O)C,non-mutagenic
 OC(=O)C(Oc1ccc(cc1)[C@@H]1CC1(Cl)Cl)(C)C,non-mutagenic
@@ -1532,7 +1535,8 @@ ClCC(Cl)(Cl)Cl,non-mutagenic
 O=C(c1csc(c1)[N+](=O)[O-])Nc1ccccc1[N+](=O)[O-],mutagenic
 CC(=C)C=O,mutagenic
 c1ccc2c(c1)cc1c(c2)c2Oc2c2c1cccc2,mutagenic
-Clc1cc(Cl)cc(c1Oc1ccc(cc1)[N+](=O)[O-])Cl,mutagenic non-mutagenic
+Clc1cc(Cl)cc(c1Oc1ccc(cc1)[N+](=O)[O-])Cl,mutagenic
+Clc1cc(Cl)cc(c1Oc1ccc(cc1)[N+](=O)[O-])Cl,non-mutagenic
 CC[C@H](c1ccccc1O)C,non-mutagenic
 c1ccc(cc1)Cc1ccccc1OCC1CO1,mutagenic
 CC[n+]1c2ccccc2nc2c1cccc2,mutagenic
@@ -1708,7 +1712,8 @@ Oc1ccc2c(c1N=Nc1ccc(cc1)S(=O)(=O)O)c(cc(c2)S(=O)(=O)O)S(=O)(=O)O,non-mutagenic
 CCCCCC(=O)OC1(CCC2C1(C)CCC1C2CCC2=CC(=O)CCC12)C(=O)C,non-mutagenic
 O=NN1CCC[C@@H](C1)O,mutagenic
 ClCC(=O)c1ccc(cc1Cl)Cl,mutagenic
-[O-][N+](=O)c1ccc(cc1)CNc1[nH]cnc2-c1ncn2,mutagenic non-mutagenic
+[O-][N+](=O)c1ccc(cc1)CNc1[nH]cnc2-c1ncn2,mutagenic
+[O-][N+](=O)c1ccc(cc1)CNc1[nH]cnc2-c1ncn2,non-mutagenic
 Nc1ccc2c3c1-c1ccccc1-c3ccc2,mutagenic
 CC(=O)OCc1ccc(cc1)N=Nc1ccc(cc1)COC(=O)C,mutagenic
 BrCC(C(OP(=O)(OC(C(CBr)Br)C)OC(C(CBr)Br)C)C)Br,mutagenic
@@ -1757,7 +1762,8 @@ NCCN,mutagenic
 Nc1sc2c(n1)C1CCCNC1CC2,non-mutagenic
 O[C@@H]([C@@H](C(=O)O)O)C(=O)O,non-mutagenic
 OC[C@H]1O[C@@H](Oc2cc(O)cc3c2c(=O)c2c(o3)c(O)ccc2O)[C@@H]([C@H]([C@@H]1O)O)O,mutagenic
-c1scc(n1)c1nc2c([nH]1)cccc2,mutagenic non-mutagenic
+c1scc(n1)c1nc2c([nH]1)cccc2,mutagenic
+c1scc(n1)c1nc2c([nH]1)cccc2,non-mutagenic
 CCCCOc1ccc(cc1)CC(=O)NO,mutagenic
 [O-][N+](=O)c1nc2c([nH]1)cccc2,mutagenic
 Nc1ccc2c(n1)n1cccc(c1n2)C,mutagenic
@@ -2102,7 +2108,8 @@ N#Cc1cc(I)c(c(c1)[N+](=O)[O-])O,non-mutagenic
 OCCOc1ccccc1,non-mutagenic
 CC(OC(=O)COc1ccc(cc1Cl)Cl)C,non-mutagenic
 CCCCCCOC(=O)c1ccccc1,non-mutagenic
-COC(=O)c1cc(ccc1[N+](=O)[O-])Oc1ccc(cc1Cl)Cl,mutagenic non-mutagenic
+COC(=O)c1cc(ccc1[N+](=O)[O-])Oc1ccc(cc1Cl)Cl,mutagenic
+COC(=O)c1cc(ccc1[N+](=O)[O-])Oc1ccc(cc1Cl)Cl,non-mutagenic
 Nc1cc(Cl)c(c(c1)Cl)N,mutagenic
 CNC(=O)/C=C(/OP(=O)(OC)OC)\C,mutagenic
 COC(=C1C(=NC(=C([C@@H]1c1cccc(c1)[N+](=O)[O-])C(=O)OC/C=C/c1ccccc1)C)C)O,non-mutagenic
@@ -2538,7 +2545,8 @@ C=CC(=O)NC(CC(=O)C)(C)C,non-mutagenic
 CCOP(=S)(SCn1c(=O)oc2c1ccc(c2)Cl)OCC,non-mutagenic
 CCOP(=O)(SCCN(C(C)C)C(C)C)C,non-mutagenic
 COc1cc(CNC(=O)C(Br)C)ccc1O,mutagenic
-CN1CN(C)CSC1=S,mutagenic non-mutagenic
+CN1CN(C)CSC1=S,mutagenic
+CN1CN(C)CSC1=S,non-mutagenic
 [O-][N+](=O)c1cccc(c1C)N=[N+](c1cccc(c1C)[N+](=O)[O-])[O-],non-mutagenic
 OC(C(Cl)(Cl)Cl)(c1ccc(cc1)Cl)c1ccc(cc1)Cl,non-mutagenic
 C=CCN(CC=C)N=O,mutagenic
@@ -2680,7 +2688,8 @@ ONc1ccc(cc1C(C)(C)C)c1ccccc1,mutagenic
 O/N=C/c1ccccn1,non-mutagenic
 ClCCCCN(COC(=O)C)N=O,mutagenic
 COc1cc2O[C@@H]3[C@H](c2c2c1c1[C@@H](O)C[C@@H](c1c(=O)o2)O)C=CO3,mutagenic
-CNC(=O)Oc1cccc2c1cccc2,mutagenic non-mutagenic
+CNC(=O)Oc1cccc2c1cccc2,mutagenic
+CNC(=O)Oc1cccc2c1cccc2,non-mutagenic
 Nc1cc(C)c(c(c1)[N+](=O)[O-])N,mutagenic
 ClCc1ccc2c3c1ccc1c3c(cc2)cc2c1cccc2,non-mutagenic
 CC(=O)Nc1nc(NC(=O)C)nc(n1)c1ccc(o1)[N+](=O)[O-],mutagenic
@@ -2722,7 +2731,8 @@ Cc1ccc(c(c1)[N+](=O)[O-])C,mutagenic
 O=C1C=CC(=O)C=C1c1ccccc1,non-mutagenic
 O=C(c1ccccc1)N(OC(=O)C)OCc1ccc(cc1)C(C)(C)C,mutagenic
 CC(=O)Nc1scc(n1)c1scc(c1)[N+](=O)[O-],mutagenic
-CCCN(c1c(cc(cc1[N+](=O)[O-])C(F)(F)F)[N+](=O)[O-])CCC,mutagenic non-mutagenic
+CCCN(c1c(cc(cc1[N+](=O)[O-])C(F)(F)F)[N+](=O)[O-])CCC,mutagenic
+CCCN(c1c(cc(cc1[N+](=O)[O-])C(F)(F)F)[N+](=O)[O-])CCC,non-mutagenic
 OC(=O)c1ccco1,non-mutagenic
 [O-][N+](=O)OOC(=O)C,mutagenic
 Cc1ccccc1N=Nc1c(O)ccc2c1cccc2,mutagenic
@@ -3440,7 +3450,8 @@ CCc1ccccc1[N+](=O)[O-],non-mutagenic
 ClC(c1ccccc1)C(=O)Cl,mutagenic
 OC(=O)Cc1ccc(cc1)N,mutagenic
 CCCSC(=O)Cl,mutagenic
-[O-][N+](=O)NC(=N)NC,mutagenic non-mutagenic
+[O-][N+](=O)NC(=N)NC,mutagenic
+[O-][N+](=O)NC(=N)NC,non-mutagenic
 Oc1ccc2c(c1N=Nc1ccccc1)ccc(c2)S(=O)(=O)O,non-mutagenic
 COC(C1=C(N2CC2)C(=O)C(=C(C1=O)N1CC1)C)COC(=O)N,mutagenic
 COCC12OOC2(C)c2c(O1)cccc2,mutagenic
@@ -3634,7 +3645,8 @@ O=NN1CCCCCCCCCCCC1,mutagenic
 Nc1cc([N+](=O)[O-])c(c(c1C)C)N,mutagenic
 O=NN(Cc1ccc(cc1)C)C,non-mutagenic
 O=Nc1cc(ccc1C)[N+](=O)[O-],mutagenic
-Cc1cccc(c1N)C,mutagenic non-mutagenic
+Cc1cccc(c1N)C,mutagenic
+Cc1cccc(c1N)C,non-mutagenic
 [O-][N+](=O)c1ccc(c(c1)C)N,mutagenic
 Clc1ccc(cc1)C(=O)c1ccc(cc1)OC(C(=O)O)(C)C,non-mutagenic
 CC(=O)Nc1ccc(cc1)Oc1ccc(cc1)N,mutagenic
@@ -4066,7 +4078,8 @@ Nc1ccc2c(c1)cns2,mutagenic
 Sc1nc2c(s1)cccc2,non-mutagenic
 N#CCC[C@](C#N)(CBr)Br,non-mutagenic
 COc1ccc(cc1)N=[N+](c1ccc(cc1)OC)[O-],mutagenic
-COP(=S)(Oc1ccc(c(c1)C)[N+](=O)[O-])OC,mutagenic non-mutagenic
+COP(=S)(Oc1ccc(c(c1)C)[N+](=O)[O-])OC,mutagenic
+COP(=S)(Oc1ccc(c(c1)C)[N+](=O)[O-])OC,non-mutagenic
 CCOc1ccc(cc1)[N+](=O)[O-],mutagenic
 NCCCC[C@@H](C(=O)N1CCC[C@H]1C(=O)O)N[C@H](C(=O)O)CCc1ccccc1,non-mutagenic
 Clc1cc(N)c(cc1c1cc(Cl)c(cc1Cl)N)Cl,mutagenic
@@ -4737,7 +4750,8 @@ CCCCOC(=O)c1ccccc1C(=O)OC1CCCCC1,non-mutagenic
 CCc1[nH]c2c(n1)c1c(cc2)ccc2c1cc(O)cc2,mutagenic
 ClC(=O)c1ccccc1C(=O)Cl,mutagenic
 CCc1cccc2c1nccc2,mutagenic
-O=C(N(C)C)Nc1ccc(c(c1)Cl)Cl,mutagenic non-mutagenic
+O=C(N(C)C)Nc1ccc(c(c1)Cl)Cl,mutagenic
+O=C(N(C)C)Nc1ccc(c(c1)Cl)Cl,non-mutagenic
 Nc1ccc(c(c1)Cl)C,mutagenic
 [O-][N+](=O)c1ccc2c3c1cccc3CC2,mutagenic
 Nc1ccc2c(c1)nc1c(c2)ccc(c1)N,mutagenic
@@ -4753,7 +4767,8 @@ Oc1ccc(c(c1)C)Cl,non-mutagenic
 CCCCN(CC(=O)CC)N=O,mutagenic
 Oc1ccc(cc1)c1ccc(cc1)O,non-mutagenic
 c1ccc(cc1)c1ccccc1OCC1CO1,mutagenic
-COc1c(C/C=C(/CCC(=O)O)\C)c(O)c2c(c1C)COC2=O,mutagenic non-mutagenic
+COc1c(C/C=C(/CCC(=O)O)\C)c(O)c2c(c1C)COC2=O,mutagenic
+COc1c(C/C=C(/CCC(=O)O)\C)c(O)c2c(c1C)COC2=O,non-mutagenic
 OCc1ccc(cc1)Br,non-mutagenic
 OCCN(c1ccc(cc1)N=Nc1cccnc1)CCO,non-mutagenic
 ClCCSCC(C(=O)NCC(=O)OC)NC(=O)CCC(C(=O)O)N,mutagenic
@@ -5364,7 +5379,8 @@ Clc1ccc(c(c1)Cl)S(=O)(=O)n1ncc(c(c1=O)Cl)Cl,non-mutagenic
 OC(COc1ccc(cc1)NC(=O)C)CNC(C)C,non-mutagenic
 O=C(C(=C)C)OCC(COC(=O)C(=C)C)(C)C,non-mutagenic
 OCC1OC(OC23C=C(C)C4(C(C3C(=O)C(C2)(C)C)(C)O)CC4)C(C(C1OC(=O)/C=C/c1ccc(cc1)O)O)OC(=O)C,mutagenic
-CO/C(=C\C(=O)O)/C(=O)C(=C)C,mutagenic non-mutagenic
+CO/C(=C\C(=O)O)/C(=O)C(=C)C,mutagenic
+CO/C(=C\C(=O)O)/C(=O)C(=C)C,non-mutagenic
 O=C1CCc2c1c1c(cc2)ccc2c1cccc2,non-mutagenic
 CCCCOCCCC,non-mutagenic
 CCNC(=N)N([N+](=O)[O-])N=O,mutagenic
@@ -5798,7 +5814,8 @@ C=CCOC(=O)c1ccccc1C(=O)OCC=C,non-mutagenic
 CCC(=O)Nc1ccc(c(c1)Cl)Cl,non-mutagenic
 Cc1cccc2c1c1ccc3c(c1cc2)cccc3,mutagenic
 CC(=O)Nc1scc(n1)/C=C\c1ccc(o1)[N+](=O)[O-],mutagenic
-NC(=O)Cc1cccc2c1cccc2,mutagenic non-mutagenic
+NC(=O)Cc1cccc2c1cccc2,mutagenic
+NC(=O)Cc1cccc2c1cccc2,non-mutagenic
 [O-][N+](=O)c1ccc(cc1)n1cnc2c1ncnc2N,mutagenic
 Cc1cc2n(C)c(nc2c2c1nccn2)N,mutagenic
 O=C(N(C)C)Nc1ccc(c(c1)Cl)C,non-mutagenic
@@ -6199,7 +6216,8 @@ Cc1nc(C)cc(c1)c1cc2c(cc1F)n1c(n2C2CC2)cc(=O)n(c1=O)O,mutagenic
 ClCCN(c1ccc(cc1)c1[nH]c2c(n1)cc(cc2)CCCCCCc1nc2c([nH]1)ccc(c2)N1CCN(CC1)C)CCCl,non-mutagenic
 Cn1c(N)nc2c1cc1ncccc1n2,non-mutagenic
 COC(=O)C[C@H]1[C@@]2(C)[C@H](OC3C2=C(C)[C@@H](C3)c2cocc2)[C@H]2C3[C@]1(C)C(=O)C=C[C@@]3(C)C(=O)O2,non-mutagenic
-N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,non-mutagenic mutagenic
+N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,non-mutagenic
+N#CC(c1cccc(c1)Oc1ccccc1)OC(=O)C1C(C1(C)C)C=C(Cl)Cl,mutagenic
 OC1CC=Cc2c1cc1ccc3c4c1c2ccc4ccc3,mutagenic
 COc1ccc(cc1)C(C=C)O,non-mutagenic
 BrCC(=O)N(Cc1ccccc1)C,non-mutagenic
@@ -6423,7 +6441,8 @@ O[C@@H]1C=Cc2c([C@@H]1O)c1cc3ccc(c4c3c(c1cc2)CC4)C,mutagenic
 CC(=O)Nc1cccc2c1ncc(c2)F,mutagenic
 Nc1ccc(cc1)/C=C/c1cccc(c1)N,mutagenic
 CC(OC(=O)/C=C/c1ccc(o1)[N+](=O)[O-])C,mutagenic
-O=C1C=C(O)c2c(C1=O)cccc2,mutagenic non-mutagenic
+O=C1C=C(O)c2c(C1=O)cccc2,mutagenic
+O=C1C=C(O)c2c(C1=O)cccc2,non-mutagenic
 Cl/C=C\C[N+]12CN3CN(C2)CN(C1)C3,mutagenic
 [O-][N+](=O)c1cc(ccc1C)C(=O)O,mutagenic
 ClCCN(c1ccc(cc1)CC(=O)O[C@H]1CC[C@]2([C@H](C1)CC[C@@H]1[C@@H]2CC[C@]2([C@H]1CCC(=O)N2)C)C)CCCl,mutagenic
@@ -7270,7 +7289,7 @@ OC[C@H]1O[C@@H](O[C@@]23C=C(C)C4([C@]([C@@H]3C(=O)C(C2)(C)C)(C)O)CC4)[C@@H]([C@H
 Oc1ccc(cc1)/N=N/c1ccc(cc1)c1ccc(cc1)/N=N/c1c(N)c2c(cc1S(=O)(=O)O)cc(c(c2O)/N=N/c1ccccc1)S(=O)(=O)O.[Na+].[Na+],mutagenic
 N[C@@H]1CCC[C@H](C1)N,non-mutagenic
 O=C1c2ccccc2N/C/1=C\1/Nc2c(C1=O)cccc2,mutagenic
-C1CCC(CC1)N1[C@@H]2[C@H]1c1ccccc1-c1c2cccc1,mutagenic
+C1CCC(CC1)[N@@]1[C@@H]2[C@H]1c1ccccc1-c1c2cccc1,mutagenic
 Cl[C@@H](C=C)CCl,non-mutagenic
 COc1ccccc1N.Cl,mutagenic
 BrC[C@H]1CN(C)[C@H]2[C@H](C1)c1cccc3c1c(C2)c[nH]3,mutagenic
@@ -7343,7 +7362,7 @@ O=CCC=O,mutagenic
 Cc1nsc(c1)N.Cl,mutagenic
 CCCCN1[C@@H]2[C@H]1c1ccccc1-c1c2cccc1,mutagenic
 O[C@@H]1CC[C@]2([C@@H](C1)CC[C@@H]1[C@@H]2CC[C@]2([C@H]1CC[C@@H]2[C@@H](CCC(=O)O)C)C)C,non-mutagenic
-CCOC(=O)O[C@H](c1ccnc2c1cc(OC)cc2)[C@@H]1C[C@@H]2CCN1C[C@@H]2C=C,non-mutagenic
+CCOC(=O)O[C@H](c1ccnc2c1cc(OC)cc2)[C@@H]1C[C@@H]2CC[N@]1C[C@@H]2C=C,non-mutagenic
 OS(=O)(=O)O.OC[C@@H]1O[C@@H](O[C@H]([C@@H](C(=O)N[C@@H]([C@@H]([C@H](C(=O)N[C@@H](C(=O)NCCc2scc(n2)c2ncc(s2)C(=O)NCCC[S+](C)C)[C@@H](O)C)C)O)C)NC(=O)c2nc(nc(c2C)N)[C@H](CC(=O)N)NC[C@@H](C(=O)N)N)c2nc[nH]c2)[C@H]([C@@H]([C@@H]1O)O)O[C@@H]1O[C@H](CO)[C@H]([C@H]([C@@H]1O)OC(=O)N)O.OC[C@@H]1O[C@@H](O[C@H]([C@H](C(=O)N[C@@H]([C@@H]([C@@H](C(=O)N[C@@H](C(=O)NCCc2scc(n2)c2ncc(s2)C(=O)NCCC[S+](C)C)[C@@H](O)C)C)O)C)NC(=O)c2nc(nc(c2C)N)[C@H](CC(=O)N)NC[C@@H](C(=O)N)N)c2nc[nH]c2)[C@H]([C@@H]([C@H]1O)O)O[C@@H]1O[C@H](CO)[C@H]([C@H]([C@@H]1O)OC(=O)N)O,mutagenic
 O[C@@H]1C[C@@]2(C)[C@H]([C@]([C@@H]1O)(C)C(=O)O)CC[C@@]1([C@@H]2CC=C2[C@]1(C)CC[C@]1([C@H]2CC(C)(C)CC1)C(=O)O)C,non-mutagenic
 CN(CCN(c1ccccn1)Cc1cscc1)C.Cl,non-mutagenic
@@ -7435,7 +7454,7 @@ Clc1ccc(cc1)O[C@@H](C(=O)C(C)(C)C)n1cncc1,non-mutagenic
 OC[C@H]1O[C@H](C[C@H]1O)n1cc(CC)c(=O)[nH]c1=O,non-mutagenic
 OC(=O)[C@H](c1ccc(cc1)Oc1nccs1)C,non-mutagenic
 CCNCC#CC(OC(=O)[C@@](c1ccccc1)(C1CCCCC1)O)(C)C.Cl,non-mutagenic
-C=C[C@H]1CN2CC[C@H]1C[C@H]2[C@@H](c1ccnc2c1cc(OC)cc2)O.Cl.Cl,non-mutagenic
+C=C[C@H]1C[N@@]2CC[C@H]1C[C@H]2[C@@H](c1ccnc2c1cc(OC)cc2)O.Cl.Cl,non-mutagenic
 OCCNc1ccc(cc1)/N=N/c1ccc(cc1)NCCO,mutagenic
 CC(=C[C@H]1[C@@H](C1(C)C)C(=O)OCc1cccc(c1)Oc1ccccc1)C,mutagenic
 CC[C@H](OS(=O)(=O)C)C,mutagenic
@@ -7631,6 +7650,7 @@ CC(=O)OCc1ccccc1/N=N/c1ccc(cc1)N(C)C,mutagenic
 COc1nsc2c1cccc2OC[C@H]1CO1,mutagenic
 CNNC,non-mutagenic
 [N-]=[N+]=Nc1ccc(cc1)Nc1c2ccccc2nc2c1cccc2,mutagenic
+N#C[C@@H]1COCC[N@]1[C@H]1C[C@@H](O[C@H]([C@H]1O)C)O[C@H]1C[C@@](O)(Cc2c1c(O)c1c(c2O)C(=O)c2c(C1=O)c(OC)ccc2)C(=O)CO,non-mutagenic
 C[C@@H](c1ccccc1)N(C)C,non-mutagenic
 OC(=O)c1cn2[C@@H](C)COc3c2c(c1=O)cc(c3C1(N)CC1)F,non-mutagenic
 CCc1cccc2c1[nH]c1c2CCO[C@@]1(C)CC,non-mutagenic
@@ -7722,6 +7742,7 @@ Sc1ncnc2c1[nH]cn2.O,mutagenic
 SCCC(=O)N1[C@@H](CS[C@H]1c1ccccc1O)C(=O)O,non-mutagenic
 COc1cc(ccc1OC)C[C@H]1CO1,mutagenic
 CCCCOc1ccc2c(n1)c(NCCCNCCCl)c1c(n2)cc(cc1)Cl.Cl.Cl.O,mutagenic
+CC(CCC[C@H]([C@@H]1CC[C@@H]2[C@]1(C)CC[C@H]1[C@H]2C[C@H]2[C@@]3([C@]1(C)CC[C@@H](C3)OC(=O)C)[N@@]2N1C(=O)c2c(C1=O)cccc2)C)C,mutagenic
 OS(=O)(=O)OCc1c2ccccc2c2c3c1ccc1c3c(cc2)ccc1.[Na+],mutagenic
 C[n+]1c2ccccc2cc2c1cccc2N.Cl,mutagenic
 CCCn1cc2c3c1cccc3[C@@H]1[C@@H](C2)N(C#N)C[C@@H](C1)C,mutagenic
@@ -7843,7 +7864,7 @@ O=[P@@]1(OCc2c(O1)cccc2)Oc1ccccc1,mutagenic
 COC(=O)C1=C(CC)[C@@H](OC1=O)C,mutagenic
 Fc1ccc(cc1)Cn1c(nc2c1cccc2)N1CCC(CC1)N(c1nccc(=O)[nH]1)C,non-mutagenic
 Oc1ccc(cc1)[C@@H]1CC(=O)c2c(O1)cc(cc2O)O,non-mutagenic
-C=C[C@H]1CN2CC[C@H]1C[C@@H]2[C@H](c1ccnc2c1cc(OC)cc2)O,non-mutagenic
+C=C[C@H]1C[N@@]2CC[C@H]1C[C@@H]2[C@H](c1ccnc2c1cc(OC)cc2)O,non-mutagenic
 Nc1cc(N)c(cc1/N=N/c1ccccc1C)C,mutagenic
 CC[S@@](=O)CCSP(=O)(OC)OC,mutagenic
 c1cc2[C@@H]3O[C@@H]3c3c2c(c1)c1cc2ccccc2cc1c3,mutagenic
@@ -7913,13 +7934,14 @@ CNC(=O)Oc1ccc(cc1)c1ccccc1,mutagenic
 C[C@@H]1CCC[C@@H](N1CCC[C@](c1ccccn1)(c1ccccc1)O)C.Cl,non-mutagenic
 BrC[C@H](CO[P@@](=O)(OC[C@@H](CBr)Br)O)Br,mutagenic
 O=Nc1ccc2c(c1)cccc2,mutagenic
+OC[C@H]1O[C@@H](O[C@H]2CC[C@]3(C(=CC[C@@H]4[C@@H]3CC[C@]3([C@H]4C[C@@H]4[C@@H]3[C@H](C)[C@@H]3[N@@]4C[C@H](CC3)C)C)C2)C)[C@H]([C@H]([C@H]1O)O[C@@H]1O[C@@H](CO)[C@H]([C@H]([C@H]1O)O)O)O[C@@H]1O[C@@H](C)[C@@H]([C@H]([C@H]1O)O)O,non-mutagenic
 c1cc2ccc3c4c2c(c1)ccc4ccc3,mutagenic
 NCC(=O)O.Cl,non-mutagenic
 COc1cc(/N=N/c2ccccc2)ccc1N,mutagenic
 C=CCOc1ccccc1OC[C@@H](CNC(C)C)O,non-mutagenic
 C/C(=N\O)/C(=O)C,non-mutagenic
 c1ccc2c(-c3ccccc3[C@@H]3[C@H]2N3)c1,mutagenic
-COc1cc2c(cc1OC)N1[C@@H]3[C@@]42CCN2[C@H]4C[C@@H]4[C@H]3[C@H](CC1=O)OCC=C4C2,non-mutagenic
+COc1cc2c(cc1OC)N1[C@@H]3[C@@]42CC[N@@]2[C@H]4C[C@@H]4[C@H]3[C@H](CC1=O)OCC=C4C2,non-mutagenic
 OCc1cc(ccc1O)C(=O)CN(C(C)(C)C)Cc1ccccc1.Cl,mutagenic
 Oc1cc(O)c2c(c1)oc(c(c2=O)O)c1ccc(c(c1)O)O.O.O,mutagenic
 ClCc1cccc(c1)/N=N/c1ccc(cc1)N(C)C,mutagenic
@@ -7941,7 +7963,7 @@ O=C1CN(CCCN2CC(=O)NC(=O)C2)CC(=O)N1,non-mutagenic
 CC(c1ccc(c2-c(c1)c(C)cc2S(=O)(=O)O)C)C.[Na+],non-mutagenic
 ClC[C@@]12[C@H](Cl)[C@H]([C@H](C2(CCl)CCl)CC1(Cl)Cl)Cl,non-mutagenic
 OC(=O)COc1ccc(cc1Cl)Cl.CC(N)C,non-mutagenic
-C=C[C@@H]1CN2CC[C@H]1C[C@H]2[C@@H](c1ccnc2c1cc(OC)cc2)O.Br,non-mutagenic
+C=C[C@@H]1C[N@@]2CC[C@H]1C[C@H]2[C@@H](c1ccnc2c1cc(OC)cc2)O.Br,non-mutagenic
 O[C@H]1[C@H]2O[C@H]2c2c([C@@H]1O)ccc1c2cc2ccccc2c1C,mutagenic
 Nc1cc(N)c(cc1/N=N/c1ccc(cc1)c1ccc(cc1)/N=N/c1ccc(c(c1)C(=O)O)O)/N=N/c1ccc(cc1)S(=O)(=O)O,mutagenic
 COc1ccc2c(c1)[nH]c1c2CCN=C1C.Cl.O.O,non-mutagenic
@@ -7972,11 +7994,12 @@ C[C@H](C(=O)O)Oc1cc(Cl)c(cc1Cl)Cl,non-mutagenic
 [N-]=[N+]=Nc1ccc2c(c1)nc1c(c2Nc2ccc(cc2OC)NS(=O)(=O)C)cccc1,mutagenic
 O[C@@H]1[C@@H](O)[C@@H](O[C@@H]1n1ccc(=N)[nH]c1=O)COP(=O)(O)O,non-mutagenic
 O=CC1=C[C@@]2(O)CC(C[C@H]2[C@]2([C@]1(C=O)C2)C)(C)C,mutagenic
-C/C=C\1/CC(=C)[C@](O)(CO)C(=O)OCC2=CCN3[C@H]2[C@H](OC1=O)CC3,mutagenic
+C/C=C\1/CC(=C)[C@](O)(CO)C(=O)OCC2=CC[N@@]3[C@H]2[C@H](OC1=O)CC3,mutagenic
 CC(=O)O[C@@H]1C(=O)O[C@H]2[C@H]1OC(=O)[C@@H]2OC(=O)C,non-mutagenic
 CC(=O)N/N=C/c1c[n+]([O-])c2c([n+]1[O-])cccc2,mutagenic
 OC(=O)CN(CC(=O)O)CCN(CC(=O)O)CC(=O)O.[Na+].[Na+].[Na+].[Na+],non-mutagenic
 O=C1NC(=O)[C@](N1)(c1ccc(cc1)O)c1ccccc1,non-mutagenic
+C1C[N@@]2CC[N@]1CC2,non-mutagenic
 BrC[C@H]([C@H](O[P@](=O)(O[C@@H]([C@@H](CBr)Br)C)O[C@@H]([C@@H](CBr)Br)C)C)Br,mutagenic
 Nc1ccc2c(c1)cc1c(c2)cccc1,mutagenic
 Br/C=C(\c1ccc(cc1Cl)Cl)/OP(=O)(OC)OC,non-mutagenic
@@ -7987,6 +8010,7 @@ ClCC=CCCl,mutagenic
 [O-][N+](=O)c1ccc2c(c1)ccc1c2ccc(c1)[N+](=O)[O-],mutagenic
 CCC(c1cc(N2Nc3c(N2)cccc3)c(c(c1)C(CC)(C)C)O)(C)C,non-mutagenic
 C[C@]1(OC1)c1ccc(cc1)c1ccccc1,mutagenic
+OC[C@H]1O[C@@H](O[C@H]2CC[C@]3(C(=CC[C@H]4[C@H]3CC[C@]3([C@H]4C[C@@H]4[C@@H]3[C@H](C)[C@H]3[N@]4C[C@@H](CC3)C)C)C2)C)[C@@H]([C@H]([C@@H]1O[C@H]1O[C@@H](C)[C@@H]([C@H]([C@H]1O[C@H]1O[C@@H](C)[C@@H]([C@H]([C@H]1O)O)O)O)O)O)O,non-mutagenic
 O[C@@H]1C[C@@]23[C@](C1)(O3)C=CC=C2,non-mutagenic
 COc1ccccc1C[C@H]1CO1,mutagenic
 ClCC(=O)O[C@H](P(=O)(OC)OC)C(Cl)(Cl)Cl,non-mutagenic
@@ -8041,6 +8065,7 @@ CCCCCCCCCCCC(=O)OCCS(=O)(=O)O.[Na+],non-mutagenic
 C[C@@H](C(C)(C)C)O[P@@](=O)(Cl)C,non-mutagenic
 OC(=O)/C=C\C(=O)O.COc1ccc(cc1)CN(c1ccccn1)CCN(C)C,non-mutagenic
 Oc1ccc(cc1)/C=N/n1nnc2c(c1=O)[nH]c1c2cccc1,mutagenic
+C1[N@@]2C[N@@]3C[N@]1C[N@](C2)C3,mutagenic
 CC(=O)[C@@H]1C(=O)C=C2[C@](C1=O)(C)c1c(O)c(C)c(c(c1O2)C(=O)C)O,non-mutagenic
 O=C1C(=O)[C@]2(C([C@@H]1CC2)(C)C)C,non-mutagenic
 CCC/C=C/C(=O)O[C@@H]1C(C)(C)C[C@@H]2[C@]1(O)C=C(C=O)[C@@]13[C@@]2(C1)C(=O)O[C@@H]3O,mutagenic
@@ -8065,7 +8090,7 @@ CN(c1ccc(cc1)/C(=C\1/C=CC(=[N+](C)C)C=C1)/c1c2ccc(cc2cc(c1O)S(=O)(=O)O)S(=O)(=O)
 Cc1cc(ccc1/N=N/c1ccc2c(c1O)c(N)c(cc2S(=O)(=O)O)S(=O)(=O)O)c1ccc(c(c1)C)/N=N/c1ccc2c(c1O)c(N)c(cc2S(=O)(=O)O)S(=O)(=O)O,mutagenic
 Nc1cc(cc2c1c(O)c(c(c2)S(=O)(=O)O)/N=N/c1ccccc1)S(=O)(=O)O,non-mutagenic
 Cc1ccc(c(c1)C)/N=N/c1c2ccc(cc2cc(c1O)S(=O)(=O)O)S(=O)(=O)O,mutagenic
-Cl/C=C/C[N@+]12CN3CN(C2)CN(C1)C3,mutagenic
+Cl/C=C/C[N@+]12C[N@]3C[N@@](C2)C[N@@](C1)C3,mutagenic
 ClCCN(CCCl)CCC[C@H](Nc1c2cc(OC)ccc2nc2c1ccc(c2)Cl)C,mutagenic
 Oc1ccc(cc1)/N=N/c1ccc(cc1)c1ccc(cc1)/N=N/c1c(O)c2c(cc1S(=O)(=O)O)cc(c(c2N)/N=N/c1ccc(cc1)[N+](=O)[O-])S(=O)(=O)O,mutagenic
 ClCCCN(C)C,mutagenic
@@ -8077,7 +8102,7 @@ Cc1cc(ccc1/N=N/c1c(O)c2c(N)cc(cc2cc1S(=O)(=O)O)S(=O)(=O)O)c1ccc(c(c1)C)/N=N/c1c(
 [N-]=[N+]=CC(=O)OC[C@@H](C(=O)O)N,mutagenic
 O=c1[nH]ncc2c1cccc2,non-mutagenic
 CC(CCC[C@H]([C@@H]1CC[C@@H]2[C@]1(C)CC[C@H]1[C@H]2C[C@H]2[C@@]3([C@]1(C)CC[C@@H](C3)O)O2)C)C,mutagenic
-C/C=C\1/C[C@@H](C)[C@@](C)(O)C(=O)OCC2=CCN3[C@H]2[C@@H](OC1=O)CC3,non-mutagenic
+C/C=C\1/C[C@@H](C)[C@@](C)(O)C(=O)OCC2=CC[N@@]3[C@H]2[C@@H](OC1=O)CC3,non-mutagenic
 Nc1[nH]c(=O)c2c(n1)[nH]nn2,non-mutagenic
 CCCCCCCCCCCCCCCC(=O)OC[C@@H]([C@@H]1OC(=O)C(=C1O)O)O,non-mutagenic
 ClCCN(c1ccc(cc1)C[C@@H](C(=O)O)N)CCCl,mutagenic
@@ -8089,9 +8114,11 @@ CC(CC(=O)O[C@H]1C[C@@]2(OC(=O)C)[C@@H](C=C1C)O[C@@H]1[C@]3([C@@]2(C)[C@H](OC(=O)
 CCCCCC[C@@H]([C@H]1C2=C(C[C@@H](CC3=C([C@H]1O)C(=O)OC3=O)[C@H]([C@@H]1CC=CC(=O)O1)O)C(=O)OC2=O)O,non-mutagenic
 CC(=O)OC[C@@]12CCC(=C[C@H]1O[C@@H]1[C@@]3([C@@]2(C)[C@@H](OC(=O)C)[C@@H]1O)CO3)C,non-mutagenic
 OC[C@@]12[C@H](C=C(C(=O)[C@@H]1O)C)O[C@@H]1[C@@]3([C@@]2(C)[C@H](O)[C@H]1O)OC3,non-mutagenic
+COc1ccc2c(c1)[nH]c1c2CC[N@@]2[C@@H]1C[C@H]1[C@H](C2)C[C@H]([C@@H]([C@H]1C(=O)OC)OC)OC(=O)/C=C/c1cc(OC)c(c(c1)OC)OC,non-mutagenic
 ClC[C@@H]1[C@H](CCl)[C@@]2(C([C@@]1(Cl)C(=C2Cl)Cl)(Cl)Cl)Cl,non-mutagenic
 Clc1ccc2c(c1)C(=NC=C1N2C=NN1)c1ccccc1,non-mutagenic
 C=C[C@@H]1C[C@@H]2C[C@H]1C=C2,non-mutagenic
+O=C1O[C@H]2CC[N@]3[C@@H]2C(=CC3)COC(=O)[C@]([C@]([C@H]1C)(C)O)(C)O,non-mutagenic
 O=C(O[C@@H]1C[C@@](O)(C[C@H]([C@H]1O)O)C(=O)O)/C=C/c1ccc(c(c1)O)O,non-mutagenic
 Brc1cccc2c1cc1ccc3c(c1c2)cccc3,mutagenic
 COc1cc(cc(c1O)OC)[C@@H]1[C@H]2C(=O)OC[C@@H]2[C@@H](c2c1cc1OCOc1c2)O[C@@H]1O[C@@H]2CO[C@H](O[C@H]2[C@@H]([C@H]1O)O)C,non-mutagenic
@@ -8107,6 +8134,7 @@ CN[C@H]1CCc2c(-c3c1cc(=O)c(cc3)OC)c(OC)c(c(c2)OC)OC,non-mutagenic
 BrC[C@H]([C@H]([C@@H]([C@@H](CBr)O)O)O)O,mutagenic
 Oc1ccc2c(c1)CC[C@@H]1[C@H]2CC[C@]2([C@H]1C[C@H]([C@@H]2O)O)C,non-mutagenic
 Sc1ncnc2c1[nH]cn2,mutagenic
+COc1ccc2c(c1)[nH]c1c2CC[N@@]2[C@@H]1C[C@H]1[C@H](C2)C[C@H]([C@@H]([C@H]1C(=O)OC)OC)OC(=O)c1cc(OC)c(c(c1)OC)OC,non-mutagenic
 OC[C@H]1O[C@H](C[C@H]1O)n1cc(C)c(=O)[nH]c1=O,non-mutagenic
 Fc1c[nH]c(=O)[nH]c1=O,non-mutagenic
 C[C@@H]1CC[C@@]2(OC1)O[C@H]1[C@H]([C@@H]2C)[C@@]2([C@@H](C1)[C@@H]1CC=C3[C@]([C@H]1CC2)(C)CC[C@@H](C3)O)C,non-mutagenic
@@ -8135,7 +8163,7 @@ C#C[C@@]1(O)CC[C@@H]2[C@]1(C)CC[C@H]1[C@H]2CCC2=CC(=O)CC[C@H]12,non-mutagenic
 CO[C@H]([C@H]1Cc2cc3cc(O[C@H]4C[C@H](O[C@H]5C[C@H](O)[C@H]([C@@H](O5)C)OC)[C@@H]([C@@H](O4)C)OC(=O)C)c(c(c3c(c2C(=O)[C@H]1O[C@H]1C[C@H](O[C@@H]2C[C@H](O[C@@H]3O[C@@H](C)[C@H]([C@@](C3)(C)O)OC(=O)C)[C@@H]([C@@H](O2)C)O)[C@@H]([C@@H](O1)C)O)O)O)C)C(=O)[C@@H]([C@@H](O)C)O,non-mutagenic
 O=C1CC[C@]2(C(=C1)[C@@H](C)C[C@@H]1[C@@H]2CC[C@]2([C@H]1CC[C@]2(OC(=O)C)C(=O)C)C)C,non-mutagenic
 OC[C@@H]1C[C@H]([C@@H](O1)n1cnc2c1ncnc2N)O,mutagenic
-O[C@H]1CC[C@]2(C(=CC[C@@H]3[C@@H]2CC[C@]2([C@@H]3C[C@@H]3[C@@H]2[C@H](C)[C@@H]2N3C[C@H](CC2)C)C)C1)C,non-mutagenic
+O[C@H]1CC[C@]2(C(=CC[C@@H]3[C@@H]2CC[C@]2([C@@H]3C[C@@H]3[C@@H]2[C@H](C)[C@@H]2[N@@]3C[C@H](CC2)C)C)C1)C,non-mutagenic
 O[C@@H]1CC[C@]2([C@@H](C1)C[C@H]([C@@H]1[C@@H]2C[C@H](O)[C@]2([C@H]1CC[C@@H]2[C@@H](CCC(=O)O)C)C)O)C,non-mutagenic
 C/C/1=C\CCC(=C)C2C(CC1)C(C2)(C)C,non-mutagenic
 OC[C@H]([C@H]1OC(=O)C(=C1O)O)O,non-mutagenic
@@ -8163,7 +8191,7 @@ OC(=O)CN(CC(=O)O)CCN(CC(=O)O)CC(=O)O.[Na+].[Na+],non-mutagenic
 NC(=N)c1ccc(cc1)OCCCCCOc1ccc(cc1)C(=N)N.OCCS(=O)(=O)O.OCCS(=O)(=O)O,non-mutagenic
 Nc1ccccc1.Cl,non-mutagenic
 OC(=O)CC[C@@H](C(=O)O)N.[Na+],non-mutagenic
-OS(=O)(=O)O.COc1cc2N(C)[C@H]3[C@@]4(c2cc1[C@]1(C[C@H]2CN(CCc5c1[nH]c1c5cccc1)C[C@](C2)(O)CC)C(=O)OC)CCN1[C@H]4[C@@]([C@H]([C@]3(O)C(=O)OC)OC(=O)C)(CC)C=CC1,non-mutagenic
+OS(=O)(=O)O.COc1cc2N(C)[C@H]3[C@@]4(c2cc1[C@]1(C[C@H]2C[N@@](CCc5c1[nH]c1c5cccc1)C[C@](C2)(O)CC)C(=O)OC)CC[N@@]1[C@H]4[C@@]([C@H]([C@]3(O)C(=O)OC)OC(=O)C)(CC)C=CC1,non-mutagenic
 OC(=O)O.[Na+],non-mutagenic
 N/N=c/1\sc2c(n1C)cccc2.Cl,mutagenic
 COc1c2N(C)[C@@H]3[C@](c2cc(c1OC)Cl)(O)[C@H]([C@@]12N3C(=O)[C@@](C)(SS1)N(C2=O)C)O,non-mutagenic
@@ -8178,7 +8206,7 @@ OC(=O)C1=NN(C(=O)[C@H]1/N=N/c1ccc(cc1)S(=O)(=O)O)c1ccc(cc1)S(=O)(=O)O.[Na+].[Na+
 CSCC[C@@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N)Cc1ccccc1)CC(=O)O)NC(=O)[C@H](Cc1c[nH]c2c1cccc2)NC(=O)OCc1ccccc1,non-mutagenic
 COc1cc(ccc1N)c1ccc(c(c1)OC)N.Cl.Cl,mutagenic
 CN([C@H]1C(=C(C(=O)N)C(=O)[C@]2([C@H]1[C@H](O)[C@H]1C(=C2O)C(=O)c2c([C@]1(C)O)cccc2O)O)O)C.Cl,non-mutagenic
-OS(=O)(=O)O.O=CN1c2cc(OC)c(cc2[C@]23[C@H]1[C@@](O)(C(=O)OC)[C@H](OC(=O)C)[C@]1([C@@H]3N(CC2)CC=C1)CC)[C@]1(C[C@H]2CN(CCc3c1[nH]c1c3cccc1)C[C@](C2)(O)CC)C(=O)OC,non-mutagenic
+OS(=O)(=O)O.O=CN1c2cc(OC)c(cc2[C@]23[C@H]1[C@@](O)(C(=O)OC)[C@H](OC(=O)C)[C@]1([C@@H]3[N@@](CC2)CC=C1)CC)[C@]1(C[C@H]2C[N@@](CCc3c1[nH]c1c3cccc1)C[C@](C2)(O)CC)C(=O)OC,non-mutagenic
 COc1cccc2c1C(=O)c1c(C2=O)c(O)c2c(c1O)[C@@H](O[C@H]1C[C@H](N)[C@H]([C@@H](O1)C)O)C[C@](C2)(O)C(=O)C,mutagenic
 CCNc1nc(NC(C)C)[nH]c(=O)n1,non-mutagenic
 O[C@@H]1[C@@H]2C[C@@]34[C@@H]5[C@H]1[C@@]1(C(=C2O)C(=O)c2c(C1=O)c(O)c(cc2O)C)C[C@@H]([C@H]5O)C(=C3C(=O)c1c(C4=O)c(O)c(cc1O)C)O,non-mutagenic
@@ -8201,7 +8229,7 @@ NC(=N)C(/N=N/C(C(=N)N)(C)C)(C)C.Cl.Cl,mutagenic
 ClCC[N+](CCCl)(C)[O-].Cl,mutagenic
 N/N=C/1\N=NC=C2[C@@H]1C=CC=C2.Cl,mutagenic
 CCNC(=O)CC[C@@H](C(=O)O)N,non-mutagenic
-CC[C@H]1CN2CCc3c([C@H]2C[C@H]1C[C@H]1NCCc2c1cc(OC)c(c2)OC)cc(c(c3)OC)OC.Cl.Cl,non-mutagenic
+CC[C@H]1C[N@]2CCc3c([C@H]2C[C@H]1C[C@H]1NCCc2c1cc(OC)c(c2)OC)cc(c(c3)OC)OC.Cl.Cl,non-mutagenic
 [O-][N+](=O)c1ccc(s1)NC(=O)NCCCl,mutagenic
 CCN(CCCN(C1Cc2c(C1)cccc2)c1ccccc1)CC.Cl,non-mutagenic
 O[C@H]1[C@H](O)[C@H](O[C@H]1n1ccc(=O)[nH]c1=O)COP(=O)(O)O.[Na+].[Na+],non-mutagenic
@@ -8220,10 +8248,10 @@ OC[C@H]1O[C@H]([C@@H]([C@@H]1O)O)n1cnc2c1ncnc2NCc1ccc(cc1)[N+](=O)[O-],non-mutag
 OC(=O)[C@](Cc1ccc(c(c1)O)O)(N)C.O,non-mutagenic
 OC[C@H]1O[C@H](C[C@H]1O)n1cc(C=O)c(=O)[nH]c1=O,non-mutagenic
 O[C@@H]1[C@H](O)[C@H](O[C@H]1n1cnc2c1nc[nH]c2=O)COP(=O)(O)O.[Na+].[Na+],non-mutagenic
-C/C=C/1\CC(=C)[C@@](C)(O)C(=O)OCC2=CCN3[C@H]2[C@@H](OC1=O)CC3,mutagenic
+C/C=C/1\CC(=C)[C@@](C)(O)C(=O)OCC2=CC[N@@]3[C@H]2[C@@H](OC1=O)CC3,mutagenic
 NCC(=O)Nc1ccccc1.Cl,mutagenic
 O=c1[nH]cnc2c1cccc2,non-mutagenic
-CCC1=C(C[C@H]2NCCc3c2cc(OC)c(c3)OC)C[C@@H]2N(C1)CCc1c2cc(c(c1)OC)OC,non-mutagenic
+CCC1=C(C[C@H]2NCCc3c2cc(OC)c(c3)OC)C[C@@H]2[N@@](C1)CCc1c2cc(c(c1)OC)OC,non-mutagenic
 OP(=O)(O)O.OP(=O)(O)O.CCN(CCC[C@H](Nc1ccnc2c1ccc(c2)Cl)C)CC,mutagenic
 CC(C1=CC2=CC[C@H]3[C@]([C@H]2CC1)(C)CCC[C@@]3(C)C(=O)O)C,non-mutagenic
 Oc1ccc2c(c1)Oc1c(C32OC(=O)c2c3cccc2)ccc(c1)O.[Na+].[Na+],non-mutagenic
index 7cd85e0..aa031e5 100644 (file)
@@ -10,7 +10,7 @@ programs.each do |program|
   abort "Please install #{program} on your system." unless find_executable program
 end
 
-abort "Please install Rserve on your system. Execute 'install.packages('Rserve')' in a R console running as root ('sudo R')."  unless `R CMD Rserve --version`.match(/^Rserve/)
+abort "Please install the latest Rserve version on your system (the CRAN version is outdated). Execute 'install.packages('Rserve',,'http://www.rforge.net/')' in a R console running as root ('sudo R')."  unless `R CMD Rserve --version`.match(/^Rserve v1.8/)
 
 # install R packages
 r_dir = File.join main_dir, "R"
index 98e612d..17c2e61 100644 (file)
@@ -1,12 +1,14 @@
 libdir = commandArgs(trailingOnly=TRUE)[1]
 repo = "https://stat.ethz.ch/CRAN/"
-#install.packages("Rserve",lib=libdir,repos=repo,dependencies=TRUE)
-install.packages("stringi",lib=libdir,repos=repo,dependencies=TRUE);
-install.packages("iterators",lib=libdir,repos=repo,dependencies=TRUE);
-install.packages("foreach",lib=libdir,repos=repo,dependencies=TRUE);
-install.packages("gridExtra",lib=libdir,repos=repo,dependencies=TRUE);
-install.packages("ggplot2",lib=libdir,repos=repo,dependencies=TRUE);
-install.packages("pls",lib=libdir,repos=repo,dependencies=TRUE);
-install.packages("randomForest",lib=libdir,repos=repo,dependencies=TRUE);
-install.packages("caret",lib=libdir,repos=repo,dependencies=TRUE);
-install.packages("doMC",lib=libdir,repos=repo,dependencies=TRUE);
+#install.packages("Rserve",lib=libdir,repos=)
+# dependencies=TRUE installs unnecessary Suggests packages. The default, NA, means c("Depends", "Imports", "LinkingTo").
+install.packages("caret",lib=libdir,repos=repo);
+install.packages("stringi",lib=libdir,repos=repo);
+install.packages("iterators",lib=libdir,repos=repo);
+install.packages("labeling",lib=libdir,repos=repo);
+install.packages("foreach",lib=libdir,repos=repo);
+install.packages("gridExtra",lib=libdir,repos=repo);
+install.packages("ggplot2",lib=libdir,repos=repo);
+install.packages("pls",lib=libdir,repos=repo);
+install.packages("randomForest",lib=libdir,repos=repo);
+install.packages("doMC",lib=libdir,repos=repo);
index df17569..596c53c 100644 (file)
@@ -96,8 +96,14 @@ module OpenTox
 
     # Get nominal and numeric prediction features
     # @return [Array<OpenTox::NominalLazarPrediction,OpenTox::NumericLazarPrediction>]
-    def prediction_features
-      features.select{|f| f._type.match("Prediction")}
+    def prediction_feature
+      features.select{|f| f._type.match(/Prediction$/)}.first
+    end
+
+    # Get supporting nominal and numeric prediction features (class probabilities, prediction interval)
+    # @return [Array<OpenTox::LazarPredictionProbability,OpenTox::LazarPredictionInterval>]
+    def prediction_supporting_features
+      features.select{|f| f.is_a?(LazarPredictionProbability) or f.is_a?(LazarPredictionInterval)}
     end
 
     # Get nominal and numeric merged features
@@ -259,7 +265,7 @@ module OpenTox
       feature_names = table.shift.collect{|f| f.strip}
       raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
 
-      if feature_names[0] =~ /ID/i # check ID column
+      if feature_names[0] !~ /SMILES|InChI/i # check ID column
         original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => feature_names.shift)
       else
         original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => "LineID")
@@ -343,30 +349,52 @@ module OpenTox
 
     # Serialisation
     
-    # Convert dataset to csv format 
+    # Convert lazar prediction dataset to csv format 
     # @return [String]
-    def to_csv #inchi=false
-      CSV.generate() do |csv| 
-        
-        compound = substances.first.is_a? Compound
-        f = features - original_id_features - original_smiles_features - warnings_features
-        header = original_id_features.collect{|f| "ID "+Dataset.find(f.dataset_id).name}
-        header += original_smiles_features.collect{|f| "SMILES "+Dataset.find(f.dataset_id).name} if compound
-        compound ? header << "Canonical SMILES" : header << "Name"
-        header += f.collect{|f| f.name}
-        header += warnings_features.collect{|f| "Warnings "+Dataset.find(f.dataset_id).name} 
-        csv << header
-
-        substances.each do |substance|
-          row = original_id_features.collect{|f| values(substance,f).join(" ")}
-          row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound
-          compound ? row << substance.smiles : row << substance.name
-          row += f.collect{|f| values(substance,f).join(" ")}
-          row += warnings_features.collect{|f| values(substance,f).uniq.join(" ")} 
+    def to_prediction_csv
+      
+      compound = substances.first.is_a? Compound
+      header = ["ID"]
+      header << "Original SMILES" if compound
+      compound ? header << "Canonical SMILES" : header << "Name"
+      header << "Prediction" if prediction_feature
+      header << "Confidence" if confidence_feature
+      header += prediction_supporting_features.collect{|f| f.name}
+      header << "Measurements" 
+      csv = [header]
+
+      substances.each do |substance|
+        row = original_id_features.collect{|f| values(substance,f).join(" ")}
+        row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound
+        compound ? row << substance.smiles : row << substance.name
+        row << values(substance,prediction_feature).join(" ")
+        row << values(substance,confidence_feature).join(" ")
+        row += prediction_supporting_features.collect{|f| values(substance,f).join(" ")}
+        row << values(substance,bioactivity_features[0]).join(" ")
+        csv << row
+      end
+      csv.collect{|r| r.join(",")}.join("\n")
+    end
+    
+    # Convert dataset into csv formatted training data
+    # @return [String]
+    def to_training_csv 
+      
+      p features
+      p bioactivity_features
+      header = ["Canonical SMILES"]
+      header << bioactivity_features[0].name
+      csv = [header]
+
+      substances.each do |substance|
+        nr_activities = values(substance,bioactivity_features.first).size
+        (0..nr_activities-1).each do |n| # new row for each value
+          row = [substance.smiles]
+          row << values(substance,bioactivity_features[0])[n] 
           csv << row
         end
-
       end
+      csv.collect{|r| r.join(",")}.join("\n")
     end
 
     # Convert dataset to SDF format
@@ -396,7 +424,6 @@ module OpenTox
       predictions = {}
       substances.each do |s| 
         predictions[s] ||= {}
-        prediction_feature = prediction_features.first
         predictions[s][:value] = values(s,prediction_feature).first
         #predictions[s][:warnings] = []
         #warnings_features.each { |w| predictions[s][:warnings] += values(s,w) }
index f17d060..2546dc4 100644 (file)
@@ -122,7 +122,6 @@ module OpenTox
     # Combine mutagenicity data from Kazius, Hansen and EFSA and download into the data folder
     def self.mutagenicity
       $logger.debug "Mutagenicity"
-      # TODO add download/conversion programs to lazar dependencies
       hansen_url = "http://doc.ml.tu-berlin.de/toxbenchmark/Mutagenicity_N6512.csv"
       kazius_url = "http://cheminformatics.org/datasets/bursi/cas_4337.zip"
       efsa_url = "https://data.europa.eu/euodp/data/storage/f/2017-07-19T142131/GENOTOX data and dictionary.xls"
@@ -185,7 +184,7 @@ module OpenTox
       map = {"mutagen" => "mutagenic", "nonmutagen" => "non-mutagenic"}
       dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,nil,map], keep_original_features: false, remove_duplicates: true
       dataset.merged_features.first.name = "Mutagenicity"
-      File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv"),"w+"){|f| f.puts dataset.to_csv}
+      File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv"),"w+"){|f| f.puts dataset.to_training_csv}
       meta = {
         :species => "Salmonella typhimurium",
         :endpoint => "Mutagenicity",
index 72c26d7..296a174 100644 (file)
@@ -18,6 +18,9 @@ module OpenTox
   # Confidence
   class Confidence < Feature
     field :dataset_id, type: BSON::ObjectId
+    def name
+      "Confidence"
+    end
   end
 
   # Categorical variables
@@ -66,13 +69,13 @@ module OpenTox
     field :model_id, type: BSON::ObjectId
     field :training_feature_id, type: BSON::ObjectId
     def name
-      "#{self[:name]} Prediction"
+      "Prediction: #{self[:name]}"
     end
   end
 
   class LazarPredictionProbability < NominalLazarPrediction
     def name
-      "probability(#{self[:name]})"
+      "Probability: #{self[:name]}"
     end
   end
 
@@ -81,13 +84,13 @@ module OpenTox
     field :model_id, type: BSON::ObjectId
     field :training_feature_id, type: BSON::ObjectId
     def name
-      "#{self[:name]} Prediction"
+      "Prediction: #{self[:name]}"
     end
   end
 
   class LazarPredictionInterval < NumericLazarPrediction
     def name
-      "prediction_interval_#{self[:name]}"
+      "#{self[:name].capitalize} prediction interval"
     end
   end
 
index 2a3f749..e77de9d 100644 (file)
@@ -17,19 +17,22 @@ raise "Incorrect lazar environment variable LAZAR_ENV '#{ENV["LAZAR_ENV"]}', ple
 
 ENV["MONGOID_ENV"] = ENV["LAZAR_ENV"] 
 ENV["RACK_ENV"] = ENV["LAZAR_ENV"] # should set sinatra environment
+# CH: this interferes with /etc/hosts on my machine
 # search for a central mongo database in use
 # http://opentox.github.io/installation/2017/03/07/use-central-mongodb-in-docker-environment
-CENTRAL_MONGO_IP = `grep -oP '^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}(?=.*mongodb)' /etc/hosts`.chomp
+CENTRAL_MONGO_IP = `grep -oP '^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}(?=.*mongodb)' /etc/hosts`.chomp
 Mongoid.load_configuration({
   :clients => {
     :default => {
       :database => ENV["LAZAR_ENV"],
-      :hosts => (CENTRAL_MONGO_IP.blank? ? ["localhost:27017"] : ["#{CENTRAL_MONGO_IP}:27017"]),
+      #:hosts => (CENTRAL_MONGO_IP.blank? ? ["localhost:27017"] : ["#{CENTRAL_MONGO_IP}:27017"]),
+      :hosts => ["localhost:27017"]
     }
   }
 })
 Mongoid.raise_not_found_error = false # return nil if no document is found
-$mongo = Mongo::Client.new("mongodb://#{(CENTRAL_MONGO_IP.blank? ? "127.0.0.1" : CENTRAL_MONGO_IP)}:27017/#{ENV['LAZAR_ENV']}")
+#$mongo = Mongo::Client.new("mongodb://#{(CENTRAL_MONGO_IP.blank? ? "127.0.0.1" : CENTRAL_MONGO_IP)}:27017/#{ENV['LAZAR_ENV']}")
+$mongo = Mongo::Client.new("mongodb://127.0.0.1:27017/#{ENV['LAZAR_ENV']}")
 $gridfs = $mongo.database.fs
 
 # Logger setup
index cbfefe3..05cd113 100644 (file)
@@ -286,14 +286,14 @@ module OpenTox
         end
         if threshold == algorithms[:similarity][:min].first
           if prediction[:warnings].empty? 
-            prediction[:confidence] = "High (close to bioassay results)"
+            prediction[:confidence] = "Similar to bioassay results"
             return prediction
           else # try again with a lower threshold
             prediction[:warnings] << "Lowering similarity threshold to #{algorithms[:similarity][:min].last}."
             predict_substance substance, algorithms[:similarity][:min].last, prediction
           end
         elsif threshold < algorithms[:similarity][:min].first
-          prediction[:confidence] = "Low (lower than bioassay results)"
+          prediction[:confidence] = "Lower than bioassay results"
           return prediction
         end
       end
@@ -348,9 +348,9 @@ module OpenTox
             end
           elsif prediction_feature.is_a? NumericBioActivity
             f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id)
-            prediction_interval = {}
+            prediction_interval = []
             ["lower","upper"].each do |v|
-              prediction_interval[v] = LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
+              prediction_interval << LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
             end
           end
 
index 8a8970e..d603294 100644 (file)
@@ -18,7 +18,7 @@ module OpenTox
                 if pred[:value] == v
                   confusion_matrix[:all][i][i] += 1
                   self.nr_predictions[:all] += 1
-                  if pred[:confidence].match(/High/i)
+                  if pred[:confidence].match(/Similar/i)
                     confusion_matrix[:confidence_high][i][i] += 1
                     self.nr_predictions[:confidence_high] += 1
                   elsif pred[:confidence].match(/Low/i)
@@ -32,7 +32,7 @@ module OpenTox
                 if pred[:value] == v
                   confusion_matrix[:all][i][(i+1)%2] += 1
                   self.nr_predictions[:all] += 1
-                  if pred[:confidence].match(/High/i)
+                  if pred[:confidence].match(/Similar/i)
                     confusion_matrix[:confidence_high][i][(i+1)%2] += 1
                     self.nr_predictions[:confidence_high] += 1
                   elsif pred[:confidence].match(/Low/i)
index 79ccb98..c41b211 100644 (file)
@@ -84,13 +84,19 @@ class ClassificationModelTest < MiniTest::Test
     assert_kind_of Dataset, result
     assert_equal 7, result.features.size
     assert_equal 85, result.compounds.size
-    prediction_feature = result.prediction_features.first
+    prediction_feature = result.prediction_feature
     assert_equal ["carcinogenic"], result.values(result.compounds[1], prediction_feature)
     assert_equal ["non-carcinogenic"], result.values(result.compounds[5], prediction_feature)
     assert_nil result.predictions[result.compounds.first][:value]
     assert_equal "carcinogenic", result.predictions[result.compounds[1]][:value]
     assert_equal 0.27, result.predictions[result.compounds[1]][:probabilities]["non-carcinogenic"].round(2)
-    assert_match /High/i, result.predictions[result.compounds[1]][:confidence]
+    assert_match /Similar/i, result.predictions[result.compounds[1]][:confidence]
+    csv = result.to_prediction_csv
+    rows = csv.split("\n")
+    assert_equal "ID,Original SMILES,Canonical SMILES,Prediction,Confidence,Probability: carcinogenic,Probability: non-carcinogenic,Measurements", rows[0]
+    items = rows[2].split(",")
+    assert_equal "carcinogenic", items[3]
+    assert_equal 0.27, items[6].to_f.round(2) # probabilities
   end
 
   def test_carcinogenicity_rf_classification
index 8e230e0..b978512 100644 (file)
@@ -137,7 +137,6 @@ class DatasetTest < MiniTest::Test
       d = Dataset.from_csv_file File.join(DATA_DIR,"batch_prediction_#{type}_small.csv")
       assert_equal Dataset, d.class
       refute_nil d.id
-      dataset = Dataset.find d.id
       assert_equal 3, d.compounds.size
     end
   end
@@ -175,10 +174,16 @@ class DatasetTest < MiniTest::Test
     datasets = [hansen,efsa,kazius]
     map = {"mutagen" => "mutagenic", "nonmutagen" => "non-mutagenic"}
     dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,nil,map], keep_original_features: true, remove_duplicates: true
-    assert_equal 8281, dataset.compounds.size
-    assert_equal 9, dataset.features.size
+    csv = dataset.to_training_csv
+    rows = csv.split("\n")
+    header = rows.shift
+    assert_equal "Canonical SMILES,Mutagenicity",header
+    values = rows.collect{|r| r.split(",")[1]}.uniq
+    assert_equal 2, values.size
+    assert_equal 8290, dataset.compounds.size
     c = Compound.from_smiles("C/C=C/C=O")
     assert_equal ["mutagenic"], dataset.values(c,dataset.merged_features.first)
+    assert_equal 9, dataset.features.size
   end
 
   # serialisation
@@ -203,6 +208,13 @@ class DatasetTest < MiniTest::Test
   end
 
   # special cases/details
+  # Import of the Daphnia training data: checks feature/compound counts and the training csv header
+  def test_daphnia_import
+    d = Dataset.from_csv_file File.join(File.dirname(__FILE__),"..","data", "Acute_toxicity-Daphnia_magna.csv")
+    assert_equal 3, d.features.size # `assert 3, ...` always passes: 3 is the (truthy) test, the size only the message
+    assert_equal 546, d.compounds.size
+    assert_equal "Canonical SMILES", d.to_training_csv.split("\n").first.split(",").first
+  end
 
   def test_dataset_accessors
     d = Dataset.from_csv_file "#{DATA_DIR}/multicolumn.csv"
index 7f667dc..3b41171 100644 (file)
@@ -173,13 +173,26 @@ class LazarRegressionTest < MiniTest::Test
     model = Model::Lazar.create training_dataset: training_dataset
     result = model.predict training_dataset
     assert_kind_of Dataset, result
-    assert_equal 6, result.features.size
+    assert_equal 8, result.features.size
     assert_equal 88, result.compounds.size
     assert_equal [1.95], result.values(result.compounds.first, result.bioactivity_features[0]).collect{|v| v.round(2)}
     assert_equal [1.37], result.values(result.compounds[6], result.bioactivity_features[0]).collect{|v| v.round(2)}
-    assert_equal [1.79], result.values(result.compounds[6], result.prediction_features[0]).collect{|v| v.round(2)}
+    assert_equal [1.79], result.values(result.compounds[6], result.prediction_feature).collect{|v| v.round(2)}
     assert_equal [1.84,1.73], result.values(result.compounds[7], result.bioactivity_features[0]).collect{|v| v.round(2)}
     assert_match /Low/i, result.predictions[result.compounds[6]][:confidence]
+    csv = result.to_prediction_csv
+    rows = csv.split("\n")
+    assert_equal "ID,Original SMILES,Canonical SMILES,Prediction,Confidence,Lower prediction interval,Upper prediction interval,Measurements", rows[0]
+    items = rows[3].split(",")
+    # prediction and measurement within prediction interval
+    prediction = items[3].to_f
+    pi_low = items[5].to_f
+    pi_hi = items[6].to_f
+    measurement = items[7].to_f
+    [prediction,measurement].each do |v|
+      assert(v > pi_low)
+      assert(v < pi_hi)
+    end
   end
 
 end