author    Christoph Helma <helma@in-silico.ch>    2015-10-08 10:43:43 +0200
committer Christoph Helma <helma@in-silico.ch>    2015-10-08 10:43:43 +0200
commit    1a56148aadef031c4f487bc23fda43f4ac5b7369 (patch)
tree      3555c5883ed0c292b105c40c185ebba3e5bd4e3e /lib
parent    394d564699756288569169ff3e198d6d7702f092 (diff)
parent    e3217075b602a950a0ee995fcfa731d97b5ba3eb (diff)
new master branch
Diffstat (limited to 'lib')
-rw-r--r--  lib/SMARTS_InteLigand.txt    983
-rw-r--r--  lib/algorithm.rb              21
-rw-r--r--  lib/bbrc.rb                  165
-rw-r--r--  lib/classification.rb        110
-rw-r--r--  lib/compound.rb              370
-rw-r--r--  lib/crossvalidation.rb       302
-rw-r--r--  lib/dataset.rb               350
-rw-r--r--  lib/descriptor.rb            248
-rw-r--r--  lib/error.rb                  66
-rw-r--r--  lib/experiment.rb             99
-rw-r--r--  lib/feature.rb                95
-rw-r--r--  lib/lazar.rb                  72
-rw-r--r--  lib/model.rb                 265
-rw-r--r--  lib/opentox.rb                22
-rw-r--r--  lib/overwrite.rb             148
-rw-r--r--  lib/regression.rb            262
-rw-r--r--  lib/rest-client-wrapper.rb    98
-rw-r--r--  lib/similarity.rb             58
-rw-r--r--  lib/unique_descriptors.rb    120
-rw-r--r--  lib/validation.rb             68
20 files changed, 3922 insertions, 0 deletions
diff --git a/lib/SMARTS_InteLigand.txt b/lib/SMARTS_InteLigand.txt
new file mode 100644
index 0000000..23bc6e2
--- /dev/null
+++ b/lib/SMARTS_InteLigand.txt
@@ -0,0 +1,983 @@
+#
+# SMARTS Patterns for Functional Group Classification
+#
+# written by Christian Laggner
+# Copyright 2005 Inte:Ligand Software-Entwicklungs und Consulting GmbH
+#
+# Released under the Lesser General Public License (LGPL license)
+# see http://www.gnu.org/copyleft/lesser.html
+# Modified from Version 221105
+#####################################################################################################
+
+# General Stuff:
+# These patterns were written in an attempt to represent the classification of organic compounds
+# from the viewpoint of an organic chemist.
+# They are often very restrictive. This is generally a good thing, but it also takes some time
+# for filtering/indexing large compound sets.
+# For filtering undesired groups (in druglike compounds) one will want to have more general patterns
+# (e.g. you don't want *any* halide of *any* acid, *neither* aldehyde *nor* formyl esters and amides, ...).
+#
+
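+# A minimal usage sketch in Ruby follows (kept as comments so this file still parses as a pattern list).
+# It assumes the Open Babel Ruby bindings used elsewhere in this library (see lib/compound.rb); the
+# OBSmartsPattern calls are assumed from the Open Babel C++ API and the SWIG snake_case naming used in
+# this commit, and the file path and query molecule are illustrative only.
+#
+#   require 'openbabel'
+#
+#   # parse "Name: SMARTS" lines, skipping comments and blank lines
+#   patterns = {}
+#   File.readlines("lib/SMARTS_InteLigand.txt").each do |line|
+#     next if line =~ /^\s*(#|$)/
+#     name, smarts = line.chomp.split(/:\s*/, 2)
+#     patterns[name] = smarts if name && smarts
+#   end
+#
+#   # read a query structure (aspirin) from SMILES
+#   conv = OpenBabel::OBConversion.new
+#   conv.set_in_format "smi"
+#   mol = OpenBabel::OBMol.new
+#   conv.read_string mol, "CC(=O)Oc1ccccc1C(=O)O"
+#
+#   # report every functional group pattern that matches the query
+#   matcher = OpenBabel::OBSmartsPattern.new
+#   hits = patterns.select { |name, smarts| matcher.init(smarts) && matcher.match(mol) }
+#   puts hits.keys   # should include e.g. Carboxylic_acid
+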
+# Part I: Carbon
+# ==============
+
+
+# I.1: Carbon-Carbon Bonds
+# ------------------------
+
+# I.1.1 Alkanes:
+
+Primary_carbon: [CX4H3][#6]
+
+Secondary_carbon: [CX4H2]([#6])[#6]
+
+Tertiary_carbon: [CX4H1]([#6])([#6])[#6]
+
+Quaternary_carbon: [CX4]([#6])([#6])([#6])[#6]
+
+
+# I.1.2 C-C double and Triple Bonds
+
+Alkene: [CX3;$([H2]),$([H1][#6]),$(C([#6])[#6])]=[CX3;$([H2]),$([H1][#6]),$(C([#6])[#6])]
+# sp2 C may be substituted only by C or H -
+# does not hit ketenes and allenes, nor enamines, enols and the like
+
+Alkyne: [CX2]#[CX2]
+# non-carbon substituents (e.g. alkynol ethers) are rather rare, thus no further discrimination
+
+Allene: [CX3]=[CX2]=[CX3]
+
+
+# I.2: One Carbon-Hetero Bond
+# ---------------------------
+
+
+# I.2.1 Alkyl Halogenides
+
+Alkylchloride: [ClX1][CX4]
+# will also hit chloromethyl ethers and the like, but no chloroalkenes, -alkynes or -aromatics
+# a more restrictive version can be obtained by modifying the Alcohol string.
+
+Alkylfluoride: [FX1][CX4]
+
+Alkylbromide: [BrX1][CX4]
+
+Alkyliodide: [IX1][CX4]
+
+
+# I.2.2 Alcohols and Ethers
+
+Alcohol: [OX2H][CX4;!$(C([OX2H])[O,S,#7,#15])]
+# nonspecific definition, no acetals, aminals, and the like
+
+Primary_alcohol: [OX2H][CX4H2;!$(C([OX2H])[O,S,#7,#15])]
+
+Secondary_alcohol: [OX2H][CX4H;!$(C([OX2H])[O,S,#7,#15])]
+
+Tertiary_alcohol: [OX2H][CX4D4;!$(C([OX2H])[O,S,#7,#15])]
+
+Dialkylether: [OX2]([CX4;!$(C([OX2])[O,S,#7,#15,F,Cl,Br,I])])[CX4;!$(C([OX2])[O,S,#7,#15])]
+# no acetals and the like; no enolethers
+
+Dialkylthioether: [SX2]([CX4;!$(C([OX2])[O,S,#7,#15,F,Cl,Br,I])])[CX4;!$(C([OX2])[O,S,#7,#15])]
+# no acetals and the like; no enolethers
+
+Alkylarylether: [OX2](c)[CX4;!$(C([OX2])[O,S,#7,#15,F,Cl,Br,I])]
+# no acetals and the like; no enolethers
+
+Diarylether: [c][OX2][c]
+
+Alkylarylthioether: [SX2](c)[CX4;!$(C([OX2])[O,S,#7,#15,F,Cl,Br,I])]
+
+Diarylthioether: [c][SX2][c]
+
+Oxonium: [O+;!$([O]~[!#6]);!$([S]*~[#7,#8,#15,#16])]
+# can't be aromatic, thus O and not #8
+
+# I.2.3 Amines
+
+Amine: [NX3+0,NX4+;!$([N]~[!#6]);!$([N]*~[#7,#8,#15,#16])]
+# hits all amines (prim/sec/tert/quart), including ammonium salts, also enamines, but not amides, imides, aminals, ...
+
+# the following amines include also the protonated forms
+
+Primary_aliph_amine: [NX3H2+0,NX4H3+;!$([N][!C]);!$([N]*~[#7,#8,#15,#16])]
+
+Secondary_aliph_amine: [NX3H1+0,NX4H2+;!$([N][!C]);!$([N]*~[#7,#8,#15,#16])]
+
+Tertiary_aliph_amine: [NX3H0+0,NX4H1+;!$([N][!C]);!$([N]*~[#7,#8,#15,#16])]
+
+Quaternary_aliph_ammonium: [NX4H0+;!$([N][!C]);!$([N]*~[#7,#8,#15,#16])]
+
+Primary_arom_amine: [NX3H2+0,NX4H3+]c
+
+Secondary_arom_amine: [NX3H1+0,NX4H2+;!$([N][!c]);!$([N]*~[#7,#8,#15,#16])]
+
+Tertiary_arom_amine: [NX3H0+0,NX4H1+;!$([N][!c]);!$([N]*~[#7,#8,#15,#16])]
+
+Quaternary_arom_ammonium: [NX4H0+;!$([N][!c]);!$([N]*~[#7,#8,#15,#16])]
+
+Secondary_mixed_amine: [NX3H1+0,NX4H2+;$([N]([c])[C]);!$([N]*~[#7,#8,#15,#16])]
+
+Tertiary_mixed_amine: [NX3H0+0,NX4H1+;$([N]([c])([C])[#6]);!$([N]*~[#7,#8,#15,#16])]
+
+Quaternary_mixed_ammonium: [NX4H0+;$([N]([c])([C])[#6][#6]);!$([N]*~[#7,#8,#15,#16])]
+
+Ammonium: [N+;!$([N]~[!#6]);!$(N=*);!$([N]*~[#7,#8,#15,#16])]
+# only C and H substituents allowed. Quaternary or protonated amines
+# NX4+ or Nv4+ is not recognized by Daylight's depictmatch if less than four C are present
+
+
+# I.2.4 Others
+
+Alkylthiol: [SX2H][CX4;!$(C([SX2H])~[O,S,#7,#15])]
+
+Dialkylthioether: [SX2]([CX4;!$(C([SX2])[O,S,#7,#15,F,Cl,Br,I])])[CX4;!$(C([SX2])[O,S,#7,#15])]
+
+Alkylarylthioether: [SX2](c)[CX4;!$(C([SX2])[O,S,#7,#15])]
+
+Disulfide: [SX2D2][SX2D2]
+
+1,2-Aminoalcohol: [OX2H][CX4;!$(C([OX2H])[O,S,#7,#15,F,Cl,Br,I])][CX4;!$(C([N])[O,S,#7,#15])][NX3;!$(NC=[O,S,N])]
+# does not hit alpha-amino acids, enaminoalcohols, 1,2-aminoacetals, o-aminophenols, etc.
+
+1,2-Diol: [OX2H][CX4;!$(C([OX2H])[O,S,#7,#15])][CX4;!$(C([OX2H])[O,S,#7,#15])][OX2H]
+# does not hit alpha-hydroxy acids, enolalcohols, 1,2-hydroxyacetals, 1,2-diphenols, etc.
+
+1,1-Diol: [OX2H][CX4;!$(C([OX2H])([OX2H])[O,S,#7,#15])][OX2H]
+
+Hydroperoxide: [OX2H][OX2]
+# does not necessarily have to be connected to a carbon atom; also includes hydrotrioxides
+
+Peroxo: [OX2D2][OX2D2]
+
+Organolithium_compounds: [LiX1][#6,#14]
+
+Organomagnesium_compounds: [MgX2][#6,#14]
+# not restricted to Grignard compounds, also dialkyl Mg
+
+Organometallic_compounds: [!#1;!#5;!#6;!#7;!#8;!#9;!#14;!#15;!#16;!#17;!#33;!#34;!#35;!#52;!#53;!#85]~[#6;!-]
+# very general, includes all metals covalently bound to carbon
+
+
+# I.3: Two Carbon-Hetero Bonds (Carbonyl and Derivatives)
+# ----------------------------
+
+# I.3.1 Double Bond to Hetero
+
+Aldehyde: [$([CX3H][#6]),$([CX3H2])]=[OX1]
+# hits aldehydes including formaldehyde
+
+Ketone: [#6][CX3](=[OX1])[#6]
+# does not include oxo-groups connected to a (hetero-) aromatic ring
+
+Thioaldehyde: [$([CX3H][#6]),$([CX3H2])]=[SX1]
+
+Thioketone: [#6][CX3](=[SX1])[#6]
+# does not include thioxo-groups connected to a (hetero-) aromatic ring
+
+Imine: [NX2;$([N][#6]),$([NH]);!$([N][CX3]=[#7,#8,#15,#16])]=[CX3;$([CH2]),$([CH][#6]),$([C]([#6])[#6])]
+# nitrogen is not part of an amide-like structure, nor of an aromatic ring, but can be part of an aminal or similar
+
+Immonium: [NX3+;!$([N][!#6]);!$([N][CX3]=[#7,#8,#15,#16])]
+
+Oxime: [NX2](=[CX3;$([CH2]),$([CH][#6]),$([C]([#6])[#6])])[OX2H]
+
+Oximether: [NX2](=[CX3;$([CH2]),$([CH][#6]),$([C]([#6])[#6])])[OX2][#6;!$(C=[#7,#8])]
+# ether, not ester or amide; does not hit isoxazole
+
+
+# I.3.2. Two Single Bonds to Hetero
+
+Acetal: [OX2]([#6;!$(C=[O,S,N])])[CX4;!$(C(O)(O)[!#6])][OX2][#6;!$(C=[O,S,N])]
+# does not hit hydroxy-methylesters, ketenacetals, hemiacetals, orthoesters, etc.
+
+Hemiacetal: [OX2H][CX4;!$(C(O)(O)[!#6])][OX2][#6;!$(C=[O,S,N])]
+
+Aminal: [NX3v3;!$(NC=[#7,#8,#15,#16])]([#6])[CX4;!$(C(N)(N)[!#6])][NX3v3;!$(NC=[#7,#8,#15,#16])][#6]
+# Ns are not part of an amide or similar. v3 is to exclude nitro and similar groups
+
+Hemiaminal: [NX3v3;!$(NC=[#7,#8,#15,#16])]([#6])[CX4;!$(C(N)(N)[!#6])][OX2H]
+
+Thioacetal: [SX2]([#6;!$(C=[O,S,N])])[CX4;!$(C(S)(S)[!#6])][SX2][#6;!$(C=[O,S,N])]
+
+Thiohemiacetal: [SX2]([#6;!$(C=[O,S,N])])[CX4;!$(C(S)(S)[!#6])][OX2H]
+
+Halogen_acetal_like: [NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][FX1,ClX1,BrX1,IX1]
+# hits chloromethylenethers and other reactive alkylating agents
+
+Acetal_like: [NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][FX1,ClX1,BrX1,IX1,NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])]
+# includes all of the above and other combinations (S-C-N, hydrates, ...), but still no aminomethylenesters and similar
+
+Halogenmethylen_ester_and_similar: [NX3v3,SX2,OX2;$(**=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][FX1,ClX1,BrX1,IX1]
+# also reactive alkylating agents. Acid does not have to be carboxylic acid, also S- and P-based acids allowed
+
+NOS_methylen_ester_and_similar: [NX3v3,SX2,OX2;$(**=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])]
+# Same as above, but N,O or S instead of halogen. Ester/amide allowed only on one side
+
+Hetero_methylen_ester_and_similar: [NX3v3,SX2,OX2;$(**=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][FX1,ClX1,BrX1,IX1,NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])]
+# Combination of the last two patterns
+
+Cyanhydrine: [NX1]#[CX2][CX4;$([CH2]),$([CH]([CX2])[#6]),$(C([CX2])([#6])[#6])][OX2H]
+
+
+# I.3.3 Single Bond to Hetero, C=C Double Bond (Enols and Similar)
+
+Chloroalkene: [ClX1][CX3]=[CX3]
+
+Fluoroalkene: [FX1][CX3]=[CX3]
+
+Bromoalkene: [BrX1][CX3]=[CX3]
+
+Iodoalkene: [IX1][CX3]=[CX3]
+
+Enol: [OX2H][CX3;$([H1]),$(C[#6])]=[CX3]
+# no phenols
+
+Endiol: [OX2H][CX3;$([H1]),$(C[#6])]=[CX3;$([H1]),$(C[#6])][OX2H]
+# no 1,2-diphenols, ketenacetals, ...
+
+Enolether: [OX2]([#6;!$(C=[N,O,S])])[CX3;$([H0][#6]),$([H1])]=[CX3]
+# also finds endiol diethers, but not enolesters; no aromatics
+
+Enolester: [OX2]([CX3]=[OX1])[#6X3;$([#6][#6]),$([H1])]=[#6X3;!$(C[OX2H])]
+
+
+Enamine: [NX3;$([NH2][CX3]),$([NH1]([CX3])[#6]),$([N]([CX3])([#6])[#6]);!$([N]*=[#7,#8,#15,#16])][CX3;$([CH]),$([C][#6])]=[CX3]
+# does not hit amines attached to aromatic rings, nor may the nitrogen be aromatic
+
+Thioenol: [SX2H][CX3;$([H1]),$(C[#6])]=[CX3]
+
+Thioenolether: [SX2]([#6;!$(C=[N,O,S])])[CX3;$(C[#6]),$([CH])]=[CX3]
+
+
+# I.4: Three Carbon-Hetero Bonds (Carboxyl and Derivatives)
+# ------------------------------
+
+Acylchloride: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[ClX1]
+
+Acylfluoride: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[FX1]
+
+Acylbromide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[BrX1]
+
+Acyliodide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[IX1]
+
+Acylhalide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[FX1,ClX1,BrX1,IX1]
+# all of the above
+
+
+# The following contains all simple carboxylic combinations of O, N, S, & Hal -
+# - acids, esters, amides, ... as well as a few extra cases (anhydride, hydrazide...)
+# Cyclic structures (including aromatics) like lactones, lactams, ... got their own
+# definitions. Structures where both heteroatoms are part of an aromatic ring
+# (oxazoles, imidazoles, ...) were excluded.
+
+Carboxylic_acid: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[$([OX2H]),$([OX1-])]
+# includes carboxylate anions
+
+Carboxylic_ester: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[OX2][#6;!$(C=[O,N,S])]
+# does not hit anhydrides or lactones
+
+Lactone: [#6][#6X3R](=[OX1])[#8X2][#6;!$(C=[O,N,S])]
+# may also be aromatic
+
+Carboxylic_anhydride: [CX3;$([H0][#6]),$([H1])](=[OX1])[#8X2][CX3;$([H0][#6]),$([H1])](=[OX1])
+# anhydride formed by two carboxylic acids, no mixed anhydrides (e.g. between carboxylic acid and sulfuric acid); may be part of a ring, even aromatic
+
+Carboxylic_acid_derivative: [$([#6X3H0][#6]),$([#6X3H])](=[!#6])[!#6]
+# includes most of the structures of I.4 and many more, also 1,3-heteroaromatics such as isoxazole
+
+Carbothioic_acid: [CX3;!R;$([C][#6]),$([CH]);$([C](=[OX1])[$([SX2H]),$([SX1-])]),$([C](=[SX1])[$([OX2H]),$([OX1-])])]
+# hits both tautomeric forms, as well as anions
+
+Carbothioic_S_ester: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[SX2][#6;!$(C=[O,N,S])]
+
+Carbothioic_S_lactone: [#6][#6X3R](=[OX1])[#16X2][#6;!$(C=[O,N,S])]
+# may also be aromatic
+
+Carbothioic_O_ester: [CX3;$([H0][#6]),$([H1])](=[SX1])[OX2][#6;!$(C=[O,N,S])]
+
+Carbothioic_O_lactone: [#6][#6X3R](=[SX1])[#8X2][#6;!$(C=[O,N,S])]
+
+Carbothioic_halide: [CX3;$([H0][#6]),$([H1])](=[SX1])[FX1,ClX1,BrX1,IX1]
+
+Carbodithioic_acid: [CX3;!R;$([C][#6]),$([CH]);$([C](=[SX1])[SX2H])]
+
+Carbodithioic_ester: [CX3;!R;$([C][#6]),$([CH]);$([C](=[SX1])[SX2][#6;!$(C=[O,N,S])])]
+
+Carbodithiolactone: [#6][#6X3R](=[SX1])[#16X2][#6;!$(C=[O,N,S])]
+
+
+Amide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+# does not hit lactams
+
+Primary_amide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[NX3H2]
+
+Secondary_amide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H1][#6;!$(C=[O,N,S])]
+
+Tertiary_amide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H0]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])]
+
+Lactam: [#6R][#6X3R](=[OX1])[#7X3;$([H1][#6;!$(C=[O,N,S])]),$([H0]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+# cyclic amides, may also be aromatic
+
+Alkyl_imide: [#6X3;$([H0][#6]),$([H1])](=[OX1])[#7X3H0]([#6])[#6X3;$([H0][#6]),$([H1])](=[OX1])
+# may be part of a ring, even aromatic. only C allowed at central N. May also be triacyl amide
+
+N_hetero_imide: [#6X3;$([H0][#6]),$([H1])](=[OX1])[#7X3H0]([!#6])[#6X3;$([H0][#6]),$([H1])](=[OX1])
+# everything else than H or C at central N
+
+Imide_acidic: [#6X3;$([H0][#6]),$([H1])](=[OX1])[#7X3H1][#6X3;$([H0][#6]),$([H1])](=[OX1])
+# can be deprotonated
+
+Thioamide: [$([CX3;!R][#6]),$([CX3H;!R])](=[SX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+# does not hit thiolactams
+
+Thiolactam: [#6R][#6X3R](=[SX1])[#7X3;$([H1][#6;!$(C=[O,N,S])]),$([H0]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+# cyclic thioamides, may also be aromatic
+
+
+Oximester: [#6X3;$([H0][#6]),$([H1])](=[OX1])[#8X2][#7X2]=,:[#6X3;$([H0]([#6])[#6]),$([H1][#6]),$([H2])]
+# may also be part of a ring / aromatic
+
+Amidine: [NX3;!$(NC=[O,S])][CX3;$([CH]),$([C][#6])]=[NX2;!$(NC=[O,S])]
+# only basic amidines, not as part of aromatic ring (e.g. imidazole)
+
+Hydroxamic_acid: [CX3;$([H0][#6]),$([H1])](=[OX1])[#7X3;$([H1]),$([H0][#6;!$(C=[O,N,S])])][$([OX2H]),$([OX1-])]
+
+Hydroxamic_acid_ester: [CX3;$([H0][#6]),$([H1])](=[OX1])[#7X3;$([H1]),$([H0][#6;!$(C=[O,N,S])])][OX2][#6;!$(C=[O,N,S])]
+# does not hit anhydrides of carboxylic acids with hydroxamic acids
+
+
+Imidoacid: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[$([OX2H]),$([OX1-])]
+# not cyclic
+
+Imidoacid_cyclic: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[$([OX2H]),$([OX1-])]
+# the enamide-form of lactams. may be aromatic like 2-hydroxypyridine
+
+Imidoester: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[OX2][#6;!$(C=[O,N,S])]
+# esters of the above structures. no anhydrides.
+
+Imidolactone: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[OX2][#6;!$(C=[O,N,S])]
+# no oxazoles and similar
+
+Imidothioacid: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[$([SX2H]),$([SX1-])]
+# not cyclic
+
+Imidothioacid_cyclic: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[$([SX2H]),$([SX1-])]
+# the enamide-form of thiolactams. may be aromatic like 2-thiopyridine
+
+Imidothioester: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[SX2][#6;!$(C=[O,N,S])]
+# thioesters of the above structures. no anhydrides.
+
+Imidothiolactone: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[SX2][#6;!$(C=[O,N,S])]
+# no thioxazoles and similar
+
+Amidine: [#7X3v3;!$(N([#6X3]=[#7X2])C=[O,S])][CX3R0;$([H1]),$([H0][#6])]=[NX2v3;!$(N(=[#6X3][#7X3])C=[O,S])]
+# only basic amidines, not substituted by carbonyl or thiocarbonyl, not as part of a ring
+
+Imidolactam: [#6][#6X3R;$([H0](=[NX2;!$(N(=[#6X3][#7X3])C=[O,S])])[#7X3;!$(N([#6X3]=[#7X2])C=[O,S])]),$([H0](-[NX3;!$(N([#6X3]=[#7X2])C=[O,S])])=,:[#7X2;!$(N(=[#6X3][#7X3])C=[O,S])])]
+# one of the two C~N bonds is part of a ring (may be aromatic), but not both - thus no imidazole
+
+Imidoylhalide: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[FX1,ClX1,BrX1,IX1]
+# not cyclic
+
+Imidoylhalide_cyclic: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[FX1,ClX1,BrX1,IX1]
+# may also be aromatic
+
+# may be ring, aromatic, substituted with carbonyls, hetero, ...
+# (everything else would get too complicated)
+
+Amidrazone: [$([$([#6X3][#6]),$([#6X3H])](=[#7X2v3])[#7X3v3][#7X3v3]),$([$([#6X3][#6]),$([#6X3H])]([#7X3v3])=[#7X2v3][#7X3v3])]
+# hits both tautomers. as above, it may be ring, aromatic, substituted with carbonyls, hetero, ...
+
+
+Alpha_aminoacid: [NX3,NX4+;!$([N]~[!#6]);!$([N]*~[#7,#8,#15,#16])][C][CX3](=[OX1])[OX2H,OX1-]
+# N may be alkylated, but not part of an amide (as in peptides), ionic forms are included
+# includes also non-natural aminoacids with double-bonded or two aliph./arom. substituents at alpha-C
+# N may not be aromatic as in 1H-pyrrole-2-carboxylic acid
+
+Alpha_hydroxyacid: [OX2H][C][CX3](=[OX1])[OX2H,OX1-]
+
+Peptide_middle: [NX3;$([N][CX3](=[OX1])[C][NX3,NX4+])][C][CX3](=[OX1])[NX3;$([N][C][CX3](=[OX1])[NX3,OX2,OX1-])]
+# finds peptidic structures which are neither C- nor N-terminal. Both neighbours must be amino-acids/peptides
+
+Peptide_C_term: [NX3;$([N][CX3](=[OX1])[C][NX3,NX4+])][C][CX3](=[OX1])[OX2H,OX1-]
+# finds C-terminal amino acids
+
+Peptide_N_term: [NX3,NX4+;!$([N]~[!#6]);!$([N]*~[#7,#8,#15,#16])][C][CX3](=[OX1])[NX3;$([N][C][CX3](=[OX1])[NX3,OX2,OX1-])]
+# finds N-terminal amino acids. As above, N may be substituted, but not part of an amide-bond.
+
+
+Carboxylic_orthoester: [#6][OX2][CX4;$(C[#6]),$([CH])]([OX2][#6])[OX2][#6]
+# also hits anhydride-like structures (e.g. HC(OMe)2-OC=O residues)
+
+Ketene: [CX3]=[CX2]=[OX1]
+
+Ketenacetal: [#7X2,#8X3,#16X2;$(*[#6,#14])][#6X3]([#7X2,#8X3,#16X2;$(*[#6,#14])])=[#6X3]
+# includes aminals, silylacetals, ketenesters, etc. C=C DB is not aromatic, everything else may be
+
+Nitrile: [NX1]#[CX2]
+# includes cyanhydrines
+
+Isonitrile: [CX1-]#[NX2+]
+
+
+Vinylogous_carbonyl_or_carboxyl_derivative: [#6X3](=[OX1])[#6X3]=,:[#6X3][#7,#8,#16,F,Cl,Br,I]
+# may be part of a ring, even aromatic
+
+Vinylogous_acid: [#6X3](=[OX1])[#6X3]=,:[#6X3][$([OX2H]),$([OX1-])]
+
+Vinylogous_ester: [#6X3](=[OX1])[#6X3]=,:[#6X3][#6;!$(C=[O,N,S])]
+
+Vinylogous_amide: [#6X3](=[OX1])[#6X3]=,:[#6X3][#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+
+Vinylogous_halide: [#6X3](=[OX1])[#6X3]=,:[#6X3][FX1,ClX1,BrX1,IX1]
+
+
+
+# I.5: Four Carbon-Hetero Bonds (Carbonic Acid and Derivatives)
+# -----------------------------
+
+Carbonic_acid_dieester: [#6;!$(C=[O,N,S])][#8X2][#6X3](=[OX1])[#8X2][#6;!$(C=[O,N,S])]
+# may be part of a ring, even aromatic
+
+Carbonic_acid_esterhalide: [#6;!$(C=[O,N,S])][OX2;!R][CX3](=[OX1])[OX2][FX1,ClX1,BrX1,IX1]
+
+Carbonic_acid_monoester: [#6;!$(C=[O,N,S])][OX2;!R][CX3](=[OX1])[$([OX2H]),$([OX1-])]
+# unstable
+
+Carbonic_acid_derivatives: [!#6][#6X3](=[!#6])[!#6]
+
+
+Thiocarbonic_acid_dieester: [#6;!$(C=[O,N,S])][#8X2][#6X3](=[SX1])[#8X2][#6;!$(C=[O,N,S])]
+# may be part of a ring, even aromatic
+
+Thiocarbonic_acid_esterhalide: [#6;!$(C=[O,N,S])][OX2;!R][CX3](=[SX1])[OX2][FX1,ClX1,BrX1,IX1]
+
+Thiocarbonic_acid_monoester: [#6;!$(C=[O,N,S])][OX2;!R][CX3](=[SX1])[$([OX2H]),$([OX1-])]
+
+
+Urea:[#7X3;!$([#7][!#6])][#6X3](=[OX1])[#7X3;!$([#7][!#6])]
+# no check whether part of imide, biuret, etc. Aromatic structures are only hit if
+# both N share no double bonds, like in the dioxo-form of uracil
+
+Thiourea: [#7X3;!$([#7][!#6])][#6X3](=[SX1])[#7X3;!$([#7][!#6])]
+
+Isourea: [#7X2;!$([#7][!#6])]=,:[#6X3]([#8X2&!$([#8][!#6]),OX1-])[#7X3;!$([#7][!#6])]
+# O may be substituted. no check whether further amide-like bonds are present. Aromatic
+# structures are only hit if single bonded N shares no additional double bond, like in
+# the 1-hydroxy-3-oxo form of uracil
+
+Isothiourea: [#7X2;!$([#7][!#6])]=,:[#6X3]([#16X2&!$([#16][!#6]),SX1-])[#7X3;!$([#7][!#6])]
+
+Guanidine: [N;v3X3,v4X4+][CX3](=[N;v3X2,v4X3+])[N;v3X3,v4X4+]
+# also hits guanidinium salts. v3 and v4 to avoid nitroamidines
+
+Carbaminic_acid: [NX3]C(=[OX1])[O;X2H,X1-]
+# quite unstable, unlikely to be found. Also hits salts
+
+Urethan: [#7X3][#6](=[OX1])[#8X2][#6]
+# also hits when part of a ring, no check whether the last C is part of carbonyl
+
+Biuret: [#7X3][#6](=[OX1])[#7X3][#6](=[OX1])[#7X3]
+
+Semicarbazide: [#7X3][#7X3][#6X3]([#7X3;!$([#7][#7])])=[OX1]
+
+Carbazide: [#7X3][#7X3][#6X3]([#7X3][#7X3])=[OX1]
+
+Semicarbazone: [#7X2](=[#6])[#7X3][#6X3]([#7X3;!$([#7][#7])])=[OX1]
+
+Carbazone: [#7X2](=[#6])[#7X3][#6X3]([#7X3][#7X3])=[OX1]
+
+Thiosemicarbazide: [#7X3][#7X3][#6X3]([#7X3;!$([#7][#7])])=[SX1]
+
+Thiocarbazide: [#7X3][#7X3][#6X3]([#7X3][#7X3])=[SX1]
+
+Thiosemicarbazone: [#7X2](=[#6])[#7X3][#6X3]([#7X3;!$([#7][#7])])=[SX1]
+
+Thiocarbazone: [#7X2](=[#6])[#7X3][#6X3]([#7X3][#7X3])=[SX1]
+
+
+Isocyanate: [NX2]=[CX2]=[OX1]
+
+Cyanate: [OX2][CX2]#[NX1]
+
+Isothiocyanate: [NX2]=[CX2]=[SX1]
+
+Thiocyanate: [SX2][CX2]#[NX1]
+
+Carbodiimide: [NX2]=[CX2]=[NX2]
+
+Orthocarbonic_derivatives: [CX4H0]([O,S,#7])([O,S,#7])([O,S,#7])[O,S,#7,F,Cl,Br,I]
+# halogen allowed just once, to avoid mapping to -OCF3 and similar groups (much more
+# stable as for example C(OCH3)4)
+
+
+# I.6 Aromatics
+# -------------
+
+# I know that this classification is not very logical, arylamines are found under I.2 ...
+
+Phenol: [OX2H][c]
+
+1,2-Diphenol: [OX2H][c][c][OX2H]
+
+Arylchloride: [Cl][c]
+
+Arylfluoride: [F][c]
+
+Arylbromide: [Br][c]
+
+Aryliodide: [I][c]
+
+Arylthiol: [SX2H][c]
+
+Iminoarene: [c]=[NX2;$([H1]),$([H0][#6;!$([C]=[N,S,O])])]
+# N may be substituted with H or C, but not carbonyl or similar
+# aromatic atom is always C, not S or P (these are not planar when substituted)
+
+Oxoarene: [c]=[OX1]
+
+Thioarene: [c]=[SX1]
+
+Hetero_N_basic_H: [nX3H1+0]
+# as in pyrrole. uncharged to exclude pyridinium ions
+
+Hetero_N_basic_no_H: [nX3H0+0]
+# as in N-methylpyrrole. uncharged to exclude pyridinium ions
+
+Hetero_N_nonbasic: [nX2,nX3+]
+# as in pyridine, pyridinium
+
+Hetero_O: [o]
+
+Hetero_S: [sX2]
+# X2 because Daylight's depictmatch falsely describes C1=CS(=O)C=C1 as aromatic
+# (is not planar because of lonepair at S)
+
+Heteroaromatic: [a;!c]
+
+
+# Part II: N, S, P, Si, B
+# =======================
+
+
+# II.1 Nitrogen
+# -------------
+
+Nitrite: [NX2](=[OX1])[O;$([X2]),$([X1-])]
+# hits nitrous acid, its anion, esters, and other O-substituted derivatives
+
+Thionitrite: [SX2][NX2]=[OX1]
+
+Nitrate: [$([NX3](=[OX1])(=[OX1])[O;$([X2]),$([X1-])]),$([NX3+]([OX1-])(=[OX1])[O;$([X2]),$([X1-])])]
+# hits nitric acid, its anion, esters, and other O-substituted derivatives
+
+Nitro: [$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]
+# hits nitro groups attached to C,N, ... but not nitrates
+
+Nitroso: [NX2](=[OX1])[!#7;!#8]
+# no nitrites, no nitrosamines
+
+Azide: [NX1]~[NX2]~[NX2,NX1]
+# hits both mesomeric forms, also anion
+
+Acylazide: [CX3](=[OX1])[NX2]~[NX2]~[NX1]
+
+Diazo: [$([#6]=[NX2+]=[NX1-]),$([#6-]-[NX2+]#[NX1])]
+
+Diazonium: [#6][NX2+]#[NX1]
+
+Nitrosamine: [#7;!$(N*=O)][NX2]=[OX1]
+
+Nitrosamide: [NX2](=[OX1])N-*=O
+# includes nitrososulfonamides
+
+N-Oxide: [$([#7+][OX1-]),$([#7v5]=[OX1]);!$([#7](~[O])~[O]);!$([#7]=[#7])]
+# Hits both forms. Won't hit azoxy, nitro, nitroso, or nitrate.
+
+
+Hydrazine: [NX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6]);!$(NC=[O,N,S])][NX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6]);!$(NC=[O,N,S])]
+# no hydrazides
+
+Hydrazone: [NX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6]);!$(NC=[O,N,S])][NX2]=[#6]
+
+Hydroxylamine: [NX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6]);!$(NC=[O,N,S])][OX2;$([H1]),$(O[#6;!$(C=[N,O,S])])]
+# no discrimination between O-, N-, and O,N-substitution
+
+
+# II.2 Sulfur
+# -----------
+
+Sulfon: [$([SX4](=[OX1])(=[OX1])([#6])[#6]),$([SX4+2]([OX1-])([OX1-])([#6])[#6])]
+# can't be aromatic, thus S and not #16
+
+Sulfoxide: [$([SX3](=[OX1])([#6])[#6]),$([SX3+]([OX1-])([#6])[#6])]
+
+Sulfonium: [S+;!$([S]~[!#6]);!$([S]*~[#7,#8,#15,#16])]
+# can't be aromatic, thus S and not #16
+
+Sulfuric_acid: [SX4](=[OX1])(=[OX1])([$([OX2H]),$([OX1-])])[$([OX2H]),$([OX1-])]
+# includes anions
+
+Sulfuric_monoester: [SX4](=[OX1])(=[OX1])([$([OX2H]),$([OX1-])])[OX2][#6;!$(C=[O,N,S])]
+
+Sulfuric_diester: [SX4](=[OX1])(=[OX1])([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])]
+
+Sulfuric_monoamide: [SX4](=[OX1])(=[OX1])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[$([OX2H]),$([OX1-])]
+
+Sulfuric_diamide: [SX4](=[OX1])(=[OX1])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+
+Sulfuric_esteramide: [SX4](=[OX1])(=[OX1])([#7X3][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])]
+
+Sulfuric_derivative: [SX4D4](=[!#6])(=[!#6])([!#6])[!#6]
+# everything else (would not be a "true" derivative of sulfuric acid, if one of the substituents were less electronegative
+# than sulfur, but this should be very very rare, anyway)
+
+
+
+#### sulfurous acid and derivatives missing!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+
+
+
+Sulfonic_acid: [SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[$([OX2H]),$([OX1-])]
+
+Sulfonamide: [SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+
+Sulfonic_ester: [SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[OX2][#6;!$(C=[O,N,S])]
+
+Sulfonic_halide: [SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[FX1,ClX1,BrX1,IX1]
+
+Sulfonic_derivative: [SX4;$([H1]),$([H0][#6])](=[!#6])(=[!#6])[!#6]
+# includes all of the above and many more
+# for comparison: this is what "all sulfonic derivatives but not the ones above" would look like:
+# [$([SX4;$([H1]),$([H0][#6])](=[!#6])(=[!#6;!O])[!#6]),$([SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[!$([FX1,ClX1,BrX1,IX1]);!$([#6]);!$([OX2H]);!$([OX1-]);!$([OX2][#6;!$(C=[O,N,S])]);!$([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])])]
+
+
+Sulfinic_acid: [SX3;$([H1]),$([H0][#6])](=[OX1])[$([OX2H]),$([OX1-])]
+
+Sulfinic_amide: [SX3;$([H1]),$([H0][#6])](=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+
+Sulfinic_ester: [SX3;$([H1]),$([H0][#6])](=[OX1])[OX2][#6;!$(C=[O,N,S])]
+
+Sulfinic_halide: [SX3;$([H1]),$([H0][#6])](=[OX1])[FX1,ClX1,BrX1,IX1]
+
+Sulfinic_derivative: [SX3;$([H1]),$([H0][#6])](=[!#6])[!#6]
+
+Sulfenic_acid: [SX2;$([H1]),$([H0][#6])][$([OX2H]),$([OX1-])]
+
+Sulfenic_amide: [SX2;$([H1]),$([H0][#6])][#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+
+Sulfenic_ester: [SX2;$([H1]),$([H0][#6])][OX2][#6;!$(C=[O,N,S])]
+
+Sulfenic_halide: [SX2;$([H1]),$([H0][#6])][FX1,ClX1,BrX1,IX1]
+
+Sulfenic_derivative: [SX2;$([H1]),$([H0][#6])][!#6]
+
+
+# II.3 Phosphorous
+# ----------------
+
+Phosphine: [PX3;$([H3]),$([H2][#6]),$([H1]([#6])[#6]),$([H0]([#6])([#6])[#6])]
+# similar to amine, but less restrictive: includes also amide- and aminal-analogues
+
+Phosphine_oxide: [PX4;$([H3]=[OX1]),$([H2](=[OX1])[#6]),$([H1](=[OX1])([#6])[#6]),$([H0](=[OX1])([#6])([#6])[#6])]
+
+Phosphonium: [P+;!$([P]~[!#6]);!$([P]*~[#7,#8,#15,#16])]
+# similar to Ammonium
+
+Phosphorylen: [PX4;$([H3]=[CX3]),$([H2](=[CX3])[#6]),$([H1](=[CX3])([#6])[#6]),$([H0](=[CX3])([#6])([#6])[#6])]
+
+
+# conventions for the following acids and derivatives:
+# acids find protonated and deprotonated acids
+# esters do not find mixed anhydrides ( ...P-O-C(=O))
+# derivatives: substituents which go in place of the OH and =O are not H or C (may also be O,
+# thus including acids and esters)
+
+Phosphonic_acid: [PX4;$([H1]),$([H0][#6])](=[OX1])([$([OX2H]),$([OX1-])])[$([OX2H]),$([OX1-])]
+# includes anions
+
+Phosphonic_monoester: [PX4;$([H1]),$([H0][#6])](=[OX1])([$([OX2H]),$([OX1-])])[OX2][#6;!$(C=[O,N,S])]
+
+Phosphonic_diester: [PX4;$([H1]),$([H0][#6])](=[OX1])([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])]
+
+Phosphonic_monoamide: [PX4;$([H1]),$([H0][#6])](=[OX1])([$([OX2H]),$([OX1-])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+
+Phosphonic_diamide: [PX4;$([H1]),$([H0][#6])](=[OX1])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+
+Phosphonic_esteramide: [PX4;$([H1]),$([H0][#6])](=[OX1])([OX2][#6;!$(C=[O,N,S])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+
+Phosphonic_acid_derivative: [PX4;$([H1]),$([H0][#6])](=[!#6])([!#6])[!#6]
+# all of the above and much more
+
+
+Phosphoric_acid: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([$([OX2H]),$([OX1-])])[$([OX2H]),$([OX1-])]
+# includes anions
+
+Phosphoric_monoester: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([$([OX2H]),$([OX1-])])[OX2][#6;!$(C=[O,N,S])]
+
+Phosphoric_diester: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])]
+
+Phosphoric_triester: [PX4D4](=[OX1])([OX2][#6;!$(C=[O,N,S])])([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])]
+
+Phosphoric_monoamide: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([$([OX2H]),$([OX1-])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+
+Phosphoric_diamide: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+
+Phosphoric_triamide: [PX4D4](=[OX1])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+
+Phosphoric_monoestermonoamide: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([OX2][#6;!$(C=[O,N,S])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+
+Phosphoric_diestermonoamide: [PX4D4](=[OX1])([OX2][#6;!$(C=[O,N,S])])([OX2][#6;!$(C=[O,N,S])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+
+Phosphoric_monoesterdiamide: [PX4D4](=[OX1])([OX2][#6;!$(C=[O,N,S])])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+
+Phosphoric_acid_derivative: [PX4D4](=[!#6])([!#6])([!#6])[!#6]
+
+
+Phosphinic_acid: [PX4;$([H2]),$([H1][#6]),$([H0]([#6])[#6])](=[OX1])[$([OX2H]),$([OX1-])]
+
+Phosphinic_ester: [PX4;$([H2]),$([H1][#6]),$([H0]([#6])[#6])](=[OX1])[OX2][#6;!$(C=[O,N,S])]
+
+Phosphinic_amide: [PX4;$([H2]),$([H1][#6]),$([H0]([#6])[#6])](=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+
+Phosphinic_acid_derivative: [PX4;$([H2]),$([H1][#6]),$([H0]([#6])[#6])](=[!#6])[!#6]
+
+
+Phosphonous_acid: [PX3;$([H1]),$([H0][#6])]([$([OX2H]),$([OX1-])])[$([OX2H]),$([OX1-])]
+
+Phosphonous_monoester: [PX3;$([H1]),$([H0][#6])]([$([OX2H]),$([OX1-])])[OX2][#6;!$(C=[O,N,S])]
+
+Phosphonous_diester: [PX3;$([H1]),$([H0][#6])]([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])]
+
+Phosphonous_monoamide: [PX3;$([H1]),$([H0][#6])]([$([OX2H]),$([OX1-])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+
+Phosphonous_diamide: [PX3;$([H1]),$([H0][#6])]([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+
+Phosphonous_esteramide: [PX3;$([H1]),$([H0][#6])]([OX2][#6;!$(C=[O,N,S])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+
+Phosphonous_derivatives: [PX3;$([D2]),$([D3][#6])]([!#6])[!#6]
+
+
+Phosphinous_acid: [PX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6])][$([OX2H]),$([OX1-])]
+
+Phosphinous_ester: [PX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6])][OX2][#6;!$(C=[O,N,S])]
+
+Phosphinous_amide: [PX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6])][#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]
+
+Phosphinous_derivatives: [PX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6])][!#6]
+
+
+# II.4 Silicon
+# ------------
+
+Quart_silane: [SiX4]([#6])([#6])([#6])[#6]
+# four C-substituents. non-reactive, non-toxic, in experimental phase for drug development
+
+Non-quart_silane: [SiX4;$([H1]([#6])([#6])[#6]),$([H2]([#6])[#6]),$([H3][#6]),$([H4])]
+# has 1-4 hydride(s), reactive. Daylight's depictmatch does not add hydrogens automatically to
+# the free positions at Si, thus Hs had to be added implicitly
+
+Silylmonohalide: [SiX4]([FX1,ClX1,BrX1,IX1])([#6])([#6])[#6]
+# reagents for inserting protection groups
+
+Het_trialkylsilane: [SiX4]([!#6])([#6])([#6])[#6]
+# mostly acid-labile protection groups such as trimethylsilyl-ethers
+
+Dihet_dialkylsilane: [SiX4]([!#6])([!#6])([#6])[#6]
+
+Trihet_alkylsilane: [SiX4]([!#6])([!#6])([!#6])[#6]
+
+Silicic_acid_derivative: [SiX4]([!#6])([!#6])([!#6])[!#6]
+# four substituents which are neither C nor H
+
+
+# II.5 Boron
+# ----------
+
+Trialkylborane: [BX3]([#6])([#6])[#6]
+# also carbonyls allowed
+
+Boric_acid_derivatives: [BX3]([!#6])([!#6])[!#6]
+# includes acids, esters, amides, ... H-substituent at B is very rare.
+
+Boronic_acid_derivative: [BX3]([!#6])([!#6])[!#6]
+# includes acids, esters, amides, ...
+
+Borohydride: [BH1,BH2,BH3,BH4]
+# at least one H attached to B
+
+Quaternary_boron: [BX4]
+# mostly borates (negative charge), in complex with Lewis-base
+
+
+
+# Part III: Some Special Patterns
+# ===============================
+
+
+# III.1 Chains
+# ------------
+
+# some simple chains
+
+
+
+# III.2 Rings
+# -----------
+
+Aromatic: a
+
+Heterocyclic: [!#6;!R0]
+# may be aromatic or not
+
+Epoxide: [OX2r3]1[#6r3][#6r3]1
+# toxic/reactive. may be annelated to an aromatic ring, but must not be aromatic itself (oxirane-2,3-dione)
+
+NH_aziridine: [NX3H1r3]1[#6r3][#6r3]1
+# toxic/reactive according to Maybridge's garbage filter
+
+Spiro: [D4R;$(*(@*)(@*)(@*)@*)]
+# at least two different rings can be found which are sharing just one atom.
+# these two rings can be connected by a third ring, so it matches also some
+# bridged systems, like morphine
+
+Annelated_rings: [R;$(*(@*)(@*)@*);!$([R2;$(*(@*)(@*)(@*)@*)])]@[R;$(*(@*)(@*)@*);!$([R2;$(*(@*)(@*)(@*)@*)])]
+# two different rings sharing exactly two atoms
+
+Bridged_rings: [R;$(*(@*)(@*)@*);!$([D4R;$(*(@*)(@*)(@*)@*)]);!$([R;$(*(@*)(@*)@*);!$([R2;$(*(@*)(@*)(@*)@*)])]@[R;$(*(@*)(@*)@*);!$([R2;$(*(@*)(@*)(@*)@*)])])]
+# part of two or more rings, not spiro, not annelated -> finds bridgehead atoms,
+# but only if they are not annelated at the same time - otherwise impossible (?)
+# to distinguish from non-bridgehead annelated atoms
+
+# some basic ring-patterns (just size, no other information):
+
+
+
+
+
+# III.3 Sugars and Nucleosides/Nucleotides, Steroids
+# --------------------------------------------------
+
+# because of the large variety of sugar derivatives, different patterns can be applied.
+# The choice of patterns and their combinations will depend on the contents of the database
+# e.g. natural products, nucleoside analogues with modified sugars, ... as well as on the
+# desired restriction
+
+
+Sugar_pattern_1: [OX2;$([r5]1@C@C@C(O)@C1),$([r6]1@C@C@C(O)@C(O)@C1)]
+# 5 or 6-membered ring containing one O and at least one (r5) or two (r6) oxygen-substituents.
+
+Sugar_pattern_2: [OX2;$([r5]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)]
+# 5 or 6-membered ring containing one O and an acetal-like bond at position 2.
+
+Sugar_pattern_combi: [OX2;$([r5]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C(O)@C1),$([r6]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C(O)@C(O)@C1)]
+# combination of the two above
+
+Sugar_pattern_2_reducing: [OX2;$([r5]1@C(!@[OX2H1])@C@C@C1),$([r6]1@C(!@[OX2H1])@C@C@C@C1)]
+# 5 or 6-membered cyclic hemi-acetal
+
+Sugar_pattern_2_alpha: [OX2;$([r5]1@[C@@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@[C@@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)]
+# 5 or 6-membered cyclic hemi-acetal
+
+Sugar_pattern_2_beta: [OX2;$([r5]1@[C@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@[C@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)]
+# 5 or 6-membered cyclic hemi-acetal
+
+##Poly_sugar_1: ([OX2;$([r5]1@C@C@C(O)@C1),$([r6]1@C@C@C(O)@C(O)@C1)].[OX2;$([r5]1@C@C@C(O)@C1),$([r6]1@C@C@C(O)@C(O)@C1)])
+# pattern1 occurs more than once (in the same molecule, but the moieties don't have to be adjacent!)
+
+##Poly_sugar_2: ([OX2;$([r5]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)].[OX2;$([r5]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)])
+# pattern2 occurs more than once (in the same molecule, but the moieties don't have to be adjacent!)
+
+
+# III.4 Everything else...
+# ------------------------
+
+Conjugated_double_bond: *=*[*]=,#,:[*]
+
+Conjugated_tripple_bond: *#*[*]=,#,:[*]
+
+Cis_double_bond: */[D2]=[D2]\*
+# only one single-bonded substituent on each DB-atom. no aromatics.
+# only found when the character of the DB is explicitly stated.
+
+Trans_double_bond: */[D2]=[D2]/*
+# analog
+
+Mixed_anhydrides: [$(*=O),$([#16,#14,#5]),$([#7]([#6]=[OX1]))][#8X2][$(*=O),$([#16,#14,#5]),$([#7]([#6]=[OX1]))]
+# should hit all combinations of two acids
+
+Halogen_on_hetero: [FX1,ClX1,BrX1,IX1][!#6]
+
+Halogen_multi_subst: [F,Cl,Br,I;!$([X1]);!$([X0-])]
+# Halogen which is not mono-substituted nor an anion, e.g. chlorate.
+# Most of these cases should be also filtered by Halogen_on_hetero.
+
+Trifluoromethyl: [FX1][CX4;!$([H0][Cl,Br,I]);!$([F][C]([F])([F])[F])]([FX1])([FX1])
+# C with three F attached, connected to anything which is not another halogen
+
+C_ONS_bond: [#6]~[#7,#8,#16]
+# probably all drug-like molecules have at least one O, N, or S connected to a C -> nice filter
+
+## Mixture: (*).(*)
+# two or more separate parts, may also be a salt
+# component-level grouping is not yet supported in Open Babel Version 2.0
+
+
+Charged: [!+0]
+
+Anion: [-1,-2,-3,-4,-5,-6,-7]
+
+Kation: [+1,+2,+3,+4,+5,+6,+7]
+
+Salt: ([-1,-2,-3,-4,-5,-6,-7]).([+1,+2,+3,+4,+5,+6,+7])
+# two or more separate components with opposite charges
+
+##Zwitterion: ([-1,-2,-3,-4,-5,-6,-7].[+1,+2,+3,+4,+5,+6,+7])
+# both negative and positive charges somewhere within the same molecule.
+
+1,3-Tautomerizable: [$([#7X2,OX1,SX1]=*[!H0;!$([a;!n])]),$([#7X3,OX2,SX2;!H0]*=*),$([#7X3,OX2,SX2;!H0]*:n)]
+# 1,3 migration of H allowed. Includes keto/enol and amide/enamide.
+# Aromatic rings must stay aromatic - no keto form of phenol
+
+1,5-Tautomerizable: [$([#7X2,OX1,SX1]=,:**=,:*[!H0;!$([a;!n])]),$([#7X3,OX2,SX2;!H0]*=**=*),$([#7X3,OX2,SX2;!H0]*=,:**:n)]
+
+Rotatable_bond: [!$(*#*)&!D1]-!@[!$(*#*)&!D1]
+# taken from http://www.daylight.com/support/contrib/smarts/content.html
+
+Michael_acceptor: [CX3]=[CX3][$([CX3]=[O,N,S]),$(C#[N]),$([S,P]=[OX1]),$([NX3]=O),$([NX3+](=O)[O-])]
+# the classical case: C=C near carbonyl, nitrile, nitro, or similar
+# Oxo-heteroaromatics and similar are not included.
+
+Dicarbodiazene: [CX3](=[OX1])[NX2]=[NX2][CX3](=[OX1])
+# Michael-like acceptor, see Mitsunobu reaction
+
+# H-Bond_donor:
+
+# H-Bond_acceptor:
+
+# Pos_ionizable:
+
+# Neg_ionizable:
+
+# Unlikely_ions:
+# O+,N-,C+,C-, ...
+
+CH-acidic: [$([CX4;!$([H0]);!$(C[!#6;!$([P,S]=O);!$(N(~O)~O)])][$([CX3]=[O,N,S]),$(C#[N]),$([S,P]=[OX1]),$([NX3]=O),$([NX3+](=O)[O-]);!$(*[S,O,N;H1,H2]);!$([*+0][S,O;X1-])]),$([CX4;!$([H0])]1[CX3]=[CX3][CX3]=[CX3]1)]
+# C-H alpha to carbonyl, nitro or similar; C is not double-bonded; only C, H, S,P=O and nitro substituents allowed.
+# pentadiene is included. acids, their salts, prim./sec. amides, and imides are excluded.
+# hits also CH-acidic_strong
+
+CH-acidic_strong: [CX4;!$([H0]);!$(C[!#6;!$([P,S]=O);!$(N(~O)~O)])]([$([CX3]=[O,N,S]),$(C#[N]),$([S,P]=[OX1]),$([NX3]=O),$([NX3+](=O)[O-]);!$(*[S,O,N;H1,H2]);!$([*+0][S,O;X1-])])[$([CX3]=[O,N,S]),$(C#[N]),$([S,P]=[OX1]),$([NX3]=O),$([NX3+](=O)[O-]);!$(*[S,O,N;H1,H2]);!$([*+0][S,O;X1-])]
+# same as above (without pentadiene), but carbonyl or similar on two or three sides
+
+Chiral_center_specified: [$([*@](~*)(~*)(*)*),$([*@H](*)(*)*),$([*@](~*)(*)*),$([*@H](~*)~*)]
+# Hits atoms with tetrahedral chirality, if chiral center is specified in the SMILES string
+# depictmatch does not find oxonium, sulfonium, or sulfoxides!
+
+# Chiral_center_unspecified: [$([*@?](~*)(~*)(*)*),$([*@?H](*)(*)*),$([*@?](~*)(*)*),$([*@?H](~*)~*)]
+# Hits atoms with tetrahedral chirality, if chiral center is not specified in the SMILES string
+# "@?" (unspecified chirality) is not yet supported in Open Babel Version 2.0
+ \ No newline at end of file
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
new file mode 100644
index 0000000..113f847
--- /dev/null
+++ b/lib/algorithm.rb
@@ -0,0 +1,21 @@
+module OpenTox
+
+ module Algorithm
+
+ # Generic method to execute algorithms
+ # Algorithms should:
+ # - accept a Compound, an Array of Compounds or a Dataset as first argument
+ # - accept optional parameters as second argument
+ # - return an object corresponding to the input type as result (e.g. Compound -> value, Array of Compounds -> Array of values, Dataset -> Dataset with values)
+ # @param [String] algorithm Fully qualified algorithm name, e.g. "OpenTox::Algorithm::Fminer.bbrc"
+ # @param [OpenTox::Compound,Array,OpenTox::Dataset] object Input object
+ # @param [Hash] parameters Algorithm parameters
+ # @return Algorithm result
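+ # The example below is only a sketch: dataset_id is a placeholder, and the algorithm string follows
+ # the "Module::Class.method" convention parsed below (Fminer.bbrc is defined in lib/bbrc.rb of this commit).
+ # @example
+ #   training_dataset = OpenTox::Dataset.find dataset_id
+ #   feature_dataset = OpenTox::Algorithm.run "OpenTox::Algorithm::Fminer.bbrc", training_dataset, :min_frequency => 5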
+ def self.run algorithm, object, parameters=nil
+ bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm." unless algorithm =~ /^OpenTox::Algorithm/
+ klass,method = algorithm.split('.')
+ parameters.nil? ? Object.const_get(klass).send(method,object) : Object.const_get(klass).send(method,object, parameters)
+ end
+
+ end
+end
+
diff --git a/lib/bbrc.rb b/lib/bbrc.rb
new file mode 100644
index 0000000..c83b9b3
--- /dev/null
+++ b/lib/bbrc.rb
@@ -0,0 +1,165 @@
+module OpenTox
+ module Algorithm
+ class Fminer
+ TABLE_OF_ELEMENTS = [
+"H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo"]
+
+ #
+ # Run bbrc algorithm on dataset
+ #
+ # @param training_dataset [OpenTox::Dataset] Training dataset
+ # @param params [Hash] Optional BBRC parameters; accepted keys are
+ # - min_frequency Minimum frequency (default 5)
+ # - feature_type Feature type, can be 'paths' or 'trees' (default "trees")
+ # - backbone BBRC classes, pass 'false' to switch off mining for BBRC representatives. (default "true")
+ # - min_chisq_significance Significance threshold (between 0 and 1)
+ # - nr_hits Set to "true" to get hit count instead of presence
+ # - get_target Set to "true" to obtain target variable as feature
+ # @return [OpenTox::Dataset] Fminer Dataset
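+ # The example below is a sketch under assumptions: dataset_id is a placeholder, and the dataset is
+ # expected to hold exactly one prediction feature, as checked below.
+ # @example
+ #   training_dataset = OpenTox::Dataset.find dataset_id
+ #   feature_dataset = OpenTox::Algorithm::Fminer.bbrc training_dataset, :min_frequency => 5
+ #   feature_dataset.feature_ids.size # number of mined BBRC SMARTS features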
+ def self.bbrc training_dataset, params={}
+
+ time = Time.now
+ bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1
+
+ prediction_feature = training_dataset.features.first
+ if params[:min_frequency]
+ minfreq = params[:min_frequency]
+ else
+ per_mil = 5 # value from latest version
+ per_mil = 8 # as suggested below
+ i = training_dataset.feature_ids.index prediction_feature.id
+ nr_labeled_cmpds = training_dataset.data_entries.select{|de| !de[i].nil?}.size
+ minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
+ minfreq = 2 unless minfreq > 2
+ minfreq = minfreq.round
+ end
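+ # worked example of the default above: with 500 labeled compounds, 8 * 500 / 1000.0 = 4.0 -> minfreq = 4;
+ # with 100 labeled compounds the floor applies (0.8 -> 2)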
+
+ @bbrc ||= Bbrc::Bbrc.new
+ @bbrc.Reset
+ if prediction_feature.numeric
+ @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations!
+ else
+ bad_request_error "No accept values for "\
+ "dataset '#{training_dataset.id}' and "\
+ "feature '#{prediction_feature.id}'" unless prediction_feature.accept_values
+ value2act = Hash[[*prediction_feature.accept_values.map.with_index]]
+ end
+ @bbrc.SetMinfreq(minfreq)
+ @bbrc.SetType(1) if params[:feature_type] == "paths"
+ @bbrc.SetBackbone(false) if params[:backbone] == "false"
+ @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance]
+ @bbrc.SetConsoleOut(false)
+
+ params[:nr_hits] ? nr_hits = params[:nr_hits] : nr_hits = false
+ feature_dataset = FminerDataset.new(
+ :training_dataset_id => training_dataset.id,
+ :training_algorithm => "#{self.to_s}.bbrc",
+ :training_feature_id => prediction_feature.id ,
+ :training_parameters => {
+ :min_frequency => minfreq,
+ :nr_hits => nr_hits,
+ :backbone => (params[:backbone] == false ? false : true)
+ }
+
+ )
+ feature_dataset.compounds = training_dataset.compounds
+
+ # add data
+ training_dataset.compounds.each_with_index do |compound,i|
+ act = value2act[training_dataset.data_entries[i].first]
+ if act # TODO check if this works
+ @bbrc.AddCompound(compound.smiles,i+1)
+ @bbrc.AddActivity(act,i+1)
+ end
+ end
+ #g_median=@fminer.all_activities.values.to_scale.median
+
+ #task.progress 10
+ #step_width = 80 / @bbrc.GetNoRootNodes().to_f
+
+ $logger.debug "BBRC setup: #{Time.now-time}"
+ time = Time.now
+ ftime = 0
+ itime = 0
+ rtime = 0
+
+ # run @bbrc
+ (0 .. @bbrc.GetNoRootNodes()-1).each do |j|
+ results = @bbrc.MineRoot(j)
+ results.each do |result|
+ rt = Time.now
+ f = YAML.load(result)[0]
+ smarts = f.shift
+ # convert fminer SMARTS representation into a more human readable format
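+ # e.g. "[#6&a]" becomes "c" (aromatic carbon) and "[#17&A]" becomes "Cl"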
+ smarts.gsub!(%r{\[#(\d+)&(\w)\]}) do
+ element = TABLE_OF_ELEMENTS[$1.to_i-1]
+ $2 == "a" ? element.downcase : element
+ end
+ p_value = f.shift
+ f.flatten!
+ compound_idxs = f.collect{|e| e.first.first-1}
+ # majority class
+ effect = compound_idxs.collect{|i| training_dataset.data_entries[i].first}.mode
+
+=begin
+ if (!@bbrc.GetRegression)
+ id_arrs = f[2..-1].flatten
+ max = OpenTox::Algorithm::Fminer.effect(f[2..-1].reverse, @fminer.db_class_sizes) # f needs reversal for bbrc
+ effect = max+1
+ else #regression part
+ id_arrs = f[2]
+ # DV: effect calculation
+ f_arr=Array.new
+ f[2].each do |id|
+ id=id.keys[0] # extract id from hit count hash
+ f_arr.push(@fminer.all_activities[id])
+ end
+ f_median=f_arr.to_scale.median
+ if g_median >= f_median
+ effect = 'activating'
+ else
+ effect = 'deactivating'
+ end
+ end
+=end
+ rtime += Time.now - rt
+
+ ft = Time.now
+ feature = OpenTox::FminerSmarts.find_or_create_by({
+ "smarts" => smarts,
+ "p_value" => p_value.to_f.abs.round(5),
+ "effect" => effect,
+ "dataset_id" => feature_dataset.id
+ })
+ feature_dataset.feature_ids << feature.id
+ ftime += Time.now - ft
+
+ it = Time.now
+ f.each do |id_count_hash|
+ id_count_hash.each do |id,count|
+ nr_hits ? count = count.to_i : count = 1
+ feature_dataset.data_entries[id-1] ||= []
+ feature_dataset.data_entries[id-1][feature_dataset.feature_ids.size-1] = count
+ end
+ end
+ itime += Time.now - it
+
+ end
+ end
+
+ $logger.debug "Fminer: #{Time.now-time} (read: #{rtime}, iterate: #{itime}, find/create Features: #{ftime})"
+ time = Time.now
+
+ feature_dataset.fill_nil_with 0
+
+ $logger.debug "Prepare save: #{Time.now-time}"
+ time = Time.now
+ feature_dataset.save_all
+
+ $logger.debug "Save: #{Time.now-time}"
+ feature_dataset
+
+ end
+ end
+ end
+end
diff --git a/lib/classification.rb b/lib/classification.rb
new file mode 100644
index 0000000..b4b2e59
--- /dev/null
+++ b/lib/classification.rb
@@ -0,0 +1,110 @@
+module OpenTox
+ module Algorithm
+
+ class Classification
+
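+ # Classification by similarity-weighted majority vote of the neighbors
+ # @param [OpenTox::Compound] compound Query compound
+ # @param [Hash] params Requires :neighbors, an Array of [neighbor, similarity, activities] rows
+ # @return [Hash] :value, :confidence (and :warning if no neighbors were found)
+ # @example Illustrative numbers only (not taken from a real dataset):
+ #   # two neighbors voting "active" with similarities 0.9 and 0.7, one voting "inactive" with 0.5
+ #   # weighted_sum = {"active" => 1.6, "inactive" => 0.5}, sim_sum = 1.6 - 0.5 = 1.1 > 0
+ #   # => {:value => "active", :confidence => (1.1/3).abs} # ~0.37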
+ def self.weighted_majority_vote compound, params
+ neighbors = params[:neighbors]
+ return {:value => nil,:confidence => nil,:warning => "Could not find similar compounds."} if neighbors.empty?
+ weighted_sum = {}
+ sim_sum = 0.0
+ confidence = 0.0
+ neighbors.each do |row|
+ n,sim,acts = row
+ #confidence = sim if sim > confidence # distance to nearest neighbor
+ acts.each do |act|
+ weighted_sum[act] ||= 0
+ weighted_sum[act] += sim
+ end
+ end
+ case weighted_sum.size
+ when 1
+ return {:value => weighted_sum.keys.first, :confidence => weighted_sum.values.first/neighbors.size.abs}
+ when 2
+ sim_sum = weighted_sum[weighted_sum.keys[0]]
+ sim_sum -= weighted_sum[weighted_sum.keys[1]]
+ sim_sum > 0 ? prediction = weighted_sum.keys[0] : prediction = weighted_sum.keys[1]
+ confidence = (sim_sum/neighbors.size).abs
+ return {:value => prediction,:confidence => confidence}
+ else
+ bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. Received classes were: '#{weighted.sum.keys}'"
+ end
+ end
+
+ # Classification with majority vote from neighbors weighted by similarity
+ # @param [Array] neighbors Array of [neighbor, similarity, value] rows
+ # @param [OpenTox::Dataset] training_dataset Training dataset
+ # @return [Hash] Prediction value and confidence
+ def self.fminer_weighted_majority_vote neighbors, training_dataset
+
+ neighbor_contribution = 0.0
+ confidence_sum = 0.0
+
+ $logger.debug "Weighted Majority Vote Classification."
+
+ values = neighbors.collect{|n| n[2]}.uniq
+ neighbors.each do |neighbor|
+ i = training_dataset.compound_ids.index n.id
+ neighbor_weight = neighbor[1]
+ activity = values.index(neighbor[2]) + 1 # map values to integers > 1
+ neighbor_contribution += activity * neighbor_weight
+ if values.size == 2 # AM: provide compat to binary classification: 1=>false 2=>true
+ case activity
+ when 1
+ confidence_sum -= neighbor_weight
+ when 2
+ confidence_sum += neighbor_weight
+ end
+ else
+ confidence_sum += neighbor_weight
+ end
+ end
+ if values.size == 2
+ if confidence_sum >= 0.0
+ prediction = values[1]
+ elsif confidence_sum < 0.0
+ prediction = values[0]
+ end
+ elsif values.size == 1 # all neighbors have the same value
+ prediction = values[0]
+ else
+ prediction = (neighbor_contribution/confidence_sum).round # AM: new multinomial prediction
+ end
+
+ confidence = (confidence_sum/neighbors.size).abs
+ {:value => prediction, :confidence => confidence.abs}
+ end
+
+ # Local support vector classification from neighbors
+ # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required
+ # @return [Hash] Prediction value and confidence
+ def self.local_svm_classification(params)
+
+ confidence = 0.0
+ prediction = nil
+
+ $logger.debug "Local SVM."
+ if params[:activities].size>0
+ if params[:props]
+ n_prop = params[:props][0].collect.to_a
+ q_prop = params[:props][1].collect.to_a
+ props = [ n_prop, q_prop ]
+ end
+ activities = params[:activities].collect.to_a
+ activities = activities.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification
+ prediction = local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
+ prediction = prediction.sub(/Val/,"") if prediction # Convert back
+ confidence = 0.0 if prediction.nil?
+ #$logger.debug "Prediction: '" + prediction.to_s + "' ('#{prediction.class}')."
+ confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]})
+ end
+ {:value => prediction, :confidence => confidence}
+
+ end
+
+
+
+ end
+
+ end
+end
+
diff --git a/lib/compound.rb b/lib/compound.rb
new file mode 100644
index 0000000..7a3dc5c
--- /dev/null
+++ b/lib/compound.rb
@@ -0,0 +1,370 @@
+# TODO: check
+# *** Open Babel Error in ParseFile
+# Could not find contribution data file.
+
+CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/"
+
+module OpenTox
+
+ class Compound
+ include OpenTox
+
+ DEFAULT_FINGERPRINT = "MP2D"
+
+ field :inchi, type: String
+ field :smiles, type: String
+ field :inchikey, type: String
+ field :names, type: Array
+ field :warning, type: String
+ field :cid, type: String
+ field :chemblid, type: String
+ field :png_id, type: BSON::ObjectId
+ field :svg_id, type: BSON::ObjectId
+ field :sdf_id, type: BSON::ObjectId
+ field :fingerprints, type: Hash, default: {}
+ field :default_fingerprint_size, type: Integer
+
+ index({smiles: 1}, {unique: true})
+
+ # Overwrites standard Mongoid method to create fingerprints before database insertion
+ def self.find_or_create_by params
+ compound = self.find_or_initialize_by params
+ compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT)
+ compound.save
+ compound
+ end
+
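+ # Calculate a fingerprint of the given type (cached in the fingerprints field after the first call)
+ # @param [String] type Fingerprint type: "MP2D" (default), "MNA", or an Open Babel fingerprint name such as "FP4"
+ # @return [Array] Fingerprint features (MP2D/MNA) or positions of the set bits (Open Babel fingerprints)
+ # @example A minimal sketch:
+ #   compound = OpenTox::Compound.from_smiles "c1ccccc1"
+ #   compound.fingerprint("MP2D") # Array of MolPrint2D substructure strings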
+ def fingerprint type="MP2D"
+ unless fingerprints[type]
+ return [] unless self.smiles
+ #http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format
+ if type == "MP2D"
+ fp = obconversion(smiles,"smi","mpd").strip.split("\t")
+ name = fp.shift # remove Title
+ fingerprints[type] = fp
+ #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html
+ elsif type== "MNA"
+ level = 2 # TODO: level as parameter, evaluate level 1, see paper
+ fp = obconversion(smiles,"smi","mna","xL\"#{level}\"").split("\n")
+ fp.shift # remove Title
+ fingerprints[type] = fp
+ else # standard fingerprints
+ fp = OpenBabel::OBFingerprint.find_fingerprint(type)
+ obmol = OpenBabel::OBMol.new
+ obconversion = OpenBabel::OBConversion.new
+ obconversion.set_in_format "smi"
+ obconversion.read_string obmol, self.smiles
+ result = OpenBabel::VectorUnsignedInt.new
+ fp.get_fingerprint(obmol,result)
+ # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i
+ #p OpenBabel::OBFingerprint.describe_bits(result)
+ # convert result to a list of the bits that are set
+ # from openbabel/scripts/python/pybel.py line 830
+ # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints
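+ # e.g. a 32-bit word with value 5 (binary 101) at start=1 yields set bits [1, 3]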
+ result = result.to_a
+ bitsperint = OpenBabel::OBFingerprint.getbitsperint()
+ bits_set = []
+ start = 1
+ result.each do |x|
+ i = start
+ while x > 0 do
+ bits_set << i if (x % 2) == 1
+ x >>= 1
+ i += 1
+ end
+ start += bitsperint
+ end
+ fingerprints[type] = bits_set
+ end
+ save
+ end
+ fingerprints[type]
+ end
+
+ # Create a compound from smiles string
+ # @example
+ # compound = OpenTox::Compound.from_smiles("c1ccccc1")
+ # @param [String] smiles Smiles string
+ # @return [OpenTox::Compound] Compound
+ def self.from_smiles smiles
+ smiles = obconversion(smiles,"smi","can")
+ if smiles.empty?
+ return nil
+ #Compound.find_or_create_by(:warning => "SMILES parsing failed for '#{smiles}', this may be caused by an incorrect SMILES string.")
+ else
+ Compound.find_or_create_by :smiles => obconversion(smiles,"smi","can")
+ end
+ end
+
+ # Create a compound from inchi string
+ # @param inchi [String] InChI string
+ # @return [OpenTox::Compound] Compound
+ def self.from_inchi inchi
+ # Temporary workaround for OpenBabel's InChI bug
+ # http://sourceforge.net/p/openbabel/bugs/957/
+ # bug has not been fixed in latest git/development version
+ #smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip
+ smiles = obconversion(inchi,"inchi","can")
+ if smiles.empty?
+ Compound.find_or_create_by(:warning => "InChI parsing failed for #{inchi}, this may be caused by an incorrect InChI string or a bug in the OpenBabel libraries.")
+ else
+ Compound.find_or_create_by(:smiles => smiles, :inchi => inchi)
+ end
+ end
+
+ # Create a compound from sdf string
+ # @param sdf [String] SDF string
+ # @return [OpenTox::Compound] Compound
+ def self.from_sdf sdf
+ # do not store sdf because it might be 2D
+ Compound.from_smiles obconversion(sdf,"sdf","can")
+ end
+
+ # Create a compound from name. Relies on an external service for name lookups.
+ # @example
+ # compound = OpenTox::Compound.from_name("Benzene")
+ # @param name [String] can also be an InChI/InChIKey, CAS number, etc.
+ # @return [OpenTox::Compound] Compound
+ def self.from_name name
+ Compound.from_smiles RestClientWrapper.get(File.join(CACTUS_URI,URI.escape(name),"smiles"))
+ end
+
+ # Get InChI
+ # @return [String] InChI string
+ def inchi
+ unless self["inchi"]
+
+ result = obconversion(smiles,"smi","inchi")
+ #result = `echo "#{self.smiles}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -ismi - -oinchi`.chomp
+ update(:inchi => result.chomp) if result and !result.empty?
+ end
+ self["inchi"]
+ end
+
+ # Get InChIKey
+ # @return [String] InChIKey string
+ def inchikey
+ update(:inchikey => obconversion(smiles,"smi","inchikey")) unless self["inchikey"]
+ self["inchikey"]
+ end
+
+ # Get (canonical) smiles
+ # @return [String] Smiles string
+ def smiles
+ update(:smiles => obconversion(self["smiles"],"smi","can")) unless self["smiles"]
+ self["smiles"]
+ end
+
+ # Get sdf
+ # @return [String] SDF string
+ def sdf
+ if self.sdf_id.nil?
+ sdf = obconversion(smiles,"smi","sdf")
+ file = Mongo::Grid::File.new(sdf, :filename => "#{id}.sdf",:content_type => "chemical/x-mdl-sdfile")
+ sdf_id = $gridfs.insert_one file
+ update :sdf_id => sdf_id
+ end
+ $gridfs.find_one(_id: self.sdf_id).data
+ end
+
+ # Get SVG image
+ # @return [image/svg] Image data
+ def svg
+ if self.svg_id.nil?
+ svg = obconversion(smiles,"smi","svg")
+ file = Mongo::Grid::File.new(svg, :filename => "#{id}.svg", :content_type => "image/svg")
+ update(:svg_id => $gridfs.insert_one(file))
+ end
+ $gridfs.find_one(_id: self.svg_id).data
+
+ end
+
+ # Get png image
+ # @example
+ # image = compound.png
+ # @return [image/png] Image data
+ def png
+ if self.png_id.nil?
+ png = obconversion(smiles,"smi","_png2")
+ file = Mongo::Grid::File.new(Base64.encode64(png), :filename => "#{id}.png", :content_type => "image/png")
+ update(:png_id => $gridfs.insert_one(file))
+ end
+ Base64.decode64($gridfs.find_one(_id: self.png_id).data)
+
+ end
+
+ # Get all known compound names. Relies on an external service for name lookups.
+ # @example
+ # names = compound.names
+ # @return [String] Compound names
+ def names
+ update(:names => RestClientWrapper.get("#{CACTUS_URI}#{inchi}/names").split("\n")) unless self["names"]
+ self["names"]
+ end
+
+  # @return [String] PubChem Compound Identifier (CID), derived via a REST call to PubChem
+ def cid
+ pug_uri = "http://pubchem.ncbi.nlm.nih.gov/rest/pug/"
+ update(:cid => RestClientWrapper.post(File.join(pug_uri, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"]
+ self["cid"]
+ end
+
+  # @return [String] ChEMBL database compound id, derived via a REST call to ChEMBL
+ def chemblid
+ # https://www.ebi.ac.uk/chembldb/ws#individualCompoundByInChiKey
+ uri = "http://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json"
+ update(:chemblid => JSON.parse(RestClientWrapper.get(uri))["compounds"].first["chemblId"]) unless self["chemblid"]
+ self["chemblid"]
+ end
+
+ def fingerprint_count_neighbors params
+ # TODO fix
+ neighbors = []
+ query_fingerprint = self.fingerprint params[:type]
+    Dataset.find(params[:training_dataset_id]).compounds.each do |compound|
+ unless self == compound
+ candidate_fingerprint = compound.fingerprint params[:type]
+ features = (query_fingerprint + candidate_fingerprint).uniq
+ min_sum = 0
+ max_sum = 0
+ features.each do |f|
+ min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax
+ min_sum += min
+ max_sum += max
+ end
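+        # count-based Tanimoto: sum of per-feature minima over sum of maxima,
+        # e.g. counts {a:2, b:1} vs {a:1, b:3} => (1+1)/(2+3) = 0.4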
+        sim = max_sum == 0 ? 0 : min_sum/max_sum.to_f
+ neighbors << [compound.id, sim] if sim and sim >= params[:min_sim]
+ end
+ end
+ neighbors.sort{|a,b| b.last <=> a.last}
+ end
+
+ def fingerprint_neighbors params
+ bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim]
+ neighbors = []
+ #if params[:type] == DEFAULT_FINGERPRINT
+ #neighbors = db_neighbors params
+ #p neighbors
+ #else
+ query_fingerprint = self.fingerprint params[:type]
+    Dataset.find(params[:training_dataset_id]).compounds.each do |compound|
+ unless self == compound
+ candidate_fingerprint = compound.fingerprint params[:type]
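+        # Tanimoto/Jaccard index on the fingerprint sets: size of intersection / size of union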
+ sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
+ neighbors << [compound.id, sim] if sim >= params[:min_sim]
+ end
+ end
+ #end
+ neighbors.sort{|a,b| b.last <=> a.last}
+ end
+
+ def fminer_neighbors params
+ bad_request_error "Incorrect parameters for Compound#fminer_neighbors. Please provide :feature_dataset_id, :min_sim." unless params[:feature_dataset_id] and params[:min_sim]
+ feature_dataset = Dataset.find params[:feature_dataset_id]
+ query_fingerprint = Algorithm::Descriptor.smarts_match(self, feature_dataset.features)
+ neighbors = []
+
+ # find neighbors
+ feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i|
+ sim = Algorithm::Similarity.tanimoto candidate_fingerprint, query_fingerprint
+ if sim >= params[:min_sim]
+ neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
+ end
+ end
+ neighbors
+ end
+
+ def physchem_neighbors params
+ feature_dataset = Dataset.find params[:feature_dataset_id]
+ query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors]
+ neighbors = []
+ feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i|
+      # TODO implement pearson and cosine similarity separately
+ R.assign "x", query_fingerprint
+ R.assign "y", candidate_fingerprint
+ # pearson r
+ #sim = R.eval("cor(x,y,use='complete.obs',method='pearson')").to_ruby
+ #p "pearson"
+ #p sim
+ #p "cosine"
+ sim = R.eval("x %*% y / sqrt(x%*%x * y%*%y)").to_ruby.first
+ #p sim
+ if sim >= params[:min_sim]
+ neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
+ end
+ end
+ neighbors
+ end
+
+ def db_neighbors params
+    $logger.debug "db_neighbors #{params}"
+ # TODO restrict to dataset
+ # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
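+    # idea: let MongoDB compute the Tanimoto coefficient inside an aggregation pipeline:
+    # tanimoto = common / (query_size + candidate_size - common), common = size of the set intersection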
+ qn = fingerprint(params[:type]).size
+ #qmin = qn * threshold
+ #qmax = qn / threshold
+ #not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...)
+ #reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)]
+ aggregate = [
+ #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}},
+ {'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
+ {'$project' => {
+ 'tanimoto' => {'$let' => {
+ 'vars' => {'common' => {'$size' => {'$setIntersection' => ["'$#{DEFAULT_FINGERPRINT}'", DEFAULT_FINGERPRINT]}}},
+ 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [qn, '$default_fingerprint_size']}, '$$common']}]}
+ }},
+ '_id' => 1
+ }},
+ {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}},
+ {'$sort' => {'tanimoto' => -1}}
+ ]
+
+ $mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] }
+
+ end
+
+ private
+
+ def self.obconversion(identifier,input_format,output_format,option=nil)
+ obconversion = OpenBabel::OBConversion.new
+ obconversion.set_options(option, OpenBabel::OBConversion::OUTOPTIONS) if option
+ obmol = OpenBabel::OBMol.new
+ obconversion.set_in_and_out_formats input_format, output_format
+ return nil if identifier.nil?
+ obconversion.read_string obmol, identifier
+ case output_format
+ when /smi|can|inchi/
+ obconversion.write_string(obmol).gsub(/\s/,'').chomp
+ when /sdf/
+ # TODO: find disconnected structures
+ # strip_salts
+ # separate
+ obmol.add_hydrogens
+ builder = OpenBabel::OBBuilder.new
+ builder.build(obmol)
+
+ sdf = obconversion.write_string(obmol)
+ if sdf.match(/.nan/)
+
+ $logger.warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure"
+ obconversion.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS)
+ sdf = obconversion.write_string(obmol)
+ if sdf.match(/.nan/)
+ $logger.warn "2D generation failed for compound #{identifier}, rendering without coordinates."
+ obconversion.remove_option("gen2D", OpenBabel::OBConversion::GENOPTIONS)
+ sdf = obconversion.write_string(obmol)
+ end
+ end
+ sdf
+ else
+ obconversion.write_string(obmol)
+ end
+ end
+
+ def obconversion(identifier,input_format,output_format,option=nil)
+ self.class.obconversion(identifier,input_format,output_format,option)
+ end
+ end
+end
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
new file mode 100644
index 0000000..cbffb7c
--- /dev/null
+++ b/lib/crossvalidation.rb
@@ -0,0 +1,302 @@
+module OpenTox
+
+ class CrossValidation
+ field :validation_ids, type: Array, default: []
+ field :model_id, type: BSON::ObjectId
+ field :folds, type: Integer
+ field :nr_instances, type: Integer
+ field :nr_unpredicted, type: Integer
+ field :predictions, type: Array, default: []
+ field :finished_at, type: Time
+
+ def time
+ finished_at - created_at
+ end
+
+ def validations
+ validation_ids.collect{|vid| Validation.find vid}
+ end
+
+ def model
+ Model::Lazar.find model_id
+ end
+
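+    # Usage sketch (assuming a trained lazar model):
+    #   cv = CrossValidation.create model, 10
+    #   cv.accuracy # classification models
+    #   cv.rmse     # regression models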
+ def self.create model, n=10
+      feature = model.training_dataset.features.first
+      bad_request_error "#{feature.name} is neither nominal nor numeric." unless feature.nominal? or feature.numeric?
+      klass = feature.nominal? ? ClassificationCrossValidation : RegressionCrossValidation
+ cv = klass.new(
+ name: model.name,
+ model_id: model.id,
+ folds: n
+ )
+ cv.save # set created_at
+ nr_instances = 0
+ nr_unpredicted = 0
+ predictions = []
+ training_dataset = Dataset.find model.training_dataset_id
+ training_dataset.folds(n).each_with_index do |fold,fold_nr|
+ fork do # parallel execution of validations
+ $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
+ t = Time.now
+ validation = Validation.create(model, fold[0], fold[1],cv)
+ $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
+ end
+ end
+ Process.waitall
+ cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id)
+ cv.validations.each do |validation|
+ nr_instances += validation.nr_instances
+ nr_unpredicted += validation.nr_unpredicted
+ predictions += validation.predictions
+ end
+ cv.update_attributes(
+ nr_instances: nr_instances,
+ nr_unpredicted: nr_unpredicted,
+ predictions: predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
+ )
+ $logger.debug "Nr unpredicted: #{nr_unpredicted}"
+ cv.statistics
+ cv
+ end
+ end
+
+ class ClassificationCrossValidation < CrossValidation
+
+ field :accept_values, type: Array
+ field :confusion_matrix, type: Array
+ field :weighted_confusion_matrix, type: Array
+ field :accuracy, type: Float
+ field :weighted_accuracy, type: Float
+ field :true_rate, type: Hash
+ field :predictivity, type: Hash
+ field :confidence_plot_id, type: BSON::ObjectId
+ # TODO auc, f-measure (usability??)
+
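+    # Confusion matrix convention used below (binary endpoints):
+    # confusion_matrix[i][j] counts compounds predicted as accept_values[i] and measured as accept_values[j],
+    # i.e. [0][0] and [1][1] are correct predictions, [0][1] and [1][0] are misclassifications.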
+ def statistics
+ accept_values = Feature.find(model.prediction_feature_id).accept_values
+      confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
+      weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
+ true_rate = {}
+ predictivity = {}
+ predictions.each do |pred|
+ compound_id,activity,prediction,confidence = pred
+ if activity and prediction and confidence.numeric?
+ if prediction == activity
+ if prediction == accept_values[0]
+ confusion_matrix[0][0] += 1
+ weighted_confusion_matrix[0][0] += confidence
+ elsif prediction == accept_values[1]
+ confusion_matrix[1][1] += 1
+ weighted_confusion_matrix[1][1] += confidence
+ end
+ elsif prediction != activity
+ if prediction == accept_values[0]
+ confusion_matrix[0][1] += 1
+ weighted_confusion_matrix[0][1] += confidence
+ elsif prediction == accept_values[1]
+ confusion_matrix[1][0] += 1
+ weighted_confusion_matrix[1][0] += confidence
+ end
+ end
+ else
+ nr_unpredicted += 1 if prediction.nil?
+ end
+ end
+ accept_values.each_with_index do |v,i|
+ true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
+ predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
+ end
+ confidence_sum = 0
+ weighted_confusion_matrix.each do |r|
+ r.each do |c|
+ confidence_sum += c
+ end
+ end
+ update_attributes(
+ accept_values: accept_values,
+ confusion_matrix: confusion_matrix,
+ weighted_confusion_matrix: weighted_confusion_matrix,
+ accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
+ weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
+ true_rate: true_rate,
+ predictivity: predictivity,
+ finished_at: Time.now
+ )
+ $logger.debug "Accuracy #{accuracy}"
+ end
+
+ def confidence_plot
+ tmpfile = "/tmp/#{id.to_s}_confidence.svg"
+ accuracies = []
+ confidences = []
+ correct_predictions = 0
+ incorrect_predictions = 0
+ predictions.each do |p|
+ if p[1] and p[2]
+          p[1] == p[2] ? correct_predictions += 1 : incorrect_predictions += 1
+ accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
+ confidences << p[3]
+
+ end
+ end
+ R.assign "accuracy", accuracies
+ R.assign "confidence", confidences
+ R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
+ R.eval "ggsave(file='#{tmpfile}', plot=image)"
+ file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
+ plot_id = $gridfs.insert_one(file)
+ update(:confidence_plot_id => plot_id)
+ $gridfs.find_one(_id: confidence_plot_id).data
+ end
+
+ #Average area under roc 0.646
+ #Area under roc 0.646
+ #F measure carcinogen: 0.769, noncarcinogen: 0.348
+ end
+
+ class RegressionCrossValidation < CrossValidation
+
+ field :rmse, type: Float
+ field :mae, type: Float
+ field :weighted_rmse, type: Float
+ field :weighted_mae, type: Float
+ field :r_squared, type: Float
+ field :correlation_plot_id, type: BSON::ObjectId
+ field :confidence_plot_id, type: BSON::ObjectId
+
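+    # Error metrics below are computed on log10-transformed values, e.g. measured 100 vs.
+    # predicted 10 gives error = log10(10) - log10(100) = -1 (one order of magnitude).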
+ def statistics
+ rmse = 0
+ weighted_rmse = 0
+ rse = 0
+ weighted_rse = 0
+ mae = 0
+ weighted_mae = 0
+ rae = 0
+ weighted_rae = 0
+ confidence_sum = 0
+ predictions.each do |pred|
+ compound_id,activity,prediction,confidence = pred
+ if activity and prediction
+ error = Math.log10(prediction)-Math.log10(activity)
+ rmse += error**2
+ weighted_rmse += confidence*error**2
+ mae += error.abs
+ weighted_mae += confidence*error.abs
+ confidence_sum += confidence
+ else
+ warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+ $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+ end
+ end
+ x = predictions.collect{|p| p[1]}
+ y = predictions.collect{|p| p[2]}
+ R.assign "measurement", x
+ R.assign "prediction", y
+ R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
+ r = R.eval("r").to_ruby
+
+ mae = mae/predictions.size
+ weighted_mae = weighted_mae/confidence_sum
+ rmse = Math.sqrt(rmse/predictions.size)
+ weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
+ update_attributes(
+ mae: mae,
+ rmse: rmse,
+ weighted_mae: weighted_mae,
+ weighted_rmse: weighted_rmse,
+ r_squared: r**2,
+ finished_at: Time.now
+ )
+ $logger.debug "R^2 #{r**2}"
+ $logger.debug "RMSE #{rmse}"
+ $logger.debug "MAE #{mae}"
+ end
+
+ def misclassifications n=nil
+ #n = predictions.size unless n
+ n ||= 10
+ model = Model::Lazar.find(self.model_id)
+ training_dataset = Dataset.find(model.training_dataset_id)
+ prediction_feature = training_dataset.features.first
+ predictions.collect do |p|
+ unless p.include? nil
+ compound = Compound.find(p[0])
+ neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters)
+ neighbors.collect! do |n|
+ neighbor = Compound.find(n[0])
+ values = training_dataset.values(neighbor,prediction_feature)
+ { :smiles => neighbor.smiles, :similarity => n[1], :measurements => values}
+ end
+ {
+ :smiles => compound.smiles,
+ #:fingerprint => compound.fp4.collect{|id| Smarts.find(id).name},
+ :measured => p[1],
+ :predicted => p[2],
+ #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs,
+ :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs,
+ :relative_error => (p[1]-p[2]).abs/p[1],
+ :confidence => p[3],
+ :neighbors => neighbors
+ }
+ end
+      end.compact.sort{|a,b| b[:relative_error] <=> a[:relative_error]}[0..n-1]
+ end
+
+ def confidence_plot
+ tmpfile = "/tmp/#{id.to_s}_confidence.svg"
+ sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact
+ R.assign "error", sorted_predictions.collect{|p| p[0]}
+ R.assign "confidence", sorted_predictions.collect{|p| p[1]}
+ # TODO fix axis names
+ R.eval "image = qplot(confidence,error)"
+ R.eval "image = image + stat_smooth(method='lm', se=FALSE)"
+ R.eval "ggsave(file='#{tmpfile}', plot=image)"
+ file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
+ plot_id = $gridfs.insert_one(file)
+ update(:confidence_plot_id => plot_id)
+ $gridfs.find_one(_id: confidence_plot_id).data
+ end
+
+ def correlation_plot
+ unless correlation_plot_id
+ tmpfile = "/tmp/#{id.to_s}_correlation.svg"
+ x = predictions.collect{|p| p[1]}
+ y = predictions.collect{|p| p[2]}
+ attributes = Model::Lazar.find(self.model_id).attributes
+ attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key}
+ attributes = attributes.values.collect{|v| v.is_a?(String) ? v.sub(/OpenTox::/,'') : v}.join("\n")
+ R.assign "measurement", x
+ R.assign "prediction", y
+ R.eval "all = c(-log(measurement),-log(prediction))"
+ R.eval "range = c(min(all), max(all))"
+ R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)"
+ R.eval "image = image + geom_abline(intercept=0, slope=1)"
+ R.eval "ggsave(file='#{tmpfile}', plot=image)"
+ file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg")
+ plot_id = $gridfs.insert_one(file)
+ update(:correlation_plot_id => plot_id)
+ end
+ $gridfs.find_one(_id: correlation_plot_id).data
+ end
+ end
+
+ class RepeatedCrossValidation
+ field :crossvalidation_ids, type: Array, default: []
+ def self.create model, folds=10, repeats=3
+ repeated_cross_validation = self.new
+ repeats.times do |n|
+ $logger.debug "Crossvalidation #{n+1} for #{model.name}"
+ repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id
+ end
+ repeated_cross_validation.save
+ repeated_cross_validation
+ end
+ def crossvalidations
+ crossvalidation_ids.collect{|id| CrossValidation.find(id)}
+ end
+ end
+
+
+end
diff --git a/lib/dataset.rb b/lib/dataset.rb
new file mode 100644
index 0000000..60f3bb5
--- /dev/null
+++ b/lib/dataset.rb
@@ -0,0 +1,350 @@
+require 'csv'
+require 'tempfile'
+
+module OpenTox
+
+ class Dataset
+
+ attr_writer :data_entries
+
+    # associations like has_many and belongs_to degrade performance
+ field :feature_ids, type: Array, default: []
+ field :compound_ids, type: Array, default: []
+ field :data_entries_id, type: BSON::ObjectId#, default: []
+ field :source, type: String
+
+ # Save all data including data_entries
+ # Should be used instead of save
+ def save_all
+ dump = Marshal.dump(@data_entries)
+ file = Mongo::Grid::File.new(dump, :filename => "#{self.id.to_s}.data_entries")
+ entries_id = $gridfs.insert_one(file)
+ update(:data_entries_id => entries_id)
+ end
+
+ # Readers
+
+ # Get all compounds
+ def compounds
+ @compounds ||= self.compound_ids.collect{|id| OpenTox::Compound.find id}
+ @compounds
+ end
+
+ # Get all features
+ def features
+ @features ||= self.feature_ids.collect{|id| OpenTox::Feature.find(id)}
+ @features
+ end
+
+ # Get all data_entries
+ def data_entries
+ unless @data_entries
+ t = Time.now
+ data_entry_file = $gridfs.find_one(_id: data_entries_id)
+ if data_entry_file.nil?
+ @data_entries = []
+ else
+ @data_entries = Marshal.load(data_entry_file.data)
+ bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array
+ unless @data_entries.first.size == feature_ids.size
+ # TODO: fix (unknown) source of empty data_entries
+ sleep 1
+ data_entry_file = $gridfs.find_one(_id: data_entries_id)
+ @data_entries = Marshal.load(data_entry_file.data)
+ end
+ bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size
+ # TODO: data_entries can be empty, poorly reproducible, mongo problem?
+ bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size
+ #$logger.debug "Retrieving data: #{Time.now-t}"
+ end
+ end
+ @data_entries
+ end
+
+ # Find data entry values for a given compound and feature
+ # @param compound [OpenTox::Compound] OpenTox Compound object
+ # @param feature [OpenTox::Feature] OpenTox Feature object
+ # @return [Array] Data entry values
+ def values(compound, feature)
+ rows = compound_ids.each_index.select{|r| compound_ids[r] == compound.id }
+ col = feature_ids.index feature.id
+ rows.collect{|row| data_entries[row][col]}
+ end
+
+ # Writers
+
+ # Set compounds
+ def compounds=(compounds)
+ self.compound_ids = compounds.collect{|c| c.id}
+ end
+
+ # Set features
+ def features=(features)
+ self.feature_ids = features.collect{|f| f.id}
+ end
+
+ # Dataset operations
+
+ # Split a dataset into n folds
+ # @param [Integer] number of folds
+ # @return [Array] Array with folds [training_dataset,test_dataset]
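+    # Illustrative sketch of a 10-fold split for cross-validation:
+    #   dataset.folds(10).each do |training_dataset, test_dataset|
+    #     # train on training_dataset, predict test_dataset
+    #   end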
+ def folds n
+ len = self.compound_ids.size
+ indices = (0..len-1).to_a.shuffle
+ mid = (len/n)
+ chunks = []
+ start = 0
+ 1.upto(n) do |i|
+ last = start+mid
+ last = last-1 unless len%n >= i
+ test_idxs = indices[start..last] || []
+ test_cids = test_idxs.collect{|i| self.compound_ids[i]}
+ test_data_entries = test_idxs.collect{|i| self.data_entries[i]}
+ test_dataset = self.class.new(:compound_ids => test_cids, :feature_ids => self.feature_ids, :data_entries => test_data_entries)
+ training_idxs = indices-test_idxs
+ training_cids = training_idxs.collect{|i| self.compound_ids[i]}
+ training_data_entries = training_idxs.collect{|i| self.data_entries[i]}
+ training_dataset = self.class.new(:compound_ids => training_cids, :feature_ids => self.feature_ids, :data_entries => training_data_entries)
+ test_dataset.save_all
+ training_dataset.save_all
+ chunks << [training_dataset,test_dataset]
+ start = last+1
+ end
+ chunks
+ end
+
+ # Diagnostics
+
+ def correlation_plot training_dataset
+ # TODO: create/store svg
+ R.assign "features", data_entries
+ R.assign "activities", training_dataset.data_entries.collect{|de| de.first}
+ R.eval "featurePlot(features,activities)"
+ end
+
+ def density_plot
+ # TODO: create/store svg
+ R.assign "acts", data_entries.collect{|r| r.first }#.compact
+ R.eval "plot(density(-log(acts),na.rm= TRUE), main='-log(#{features.first.name})')"
+ end
+
+ # Serialisation
+
+ # converts dataset to csv format including compound smiles as first column, other column headers are feature names
+ # @return [String]
+    # Convert the dataset to CSV; the first column contains compound SMILES (or InChI), the remaining columns are feature values with feature names as headers
+ CSV.generate() do |csv| #{:force_quotes=>true}
+ csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
+ compounds.each_with_index do |c,i|
+ csv << [inchi ? c.inchi : c.smiles] + data_entries[i]
+ end
+ end
+ end
+
+
+ # Parsers
+
+ # Create a dataset from file (csv,sdf,...)
+ # @param filename [String]
+ # @return [String] dataset uri
+ # TODO
+ #def self.from_sdf_file
+ #end
+
+ # Create a dataset from CSV file
+ # TODO: document structure
+ def self.from_csv_file file, source=nil, bioassay=true
+ source ||= file
+ name = File.basename(file,".*")
+ dataset = self.find_by(:source => source, :name => name)
+ if dataset
+ $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})."
+ else
+ $logger.debug "Parsing #{file}."
+ table = CSV.read file, :skip_blanks => true
+ dataset = self.new(:source => source, :name => name)
+ dataset.parse_table table, bioassay
+ end
+ dataset
+ end
+
+ # parse data in tabular format (e.g. from csv)
+ # does a lot of guesswork in order to determine feature types
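+    # heuristic: a column with more than 5 distinct, exclusively numeric values becomes a
+    # numeric feature, otherwise a nominal feature with the observed values as accept_values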
+ def parse_table table, bioassay=true
+
+ time = Time.now
+
+ # features
+ feature_names = table.shift.collect{|f| f.strip}
+ warnings << "Duplicate features in table header." unless feature_names.size == feature_names.uniq.size
+ compound_format = feature_names.shift.strip
+ bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
+
+ numeric = []
+ # guess feature types
+ feature_names.each_with_index do |f,i|
+ metadata = {:name => f}
+ values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
+ types = values.collect{|v| v.numeric? ? true : false}.uniq
+ if values.size == 0 # empty feature
+ elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
+ metadata["numeric"] = true
+ numeric[i] = true
+ else
+ metadata["nominal"] = true
+ metadata["accept_values"] = values
+ numeric[i] = false
+ end
+ if bioassay
+ if metadata["numeric"]
+ feature = NumericBioAssay.find_or_create_by(metadata)
+ elsif metadata["nominal"]
+ feature = NominalBioAssay.find_or_create_by(metadata)
+ end
+ else
+          metadata.merge!({:measured => false, :calculated => true})
+ if metadata["numeric"]
+ feature = NumericFeature.find_or_create_by(metadata)
+ elsif metadata["nominal"]
+ feature = NominalFeature.find_or_create_by(metadata)
+ end
+ end
+ feature_ids << feature.id if feature
+ end
+
+ $logger.debug "Feature values: #{Time.now-time}"
+ time = Time.now
+
+ r = -1
+ compound_time = 0
+ value_time = 0
+
+ # compounds and values
+ @data_entries = [] #Array.new(table.size){Array.new(table.first.size-1)}
+
+ table.each_with_index do |vals,i|
+ ct = Time.now
+ identifier = vals.shift
+ warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty?
+ begin
+ case compound_format
+ when /SMILES/i
+ compound = OpenTox::Compound.from_smiles(identifier)
+ when /InChI/i
+ compound = OpenTox::Compound.from_inchi(identifier)
+ end
+ rescue
+ compound = nil
+ end
+ if compound.nil?
+ # compound parsers may return nil
+ warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
+ next
+ end
+ # TODO insert empty compounds to keep positions?
+ compound_time += Time.now-ct
+
+ r += 1
+ unless vals.size == feature_ids.size # way cheaper than accessing features
+ warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
+ next
+ end
+
+ compound_ids << compound.id
+ table.first.size == 0 ? @data_entries << Array.new(0) : @data_entries << Array.new(table.first.size-1)
+
+ vals.each_with_index do |v,j|
+ if v.blank?
+ warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
+ next
+ elsif numeric[j]
+ @data_entries.last[j] = v.to_f
+ else
+ @data_entries.last[j] = v.strip
+ end
+ end
+ end
+ compounds.duplicates.each do |compound|
+ positions = []
+ compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi}
+ warnings << "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
+ end
+
+ $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
+ time = Time.now
+ save_all
+ $logger.debug "Saving: #{Time.now-time}"
+
+ end
+
+ # Fill unset data entries
+    # @param n [Object] fill value
+ def fill_nil_with n
+ (0 .. compound_ids.size-1).each do |i|
+ @data_entries[i] ||= []
+ (0 .. feature_ids.size-1).each do |j|
+ @data_entries[i][j] ||= n
+ end
+ end
+ end
+
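+    # Column-wise z-score scaling via R's scale(): value => (value - center) / scale.
+    # Centers and scales are stored so that ScaledDataset#original_value can invert the transformation.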
+ def scale
+ scaled_data_entries = Array.new(data_entries.size){Array.new(data_entries.first.size)}
+ centers = []
+ scales = []
+ feature_ids.each_with_index do |feature_id,col|
+ R.assign "x", data_entries.collect{|de| de[col]}
+ R.eval "scaled = scale(x,center=T,scale=T)"
+ centers[col] = R.eval("attr(scaled, 'scaled:center')").to_ruby
+ scales[col] = R.eval("attr(scaled, 'scaled:scale')").to_ruby
+ R.eval("scaled").to_ruby.each_with_index do |value,row|
+ scaled_data_entries[row][col] = value
+ end
+ end
+ scaled_dataset = ScaledDataset.new(attributes)
+ scaled_dataset["_id"] = BSON::ObjectId.new
+ scaled_dataset["_type"] = "OpenTox::ScaledDataset"
+ scaled_dataset.centers = centers
+ scaled_dataset.scales = scales
+ scaled_dataset.data_entries = scaled_data_entries
+ scaled_dataset.save_all
+ scaled_dataset
+ end
+ end
+
+ # Dataset for lazar predictions
+ class LazarPrediction < Dataset
+ field :creator, type: String
+ field :prediction_feature_id, type: String
+
+ def prediction_feature
+ Feature.find prediction_feature_id
+ end
+
+ end
+
+ # Dataset for descriptors (physchem)
+ class DescriptorDataset < Dataset
+ field :feature_calculation_algorithm, type: String
+
+ end
+
+ class ScaledDataset < DescriptorDataset
+
+ field :centers, type: Array, default: []
+ field :scales, type: Array, default: []
+
+ def original_value value, i
+ value * scales[i] + centers[i]
+ end
+ end
+
+ # Dataset for fminer descriptors
+ class FminerDataset < DescriptorDataset
+ field :training_algorithm, type: String
+ field :training_dataset_id, type: BSON::ObjectId
+ field :training_feature_id, type: BSON::ObjectId
+ field :training_parameters, type: Hash
+ end
+
+end
diff --git a/lib/descriptor.rb b/lib/descriptor.rb
new file mode 100644
index 0000000..9733bde
--- /dev/null
+++ b/lib/descriptor.rb
@@ -0,0 +1,248 @@
+require 'digest/md5'
+ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk"
+# TODO store descriptors in mongodb
+
+module OpenTox
+
+ module Algorithm
+
+ # Class for descriptor calculations
+ class Descriptor
+ include OpenTox
+
+ JAVA_DIR = File.join(File.dirname(__FILE__),"..","java")
+ CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last
+ JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar")
+ LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar")
+ JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar")
+
+ obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"]
+ OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d|
+ name,description = d.split(/\s+/,2)
+ ["Openbabel."+name,description] unless obexclude.include? name
+ end.compact.sort{|a,b| a[0] <=> b[0]}]
+
+ cdk_desc = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`)
+ CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| ["Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}]
+ CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''); d[:names].collect{ |name| prefix+"."+name } }.flatten
+
+ # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug)
+ joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"]
+ # strip Joelib messages from stdout
+ JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d|
+ name = d[:java_class].sub(/^joelib2.feature.types./,'')
+        # impossible to obtain meaningful descriptions from JOELib, see java/JoelibDescriptors.java
+ ["Joelib."+name, "no description available"] unless joelibexclude.include? name
+ end.compact.sort{|a,b| a[0] <=> b[0]}]
+
+ DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS))
+ DESCRIPTOR_VALUES = OBDESCRIPTORS.keys + CDKDESCRIPTOR_VALUES + JOELIBDESCRIPTORS.keys
+
+ require_relative "unique_descriptors.rb"
+
+ # Description of available descriptors
+ def self.description descriptor
+ lib = descriptor.split('.').first
+ case lib
+ when "Openbabel"
+ OBDESCRIPTORS[descriptor]
+ when "Cdk"
+ name = descriptor.split('.')[0..-2].join('.')
+ CDKDESCRIPTORS[name]
+ when "Joelib"
+ JOELIBDESCRIPTORS[descriptor]
+ when "lookup"
+ "Read feature values from a dataset"
+ end
+ end
+
+ # Match an array of smarts features
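+      # Illustrative call (sketch): values are 1/0 for matches, or occurrence counts with count=true
+      #   Algorithm::Descriptor.smarts_match compound, smarts_features
+      #   Algorithm::Descriptor.smarts_match compound, smarts_features, true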
+ def self.smarts_match compounds, smarts_features, count=false
+ bad_request_error "Compounds for smarts_match are empty" unless compounds
+ bad_request_error "Smarts features for smarts_match are empty" unless smarts_features
+ parse compounds
+ @count = count
+ obconversion = OpenBabel::OBConversion.new
+ obmol = OpenBabel::OBMol.new
+ obconversion.set_in_format('smi')
+ smarts_pattern = OpenBabel::OBSmartsPattern.new
+ smarts_features = [smarts_features] if smarts_features.is_a?(Feature)
+ @smarts = smarts_features.collect{|f| f.smarts}
+ @physchem_descriptors = nil
+ @data_entries = Array.new(@compounds.size){Array.new(@smarts.size,false)}
+ @compounds.each_with_index do |compound,c|
+ obconversion.read_string(obmol,compound.smiles)
+ @smarts.each_with_index do |smart,s|
+ smarts_pattern.init(smart)
+ if smarts_pattern.match(obmol)
+              value = count ? smarts_pattern.get_map_list.to_a.size : 1
+ else
+ value = 0
+ end
+ @data_entries[c][s] = value
+ end
+ end
+ serialize
+ end
+
+ # Count matches of an array with smarts features
+ def self.smarts_count compounds, smarts
+ # TODO: non-overlapping matches?
+ smarts_match compounds,smarts,true
+ end
+
+ # Calculate physchem descriptors
+ # @param [OpenTox::Compound,Array,OpenTox::Dataset] input object, either a compound, an array of compounds or a dataset
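+      # Illustrative call (descriptor names are examples):
+      #   Algorithm::Descriptor.physchem compound, ["Openbabel.logP", "Cdk.AtomCount"]
+      #   # => array of rounded descriptor values (a DescriptorDataset for dataset input)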
+ def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS
+ parse compounds
+ @data_entries = Array.new(@compounds.size){[]}
+ @descriptors = descriptors
+ @smarts = nil
+ @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features
+ des = {}
+ @descriptors.each do |d|
+ lib, descriptor = d.split(".",2)
+ lib = lib.downcase.to_sym
+ des[lib] ||= []
+ des[lib] << descriptor
+ end
+ des.each do |lib,descriptors|
+          $logger.debug "#{lib}: #{descriptors}"
+ send(lib, descriptors)
+ end
+ serialize
+ end
+
+ def self.openbabel descriptors
+ $logger.debug "compute #{descriptors.size} openbabel descriptors for #{@compounds.size} compounds"
+ obdescriptors = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d}
+ obmol = OpenBabel::OBMol.new
+ obconversion = OpenBabel::OBConversion.new
+ obconversion.set_in_format 'smi'
+ last_feature_idx = @physchem_descriptors.size
+ @compounds.each_with_index do |compound,c|
+ obconversion.read_string obmol, compound.smiles
+ obdescriptors.each_with_index do |descriptor,d|
+ @data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol))
+ end
+ end
+ @physchem_descriptors += descriptors.collect{|d| "Openbabel.#{d}"}
+ end
+
+ def self.java_descriptors descriptors, lib
+        $logger.debug "compute #{descriptors.size} #{lib} descriptors for #{@compounds.size} compounds"
+ sdf = sdf_3d
+ # use java system call (rjb blocks within tasks)
+ # use Tempfiles to avoid "Argument list too long" error
+ case lib
+ when "cdk"
+ run_cmd "java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}"
+ when "joelib"
+ run_cmd "java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}"
+ end
+ last_feature_idx = @physchem_descriptors.size
+ YAML.load_file("#{sdf}#{lib}.yaml").each_with_index do |calculation,i|
+ # TODO create warnings
+ #$logger.error "Descriptor calculation failed for compound #{@compounds[i].inchi}." if calculation.empty?
+ # CDK Descriptors may calculate multiple values, they are stored in separate features
+ @physchem_descriptors += calculation.keys if i == 0
+ calculation.keys.each_with_index do |name,j|
+ @data_entries[i][j+last_feature_idx] = fix_value(calculation[name])
+ end
+ end
+ FileUtils.rm "#{sdf}#{lib}.yaml"
+ end
+
+ def self.cdk descriptors
+ java_descriptors descriptors, "cdk"
+ end
+
+ def self.joelib descriptors
+ java_descriptors descriptors, "joelib"
+ end
+
+ def self.lookup compounds, features, dataset
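+        # TODO: unfinished - the feature lookup below is not implemented yet (inner loop is empty)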
+ parse compounds
+ fingerprint = []
+ compounds.each do |compound|
+ fingerprint << []
+ features.each do |feature|
+ end
+ end
+ end
+
+ def self.run_cmd cmd
+ cmd = "#{cmd} 2>&1"
+ $logger.debug "running external cmd: '#{cmd}'"
+ p = IO.popen(cmd) do |io|
+ while line = io.gets
+ $logger.debug "> #{line.chomp}"
+ end
+ io.close
+ raise "external cmd failed '#{cmd}' (see log file for error msg)" unless $?.to_i == 0
+ end
+ end
+
+ def self.sdf_3d
+ # TODO check if 3d sdfs are stored in GridFS
+ sdf = ""
+ @compounds.each do |compound|
+ sdf << compound.sdf
+ end
+ sdf_file = "/tmp/#{SecureRandom.uuid}.sdf"
+ File.open(sdf_file,"w+"){|f| f.print sdf}
+ sdf_file
+ end
+
+ def self.parse compounds
+ @input_class = compounds.class.to_s
+ case @input_class
+ when "OpenTox::Compound"
+ @compounds = [compounds]
+ when "Array"
+ @compounds = compounds
+ when "OpenTox::Dataset"
+ @compounds = compounds.compounds
+ else
+ bad_request_error "Cannot calculate descriptors for #{compounds.class} objects."
+ end
+ end
+
+ def self.serialize
+ @data_entries.collect!{|de| de.collect{|v| v.round(5) unless v.nil?}}
+ case @input_class
+ when "OpenTox::Compound"
+ @data_entries.first
+ when "Array"
+ @data_entries
+ when "OpenTox::Dataset"
+ dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect{|c| c.id})
+        if @smarts
+          dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id}
+          dataset.data_entries = @data_entries
+          algo = @count ? "count" : "match"
+          dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}"
+
+ elsif @physchem_descriptors
+ dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id}
+ dataset.data_entries = @data_entries
+ dataset.feature_calculation_algorithm = "#{self}.physchem"
+ #TODO params?
+ end
+ dataset.save_all
+ dataset
+ end
+ end
+
+ def self.fix_value val
+ val = val.first if val.is_a? Array and val.size == 1
+ val = nil if val == "NaN"
+ if val.numeric?
+ val = Float(val)
+ val = nil if val.nan? or val.infinite?
+ end
+ val
+ end
+ private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize
+ end
+ end
+end
diff --git a/lib/error.rb b/lib/error.rb
new file mode 100644
index 0000000..39b3c76
--- /dev/null
+++ b/lib/error.rb
@@ -0,0 +1,66 @@
+module OpenToxError
+ attr_accessor :http_code, :message, :cause
+ def initialize message=nil
+ message = message.to_s.gsub(/\A"|"\Z/, '') if message # remove quotes
+ super message
+ @http_code ||= 500
+ @message = message.to_s
+ @cause = cut_backtrace(caller)
+ $logger.error("\n"+JSON.pretty_generate({
+ :http_code => @http_code,
+ :message => @message,
+ :cause => @cause
+ }))
+ end
+
+ def cut_backtrace(trace)
+ if trace.is_a?(Array)
+ cut_index = trace.find_index{|line| line.match(/sinatra|minitest/)}
+ cut_index ||= trace.size
+ cut_index -= 1
+ cut_index = trace.size-1 if cut_index < 0
+ trace[0..cut_index]
+ else
+ trace
+ end
+ end
+
+end
+
+class RuntimeError
+ include OpenToxError
+end
+
+# clutters log file with library errors
+#class NoMethodError
+ #include OpenToxError
+#end
+
+module OpenTox
+
+ class Error < RuntimeError
+ include OpenToxError
+
+ def initialize(code, message=nil)
+ @http_code = code
+ super message
+ end
+ end
+
+ # OpenTox errors
+ RestClientWrapper.known_errors.each do |error|
+ # create error classes
+ c = Class.new Error do
+ define_method :initialize do |message=nil|
+ super error[:code], message
+ end
+ end
+ OpenTox.const_set error[:class],c
+
+ # define global methods for raising errors, eg. bad_request_error
+ Object.send(:define_method, error[:method]) do |message|
+ raise c.new(message)
+ end
+ end
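+
+  # Usage sketch: each generated helper raises its corresponding OpenTox error class, e.g.
+  #   bad_request_error "Please provide a :type parameter"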
+
+end
diff --git a/lib/experiment.rb b/lib/experiment.rb
new file mode 100644
index 0000000..0dfdf86
--- /dev/null
+++ b/lib/experiment.rb
@@ -0,0 +1,99 @@
+module OpenTox
+
+ class Experiment
+ field :dataset_ids, type: Array
+ field :model_settings, type: Array, default: []
+ field :results, type: Hash, default: {}
+
+ def run
+ dataset_ids.each do |dataset_id|
+ dataset = Dataset.find(dataset_id)
+ results[dataset_id.to_s] = []
+ model_settings.each do |setting|
+ setting = setting.dup
+ model_algorithm = setting.delete :model_algorithm #if setting[:model_algorithm]
+ model = Object.const_get(model_algorithm).create dataset, setting
+ $logger.debug model
+ model.save
+ repeated_crossvalidation = RepeatedCrossValidation.create model
+ results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id}
+ end
+ end
+ save
+ end
+
+ def report
+ # statistical significances http://www.r-bloggers.com/anova-and-tukeys-test-on-r/
+ report = {}
+ report[:name] = name
+ report[:experiment_id] = self.id.to_s
+ report[:results] = {}
+ parameters = []
+ dataset_ids.each do |dataset_id|
+ dataset_name = Dataset.find(dataset_id).name
+ report[:results][dataset_name] = {}
+ report[:results][dataset_name][:anova] = {}
+ report[:results][dataset_name][:data] = []
+ # TODO results[dataset_id.to_s] does not exist
+ results[dataset_id.to_s].each do |result|
+ model = Model::Lazar.find(result[:model_id])
+ repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id])
+ crossvalidations = repeated_cv.crossvalidations
+ if crossvalidations.first.is_a? ClassificationCrossValidation
+ parameters = [:accuracy,:true_rate,:predictivity]
+ elsif crossvalidations.first.is_a? RegressionCrossValidation
+ parameters = [:rmse,:mae,:r_squared]
+ end
+ summary = {}
+ [:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key|
+ summary[key] = model[key]
+ end
+ summary[:nr_instances] = crossvalidations.first.nr_instances
+ summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted}
+ summary[:time] = crossvalidations.collect{|cv| cv.time}
+ parameters.each do |param|
+ summary[param] = crossvalidations.collect{|cv| cv.send(param)}
+ end
+ report[:results][dataset_name][:data] << summary
+ end
+ end
+ report[:results].each do |dataset,results|
+ ([:time,:nr_unpredicted]+parameters).each do |param|
+ experiments = []
+ outcome = []
+ results[:data].each_with_index do |result,i|
+ result[param].each do |p|
+ experiments << i
+ p = nil if p.kind_of? Float and p.infinite? # TODO fix @ division by 0
+ outcome << p
+ end
+ end
+ begin
+ R.assign "experiment_nr",experiments.collect{|i| "Experiment #{i}"}
+ R.eval "experiment_nr = factor(experiment_nr)"
+ R.assign "outcome", outcome
+ R.eval "data = data.frame(experiment_nr,outcome)"
+ # one-way ANOVA
+ R.eval "fit = aov(outcome ~ experiment_nr, data=data,na.action='na.omit')"
+ # http://stackoverflow.com/questions/3366506/extract-p-value-from-aov
+ p_value = R.eval("summary(fit)[[1]][['Pr(>F)']][[1]]").to_ruby
+          # equivalent alternative:
+ # sum = R.eval("summary(fit)")
+ #p_value = sum.to_ruby.first.last.first
+ rescue
+ p_value = nil
+ end
+ report[:results][dataset][:anova][param] = p_value
+ end
+ end
+ report
+ end
+
+ def summary
+ report[:results].collect{|dataset,data| {dataset => data[:anova].select{|param,p_val| p_val < 0.1}}}
+ end
+ end
+
+end
diff --git a/lib/feature.rb b/lib/feature.rb
new file mode 100644
index 0000000..13fa6d1
--- /dev/null
+++ b/lib/feature.rb
@@ -0,0 +1,95 @@
+module OpenTox
+
+ # Basic feature class
+ class Feature
+ field :nominal, type: Boolean
+ field :numeric, type: Boolean
+ field :measured, type: Boolean
+ end
+
+ # Feature for categorical variables
+ class NominalFeature < Feature
+ # TODO check if accept_values are still needed
+ field :accept_values, type: Array
+ def initialize params
+ super params
+      self.nominal = true
+ end
+ end
+
+ # Feature for quantitative variables
+ class NumericFeature < Feature
+ def initialize params
+ super params
+      self.numeric = true
+ end
+ end
+
+ # Feature for SMARTS fragments
+ class Smarts < NominalFeature
+ field :smarts, type: String
+ index "smarts" => 1
+ def self.from_smarts smarts
+ self.find_or_create_by :smarts => smarts
+ end
+ end
+
+ # Feature for supervised fragments from Fminer algorithm
+ class FminerSmarts < Smarts
+ field :p_value, type: Float
+ # TODO check if effect is used
+ field :effect, type: String
+ field :dataset_id
+ end
+
+ # Feature for database fingerprints
+ # needs count for efficient retrieval (see compound.rb)
+ class FingerprintSmarts < Smarts
+ field :count, type: Integer
+ def self.fingerprint
+=begin
+ @@fp4 ||= OpenTox::FingerprintSmarts.all
+ unless @@fp4.size == 306
+ @@fp4 = []
+ # OpenBabel FP4 fingerprints
+ # OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html
+ # TODO investigate other types of fingerprints (MACCS)
+ # OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html
+ # http://www.dalkescientific.com/writings/diary/archive/2008/06/26/fingerprint_background.html
+ # OpenBabel MNA http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html#multilevel-neighborhoods-of-atoms-mna
+ # Morgan ECFP, FCFP
+ # http://cdk.github.io/cdk/1.5/docs/api/org/openscience/cdk/fingerprint/CircularFingerprinter.html
+ # http://www.rdkit.org/docs/GettingStartedInPython.html
+ # Chemfp
+ # https://chemfp.readthedocs.org/en/latest/using-tools.html
+ # CACTVS/PubChem
+
+ File.open(File.join(File.dirname(__FILE__),"SMARTS_InteLigand.txt")).each do |l|
+ l.strip!
+ unless l.empty? or l.match /^#/
+ name,smarts = l.split(': ')
+ @@fp4 << OpenTox::FingerprintSmarts.find_or_create_by(:name => name, :smarts => smarts) unless smarts.nil?
+ end
+ end
+ end
+ @@fp4
+=end
+ end
+ end
+
+ # Feature for physico-chemical descriptors
+ class PhysChemDescriptor < NumericFeature
+ field :algorithm, type: String, default: "OpenTox::Algorithm::Descriptor.physchem"
+ field :parameters, type: Hash
+ field :creator, type: String
+ end
+
+ # Feature for categorical bioassay results
+ class NominalBioAssay < NominalFeature
+ end
+
+ # Feature for quantitative bioassay results
+ class NumericBioAssay < NumericFeature
+ end
+
+end
diff --git a/lib/lazar.rb b/lib/lazar.rb
new file mode 100644
index 0000000..f801062
--- /dev/null
+++ b/lib/lazar.rb
@@ -0,0 +1,72 @@
+require 'rubygems'
+require "bundler/setup"
+require "rest-client"
+require 'yaml'
+require 'json'
+require 'logger'
+require 'mongoid'
+require 'rserve'
+require "nokogiri"
+require "base64"
+
+# Mongo setup
+# TODO retrieve correct environment from Rack/Sinatra
+ENV["MONGOID_ENV"] ||= "development"
+# TODO remove config files, change default via ENV or directly in Mongoid class
+Mongoid.load!("#{File.expand_path(File.join(File.dirname(__FILE__),'..','mongoid.yml'))}")
+Mongoid.raise_not_found_error = false # return nil if no document is found
+$mongo = Mongo::Client.new('mongodb://127.0.0.1:27017/opentox')
+#$mongo = Mongoid.default_client
+$gridfs = $mongo.database.fs
+
+# R setup
+R = Rserve::Connection.new
+R.eval "library(ggplot2)"
+R.eval "library(grid)"
+R.eval "library(gridExtra)"
+
+# Logger setup
+STDOUT.sync = true # for redirection, etc see http://stackoverflow.com/questions/8549443/why-doesnt-logger-output-to-stdout-get-redirected-to-files
+$logger = Logger.new STDOUT # STDERR did not work on my development machine (CH)
+$logger.level = Logger::DEBUG
+Mongo::Logger.level = Logger::WARN
+#Mongo::Logger.logger = $logger
+
+# Require sub-Repositories
+require_relative '../libfminer/libbbrc/bbrc' # include before openbabel
+require_relative '../libfminer/liblast/last' #
+require_relative '../last-utils/lu.rb'
+require_relative '../openbabel/lib/openbabel'
+
+# Fminer environment variables
+ENV['FMINER_SMARTS'] = 'true'
+ENV['FMINER_NO_AROMATIC'] = 'true'
+ENV['FMINER_PVALUES'] = 'true'
+ENV['FMINER_SILENT'] = 'true'
+ENV['FMINER_NR_HITS'] = 'true'
+
+# OpenTox classes and includes
+CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
+
+[ # be aware of the require sequence as it affects class/method overwrites
+ "overwrite.rb",
+ "rest-client-wrapper.rb",
+ "error.rb",
+ "opentox.rb",
+ "feature.rb",
+ "compound.rb",
+ "dataset.rb",
+ "descriptor.rb",
+ "algorithm.rb",
+ "bbrc.rb",
+ "model.rb",
+ "similarity.rb",
+ #"neighbor.rb",
+ "classification.rb",
+ "regression.rb",
+ "validation.rb",
+ "crossvalidation.rb",
+ "experiment.rb",
+].each{ |f| require_relative f }
+
diff --git a/lib/model.rb b/lib/model.rb
new file mode 100644
index 0000000..98433d0
--- /dev/null
+++ b/lib/model.rb
@@ -0,0 +1,265 @@
+module OpenTox
+
+ module Model
+
+ class Model
+ include OpenTox
+ include Mongoid::Document
+ include Mongoid::Timestamps
+ store_in collection: "models"
+
+ field :name, type: String
+ field :creator, type: String, default: __FILE__
+ # datasets
+ field :training_dataset_id, type: BSON::ObjectId
+ # algorithms
+ field :prediction_algorithm, type: String
+ # prediction feature
+ field :prediction_feature_id, type: BSON::ObjectId
+
+ def training_dataset
+ Dataset.find(training_dataset_id)
+ end
+ end
+
+ class Lazar < Model
+
+ # algorithms
+ field :neighbor_algorithm, type: String
+ field :neighbor_algorithm_parameters, type: Hash, default: {}
+
+    # Create a lazar model from a training dataset
+ # @param [OpenTox::Dataset] training_dataset
+ # @return [OpenTox::Model::Lazar] Regression or classification model
+ def initialize training_dataset, params={}
+
+ super params
+ bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1
+
+ # TODO document convention
+ prediction_feature = training_dataset.features.first
+ # set defaults for empty parameters
+ self.prediction_feature_id ||= prediction_feature.id
+ self.training_dataset_id ||= training_dataset.id
+ self.name ||= "#{training_dataset.name} #{prediction_feature.name}"
+ self.neighbor_algorithm_parameters ||= {}
+ self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id
+ save
+ self
+ end
+
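+    # Illustrative prediction sketch:
+    #   model = Model::LazarClassification.create training_dataset
+    #   model.predict OpenTox::Compound.from_smiles("c1ccccc1")
+    #   # => prediction hash (single compound) or a LazarPrediction dataset (dataset input)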
+ def predict object, use_database_values=true
+
+ t = Time.now
+ at = Time.now
+
+ training_dataset = Dataset.find training_dataset_id
+ prediction_feature = Feature.find prediction_feature_id
+
+ # parse data
+ compounds = []
+ case object.class.to_s
+ when "OpenTox::Compound"
+ compounds = [object]
+ when "Array"
+ compounds = object
+ when "OpenTox::Dataset"
+ compounds = object.compounds
+ else
+        bad_request_error "Please provide an OpenTox::Compound, an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter."
+ end
+
+ # make predictions
+ predictions = []
+ neighbors = []
+ compounds.each_with_index do |compound,c|
+ t = Time.new
+ database_activities = training_dataset.values(compound,prediction_feature)
+ if use_database_values and database_activities and !database_activities.empty?
+ database_activities = database_activities.first if database_activities.size == 1
+ predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."}
+ next
+ end
+ neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
+
+ # add activities
+ # TODO: improve efficiency, takes 3 times longer than previous version
+ neighbors.collect! do |n|
+ rows = training_dataset.compound_ids.each_index.select{|i| training_dataset.compound_ids[i] == n.first}
+ acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact
+ acts.empty? ? nil : n << acts
+ end
+ neighbors.compact! # remove neighbors without training activities
+ predictions << Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_size => training_dataset.data_entries.size})
+=begin
+# TODO scaled dataset for physchem
+ p neighbor_algorithm_parameters
+ p (neighbor_algorithm_parameters["feature_dataset_id"])
+ d = Dataset.find(neighbor_algorithm_parameters["feature_dataset_id"])
+ p d
+ p d.class
+ if neighbor_algorithm_parameters["feature_dataset_id"] and Dataset.find(neighbor_algorithm_parameters["feature_dataset_id"]).kind_of? ScaledDataset
+ p "SCALED"
+ end
+=end
+ end
+
+ # serialize result
+ case object.class.to_s
+ when "OpenTox::Compound"
+ prediction = predictions.first
+ prediction[:neighbors] = neighbors.sort{|a,b| b[1] <=> a[1]} # sort according to similarity
+ return prediction
+ when "Array"
+ return predictions
+ when "OpenTox::Dataset"
+ # prepare prediction dataset
+ prediction_dataset = LazarPrediction.new(
+ :name => "Lazar prediction for #{prediction_feature.name}",
+ :creator => __FILE__,
+ :prediction_feature_id => prediction_feature.id
+
+ )
+ confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Prediction confidence" )
+ # TODO move into warnings field
+ warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings")
+ prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ]
+ prediction_dataset.compounds = compounds
+ prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence], p[:warning]]}
+ prediction_dataset.save_all
+ return prediction_dataset
+ end
+
+ end
+
+ def training_activities
+ i = training_dataset.feature_ids.index prediction_feature_id
+ training_dataset.data_entries.collect{|de| de[i]}
+ end
+
+ end
+
+ class LazarClassification < Lazar
+
+ def self.create training_dataset, params={}
+ model = self.new training_dataset, params
+ model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm
+ model.neighbor_algorithm ||= "fingerprint_neighbors"
+ model.neighbor_algorithm_parameters ||= {}
+ {
+ :type => "MP2D",
+ :training_dataset_id => training_dataset.id,
+ :min_sim => 0.1
+ }.each do |key,value|
+ model.neighbor_algorithm_parameters[key] ||= value
+ end
+ model.save
+ model
+ end
+ end
+
+ class LazarRegression < Lazar
+
+ def self.create training_dataset, params={}
+ model = self.new training_dataset, params
+ model.neighbor_algorithm ||= "fingerprint_neighbors"
+ model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.weighted_average"
+ model.neighbor_algorithm_parameters ||= {}
+ {
+ :type => "MP2D",
+ :training_dataset_id => training_dataset.id,
+ :min_sim => 0.1
+ #:type => "FP4",
+ #:training_dataset_id => training_dataset.id,
+ #:min_sim => 0.7
+ }.each do |key,value|
+ model.neighbor_algorithm_parameters[key] ||= value
+ end
+ model.save
+ model
+ end
+ end
+
+ class LazarFminerClassification < LazarClassification
+ field :feature_calculation_parameters, type: Hash
+
+ def self.create training_dataset, fminer_params={}
+ model = super(training_dataset)
+ model.update "_type" => self.to_s # adjust class
+ model = self.find model.id # adjust class
+ model.neighbor_algorithm = "fminer_neighbors"
+ model.neighbor_algorithm_parameters = {
+ :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match",
+ :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset,fminer_params).id,
+ :min_sim => 0.3
+ }
+ model.feature_calculation_parameters = fminer_params
+ model.save
+ model
+ end
+ end
+
+ class Prediction
+ include OpenTox
+ include Mongoid::Document
+ include Mongoid::Timestamps
+
+ # TODO cv -> repeated cv
+ # TODO field Validations
+ field :endpoint, type: String
+ field :species, type: String
+ field :source, type: String
+ field :unit, type: String
+ field :model_id, type: BSON::ObjectId
+ field :repeated_crossvalidation_id, type: BSON::ObjectId
+
+ def predict object
+ Lazar.find(model_id).predict object
+ end
+
+ def training_dataset
+ model.training_dataset
+ end
+
+ def model
+ Lazar.find model_id
+ end
+
+ def repeated_crossvalidation
+ RepeatedCrossValidation.find repeated_crossvalidation_id
+ end
+
+ def crossvalidations
+ repeated_crossvalidation.crossvalidations
+ end
+
+ def regression?
+ training_dataset.features.first.numeric?
+ end
+
+ def classification?
+ training_dataset.features.first.nominal?
+ end
+
+ def self.from_csv_file file
+ metadata_file = file.sub(/csv$/,"json")
+ bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
+ prediction_model = self.new JSON.parse(File.read(metadata_file))
+ training_dataset = Dataset.from_csv_file file
+ model = nil
+ if training_dataset.features.first.nominal?
+ #model = LazarFminerClassification.create training_dataset
+ model = LazarClassification.create training_dataset
+ elsif training_dataset.features.first.numeric?
+ model = LazarRegression.create training_dataset
+ end
+ prediction_model[:model_id] = model.id
+ prediction_model[:repeated_crossvalidation_id] = RepeatedCrossValidation.create(model).id
+ prediction_model.save
+ prediction_model
+ end
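+    # Usage sketch (file names are placeholders): "training.csv" must be accompanied
+    # by a "training.json" file providing the metadata fields defined above.
+    #   prediction_model = Prediction.from_csv_file "training.csv"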
+ end
+
+ end
+
+end
diff --git a/lib/opentox.rb b/lib/opentox.rb
new file mode 100644
index 0000000..186c87a
--- /dev/null
+++ b/lib/opentox.rb
@@ -0,0 +1,22 @@
+module OpenTox
+
+ # Ruby interface
+
+ # create default OpenTox classes (defined in opentox-client.rb)
+ # provides Mongoid's query and persistence methods
+ # http://mongoid.org/en/mongoid/docs/persistence.html
+ # http://mongoid.org/en/mongoid/docs/querying.html
+ CLASSES.each do |klass|
+ c = Class.new do
+ include OpenTox
+ include Mongoid::Document
+ include Mongoid::Timestamps
+ store_in collection: klass.downcase.pluralize
+ field :name, type: String
+ field :warnings, type: Array, default: []
+ end
+ OpenTox.const_set klass,c
+ end
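+  # Usage sketch (SomeClass stands for any name listed in CLASSES): the generated
+  # classes are plain Mongoid models, so the usual persistence and query calls apply:
+  #   obj = OpenTox::SomeClass.new(:name => "example")
+  #   obj.save
+  #   OpenTox::SomeClass.find obj.id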
+
+end
+
diff --git a/lib/overwrite.rb b/lib/overwrite.rb
new file mode 100644
index 0000000..c92ad2b
--- /dev/null
+++ b/lib/overwrite.rb
@@ -0,0 +1,148 @@
+require "base64"
+class Object
+ # An object is blank if it's false, empty, or a whitespace string.
+ # For example, "", " ", +nil+, [], and {} are all blank.
+ def blank?
+ respond_to?(:empty?) ? empty? : !self
+ end
+
+ def numeric?
+ true if Float(self) rescue false
+ end
+
+ # Returns dimension of nested arrays
+ def dimension
+ self.class == Array ? 1 + self[0].dimension : 0
+ end
+end
+
+class Numeric
+ def percent_of(n)
+ self.to_f / n.to_f * 100.0
+ end
+end
+
+module Enumerable
+ # @return [Array] only the duplicates of an enumerable
+ def duplicates
+ inject({}) {|h,v| h[v]=h[v].to_i+1; h}.reject{|k,v| v==1}.keys
+ end
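+  # e.g. [1,2,2,3,3,3].duplicates #=> [2, 3]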
+  # http://stackoverflow.com/questions/2562256/find-most-common-string-in-an-array
+  # @return [Object] the most frequent element of an enumerable
+  def mode
+    group_by{|e| e}.values.max_by(&:size).first
+  end
+end
+
+class String
+ # @return [String] converts camel-case to underscore-case (OpenTox::SuperModel -> open_tox/super_model)
+ def underscore
+ self.gsub(/::/, '/').
+ gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
+ gsub(/([a-z\d])([A-Z])/,'\1_\2').
+ tr("-", "_").
+ downcase
+ end
+
+ # convert strings to boolean values
+ # @return [TrueClass,FalseClass] true or false
+ def to_boolean
+ return true if self == true || self =~ (/(true|t|yes|y|1)$/i)
+ return false if self == false || self.nil? || self =~ (/(false|f|no|n|0)$/i)
+ bad_request_error "invalid value for Boolean: \"#{self}\""
+ end
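+  # e.g. "yes".to_boolean #=> true, "0".to_boolean #=> false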
+
+end
+
+class File
+ # @return [String] mime_type including charset using linux cmd command
+ def mime_type
+ `file -ib '#{self.path}'`.chomp
+ end
+end
+
+class Array
+
+  # Sum up the sizes of the nested arrays in an array of arrays
+  # @return [Integer] sum of the sizes of the array elements
+ def sum_size
+ self.inject(0) { |s,a|
+ if a.respond_to?('size')
+ s+=a.size
+ else
+ internal_server_error "No size available: #{a.inspect}"
+ end
+ }
+ end
+
+  # For symbolic features
+  # @return [Boolean] whether the array has just one unique value
+ def zero_variance?
+ return self.uniq.size == 1
+ end
+
+ def median
+ sorted = self.sort
+ len = sorted.length
+ (sorted[(len - 1) / 2] + sorted[len / 2]) / 2.0
+ end
+
+ def mean
+ self.inject{ |sum, el| sum + el }.to_f / self.size
+ end
+
+ def sample_variance
+ m = self.mean
+ sum = self.inject(0){|accum, i| accum +(i-m)**2 }
+ sum/(self.length - 1).to_f
+ end
+
+ def standard_deviation
+ Math.sqrt(self.sample_variance)
+ end
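+  # Worked example for the statistics helpers above:
+  #   [1, 2, 3, 4].median             #=> 2.5
+  #   [1, 2, 3, 4].mean               #=> 2.5
+  #   [1, 2, 3, 4].sample_variance    #=> 1.67 (i.e. 5/3)
+  #   [1, 2, 3, 4].standard_deviation #=> 1.29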
+
+end
+
+module URI
+
+ def self.ssl? uri
+ URI.parse(uri).instance_of? URI::HTTPS
+ end
+
+ # @return [Boolean] checks if resource exists by making a HEAD-request
+  def self.accessible?(uri)
+    parsed_uri = URI.parse(uri + (OpenTox::RestClientWrapper.subjectid ? "?subjectid=#{CGI.escape OpenTox::RestClientWrapper.subjectid}" : ""))
+    http_code = URI.task?(uri) ? 600 : 400
+    http = Net::HTTP.new(parsed_uri.host, parsed_uri.port)
+    if URI.ssl? uri
+      http.use_ssl = true
+      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+    end
+    request = Net::HTTP::Head.new(parsed_uri.request_uri)
+    http.request(request).code.to_i < http_code
+ rescue
+ false
+ end
+
+ def self.valid? uri
+ u = URI.parse(uri)
+ u.scheme!=nil and u.host!=nil
+ rescue URI::InvalidURIError
+ false
+ end
+
+ def self.task? uri
+ uri =~ /task/ and URI.valid? uri
+ end
+
+end
diff --git a/lib/regression.rb b/lib/regression.rb
new file mode 100644
index 0000000..868c25f
--- /dev/null
+++ b/lib/regression.rb
@@ -0,0 +1,262 @@
+# TODO install R packages kernlab, caret, doMC, class, e1071
+
+
+ # log transform activities (create new dataset)
+ # scale, normalize features, might not be necessary
+ # http://stats.stackexchange.com/questions/19216/variables-are-often-adjusted-e-g-standardised-before-making-a-model-when-is
+ # http://stats.stackexchange.com/questions/7112/when-and-how-to-use-standardized-explanatory-variables-in-linear-regression
+ # zero-order correlation and the semi-partial correlation
+ # seems to be necessary for svm
+ # http://stats.stackexchange.com/questions/77876/why-would-scaling-features-decrease-svm-performance?lq=1
+ # http://stackoverflow.com/questions/15436367/svm-scaling-input-values
+ # use lasso or elastic net??
+ # select relevant features
+ # remove features with a single value
+ # remove correlated features
+ # remove features not correlated with endpoint
+module OpenTox
+ module Algorithm
+
+ class Regression
+
+ def self.weighted_average compound, params
+ weighted_sum = 0.0
+ sim_sum = 0.0
+ confidence = 0.0
+ neighbors = params[:neighbors]
+ activities = []
+ neighbors.each do |row|
+ n,sim,acts = row
+          confidence = sim if sim > confidence # similarity of the nearest neighbor
+ # TODO add LOO errors
+ acts.each do |act|
+ weighted_sum += sim*Math.log10(act)
+ activities << act
+ sim_sum += sim
+ end
+ end
+ #R.assign "activities", activities
+ #R.eval "cv = cv(activities)"
+ #confidence /= activities.standard_deviation#/activities.mean
+ #confidence = sim_sum*neighbors.size.to_f/params[:training_dataset_size]
+ #confidence = sim_sum/neighbors.size.to_f
+ #confidence = neighbors.size.to_f
+ confidence = 0 if confidence.nan?
+        prediction = sim_sum == 0 ? nil : 10**(weighted_sum/sim_sum)
+ {:value => prediction,:confidence => confidence}
+ end
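+      # Worked example for weighted_average (compound ids are placeholders): neighbor rows
+      #   [[id_a, 0.8, [10.0]], [id_b, 0.4, [100.0]]]
+      # give weighted_sum = 0.8*log10(10) + 0.4*log10(100) = 1.6 and sim_sum = 1.2,
+      # so the prediction is 10**(1.6/1.2), roughly 21.5, with confidence 0.8 (the highest similarity).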
+
+ def self.local_linear_regression compound, neighbors
+ p neighbors.size
+ return nil unless neighbors.size > 0
+ features = neighbors.collect{|n| Compound.find(n.first).fp4}.flatten.uniq
+ p features
+ training_data = Array.new(neighbors.size){Array.new(features.size,0)}
+ neighbors.each_with_index do |n,i|
+ #p n.first
+ neighbor = Compound.find n.first
+ features.each_with_index do |f,j|
+ training_data[i][j] = 1 if neighbor.fp4.include? f
+ end
+ end
+ p training_data
+
+ R.assign "activities", neighbors.collect{|n| n[2].median}
+ R.assign "features", training_data
+ R.eval "model <- lm(activities ~ features)"
+ R.eval "summary <- summary(model)"
+ p R.summary
+        compound_features = features.collect{|f| compound.fp4.include?(f) ? 1 : 0}
+ R.assign "compound_features", compound_features
+ R.eval "prediction <- predict(model,compound_features)"
+ p R.prediction
+
+ end
+
+ def self.weighted_average_with_relevant_fingerprints neighbors
+ weighted_sum = 0.0
+ sim_sum = 0.0
+ fingerprint_features = []
+ neighbors.each do |row|
+ n,sim,acts = row
+ neighbor = Compound.find n
+ fingerprint_features += neighbor.fp4
+ end
+ fingerprint_features.uniq!
+ p fingerprint_features
+=begin
+ p n
+ acts.each do |act|
+ weighted_sum += sim*Math.log10(act)
+ sim_sum += sim
+ end
+ end
+=end
+ confidence = sim_sum/neighbors.size.to_f
+        prediction = sim_sum == 0 ? nil : 10**(weighted_sum/sim_sum)
+ {:value => prediction,:confidence => confidence}
+ end
+
+ # Local support vector regression from neighbors
+      # @param [Array] neighbors, each row holding compound id, similarity, activities and propositionalized features
+      # @param [Hash] params Key `:min_train_performance` controls censoring of poorly performing models
+      # @return [Array] prediction value and confidence
+ def self.local_svm_regression neighbors, params={:min_train_performance => 0.1}
+
+ confidence = 0.0
+ prediction = nil
+
+ $logger.debug "Local SVM."
+ props = neighbors.collect{|row| row[3] }
+ neighbors.shift
+ activities = neighbors.collect{|n| n[2]}
+ prediction = self.local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting
+ prediction = nil if (!prediction.nil? && prediction.infinite?)
+ $logger.debug "Prediction: '#{prediction}' ('#{prediction.class}')."
+ if prediction
+ confidence = get_confidence({:sims => neighbors.collect{|n| n[1]}, :activities => activities})
+ else
+          confidence = nil
+ end
+ [prediction, confidence]
+
+ end
+
+
+ # Local support vector prediction from neighbors.
+ # Uses propositionalized setting.
+ # Not to be called directly (use local_svm_regression or local_svm_classification).
+ # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ]
+ # @param [Array] activities, activities for neighbors.
+ # @param [Float] min_train_performance, parameter to control censoring
+ # @return [Numeric] A prediction value.
+ def self.local_svm_prop(props, activities, min_train_performance)
+
+ $logger.debug "Local SVM (Propositionalization / Kernlab Kernel)."
+ n_prop = props[1..-1] # is a matrix, i.e. two nested Arrays.
+ q_prop = props[0] # is an Array.
+
+ prediction = nil
+ if activities.uniq.size == 1
+ prediction = activities[0]
+ else
+ t = Time.now
+ #$logger.debug gram_matrix.to_yaml
+ #@r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests
+ @r = Rserve::Connection.new#(true,false) # global R instance leads to Socket errors after a large number of requests
+ rs = []
+ ["caret", "doMC", "class"].each do |lib|
+ #raise "failed to load R-package #{lib}" unless @r.void_eval "suppressPackageStartupMessages(library('#{lib}'))"
+ rs << "suppressPackageStartupMessages(library('#{lib}'))"
+ end
+ #@r.eval "registerDoMC()" # switch on parallel processing
+ rs << "registerDoMC()" # switch on parallel processing
+ #@r.eval "set.seed(1)"
+ rs << "set.seed(1)"
+ $logger.debug "Loading R packages: #{Time.now-t}"
+ t = Time.now
+ p n_prop
+ begin
+
+ # set data
+ rs << "n_prop <- c(#{n_prop.flatten.join(',')})"
+ rs << "n_prop <- c(#{n_prop.flatten.join(',')})"
+ rs << "n_prop_x_size <- c(#{n_prop.size})"
+ rs << "n_prop_y_size <- c(#{n_prop[0].size})"
+ rs << "y <- c(#{activities.join(',')})"
+ rs << "q_prop <- c(#{q_prop.join(',')})"
+ rs << "y = matrix(y)"
+ rs << "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)"
+ rs << "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)"
+
+ $logger.debug "Setting R data: #{Time.now-t}"
+ t = Time.now
+ # prepare data
+ rs << "
+ weights=NULL
+ if (!(class(y) == 'numeric')) {
+ y = factor(y)
+ weights=unlist(as.list(prop.table(table(y))))
+ weights=(weights-1)^2
+ }
+ "
+
+ rs << "
+ rem = nearZeroVar(prop_matrix)
+ if (length(rem) > 0) {
+ prop_matrix = prop_matrix[,-rem,drop=F]
+ q_prop = q_prop[,-rem,drop=F]
+ }
+ rem = findCorrelation(cor(prop_matrix))
+ if (length(rem) > 0) {
+ prop_matrix = prop_matrix[,-rem,drop=F]
+ q_prop = q_prop[,-rem,drop=F]
+ }
+ "
+
+ #p @r.eval("y").to_ruby
+ #p "weights"
+ #p @r.eval("weights").to_ruby
+ $logger.debug "Preparing R data: #{Time.now-t}"
+ t = Time.now
+ # model + support vectors
+ #train_success = @r.eval <<-EOR
+ rs << '
+ model = train(prop_matrix,y,
+ method="svmRadial",
+ preProcess=c("center", "scale"),
+ class.weights=weights,
+ trControl=trainControl(method="LGOCV",number=10),
+ tuneLength=8
+ )
+ perf = ifelse ( class(y)!="numeric", max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
+ '
+ File.open("/tmp/r.r","w+"){|f| f.puts rs.join("\n")}
+ p rs.join("\n")
+ p `Rscript /tmp/r.r`
+=begin
+ @r.void_eval <<-EOR
+ model = train(prop_matrix,y,
+ method="svmRadial",
+ #preProcess=c("center", "scale"),
+ #class.weights=weights,
+ #trControl=trainControl(method="LGOCV",number=10),
+ #tuneLength=8
+ )
+ perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
+ EOR
+=end
+
+ $logger.debug "Creating R SVM model: #{Time.now-t}"
+ t = Time.now
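+          # NOTE: train_success is only assigned by the commented-out "train_success = @r.eval" call above;
+          # in the current Rscript-based path it is undefined, so this branch raises a NameError.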
+ if train_success
+ # prediction
+ @r.eval "predict(model,q_prop); p = predict(model,q_prop)" # kernlab bug: predict twice
+ #@r.eval "p = predict(model,q_prop)" # kernlab bug: predict twice
+ @r.eval "if (class(y)!='numeric') p = as.character(p)"
+ prediction = @r.p
+
+ # censoring
+ prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance.to_f )
+ prediction = nil if prediction =~ /NA/
+ $logger.debug "Performance: '#{sprintf("%.2f", @r.perf)}'"
+ else
+ $logger.debug "Model creation failed."
+ prediction = nil
+ end
+ $logger.debug "R Prediction: #{Time.now-t}"
+ rescue Exception => e
+ $logger.debug "#{e.class}: #{e.message}"
+ $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+ ensure
+ #puts @r.inspect
+ #TODO: broken pipe
+ #@r.quit # free R
+ end
+ end
+ prediction
+ end
+ end
+
+ end
+end
+
diff --git a/lib/rest-client-wrapper.rb b/lib/rest-client-wrapper.rb
new file mode 100644
index 0000000..de1b74f
--- /dev/null
+++ b/lib/rest-client-wrapper.rb
@@ -0,0 +1,98 @@
+module OpenTox
+
+ class RestClientWrapper
+
+ attr_accessor :request, :response
+
+ @@subjectid = nil
+
+ def self.subjectid=(subjectid)
+ @@subjectid = subjectid
+ end
+
+ def self.subjectid
+ @@subjectid
+ end
+
+ # REST methods
+ # Raises OpenTox::Error if call fails (rescued in overwrite.rb -> halt 502)
+ # Does not wait for task to finish and returns task uri
+ # @param [String] destination URI
+ # @param [optional,Hash|String] Payload data posted to the service
+ # @param [optional,Hash] Headers with params like :accept, :content_type, :subjectid, :verify_ssl
+ # @return [RestClient::Response] REST call response
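+    # Usage sketch (URIs, file name and token are placeholders):
+    #   OpenTox::RestClientWrapper.get "https://example.org/dataset/1", {}, {:accept => "text/csv"}
+    #   OpenTox::RestClientWrapper.post "https://example.org/dataset", {:file => File.new("data.csv")}, {:subjectid => token}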
+ [:head,:get,:post,:put,:delete].each do |method|
+
+ define_singleton_method method do |uri,payload={},headers={},waiting_task=nil|
+
+ # check input
+ bad_request_error "Headers are not a hash: #{headers.inspect}", uri unless headers==nil or headers.is_a?(Hash)
+        headers ||= {} # tolerate explicitly passed nil headers
+        headers[:subjectid] ||= @@subjectid
+ bad_request_error "Invalid URI: '#{uri}'", uri unless URI.valid? uri
+ #resource_not_found_error "URI '#{uri}' not found.", uri unless URI.accessible?(uri, @subjectid) unless URI.ssl?(uri)
+ # make sure that no header parameters are set in the payload
+ [:accept,:content_type,:subjectid].each do |header|
+          # skip this check for calls to the authentication service itself (if one is configured)
+          next if defined?($aa) and $aa and URI(uri).host == URI($aa[:uri]).host
+          bad_request_error "#{header} should be submitted in the headers", uri if payload and payload.is_a?(Hash) and payload[header]
+ end
+
+ # create request
+ args={}
+ args[:method] = method
+ args[:url] = uri
+ args[:verify_ssl] = 0 if headers[:verify_ssl].nil? || headers[:verify_ssl].empty?
+ args[:timeout] = 1800
+ args[:payload] = payload
+ headers.each{ |k,v| headers.delete(k) if v==nil } if headers #remove keys with empty values, as this can cause problems
+ args[:headers] = headers
+
+ $logger.debug "post to #{uri} with params #{payload.inspect.to_s[0..1000]}" if method.to_s=="post"
+
+ @request = RestClient::Request.new(args)
+ # ignore error codes from Task services (may return error codes >= 400 according to API, which causes exceptions in RestClient and RDF::Reader)
+ @response = @request.execute do |response, request, result|
+ if [301, 302, 307].include? response.code and request.method == :get
+ response.follow_redirection(request, result)
+ elsif response.code >= 400 and !URI.task?(uri)
+ #TODO add parameters to error-report
+ #parameters = request.args
+ #parameters[:headers][:subjectid] = "REMOVED" if parameters[:headers] and parameters[:headers][:subjectid]
+ #parameters[:url] = parameters[:url].gsub(/(http|https|)\:\/\/[a-zA-Z0-9\-]+\:[a-zA-Z0-9]+\@/, "REMOVED@") if parameters[:url]
+ #message += "\nREST parameters:\n#{parameters.inspect}"
+ error = known_errors.collect{|e| e if e[:code] == response.code}.compact.first
+ begin # errors are returned as error reports in json, try to parse
+ # TODO: may be the reason for failure of task.rb -n test_11_wait_for_error_task
+ content = JSON.parse(response)
+ msg = content["message"].to_s
+ cause = content["errorCause"].to_s
+ raise if msg.size==0 && cause.size==0 # parsing failed
+ rescue # parsing error failed, use complete content as message
+ msg = "Could not parse error response from rest call '#{method}' to '#{uri}':\n#{response}"
+ cause = nil
+ end
+ Object.method(error[:method]).call msg, uri, cause # call error method
+ else
+ response
+ end
+ end
+ end
+ end
+
+ #@return [Array] of hashes with error code, method and class
+ def self.known_errors
+ errors = []
+ RestClient::STATUSES.each do |code,k|
+ if code >= 400
+ method = k.underscore.gsub(/ |'/,'_')
+ method += "_error" unless method.match(/_error$/)
+ klass = method.split("_").collect{|s| s.capitalize}.join("")
+ errors << {:code => code, :method => method.to_sym, :class => klass}
+ end
+ end
+ errors
+ end
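+    # e.g. the entry generated for HTTP 400 is
+    #   {:code => 400, :method => :bad_request_error, :class => "BadRequestError"}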
+
+ end
+end
diff --git a/lib/similarity.rb b/lib/similarity.rb
new file mode 100644
index 0000000..91e18db
--- /dev/null
+++ b/lib/similarity.rb
@@ -0,0 +1,58 @@
+=begin
+* Name: similarity.rb
+* Description: Similarity algorithms
+* Author: Andreas Maunz <andreas@maunz.de>
+* Date: 10/2012
+=end
+
+module OpenTox
+ module Algorithm
+
+ class Similarity
+
+ #TODO weighted tanimoto
+
+ # Tanimoto similarity
+ # @param [Array] a fingerprints of first compound
+ # @param [Array] b fingerprints of second compound
+ # @return [Float] Tanimoto similarity
+ def self.tanimoto(a,b)
+ bad_request_error "fingerprints #{a} and #{b} don't have equal size" unless a.size == b.size
+ #common = 0.0
+ #a.each_with_index do |n,i|
+ #common += 1 if n == b[i]
+ #end
+ #common/a.size
+ # TODO check if calculation speed can be improved
+ common_p_sum = 0.0
+ all_p_sum = 0.0
+ (0...a.size).each { |idx|
+ common_p_sum += [ a[idx], b[idx] ].min
+ all_p_sum += [ a[idx], b[idx] ].max
+ }
+ common_p_sum/all_p_sum
+ end
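+      # Worked example for binary fingerprint vectors of equal length:
+      #   a = [1,0,1,1]; b = [1,1,0,1]
+      #   sum of element-wise minima = 2, sum of maxima = 4 => tanimoto(a,b) = 0.5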
+
+
+ # Cosine similarity
+ # @param [Array] a fingerprints of first compound
+ # @param [Array] b fingerprints of second compound
+ # @return [Float] Cosine similarity, the cosine of angle enclosed between vectors a and b
+ def self.cosine(a, b)
+ val = 0.0
+ if a.size>0 and b.size>0
+ if a.size>12 && b.size>12
+ a = a[0..11]
+ b = b[0..11]
+ end
+ a_vec = a.to_gv
+ b_vec = b.to_gv
+ val = a_vec.dot(b_vec) / (a_vec.norm * b_vec.norm)
+ end
+ val
+ end
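+      # Worked example: a = [1,0,1]; b = [1,1,0]
+      #   dot product = 1, both norms = sqrt(2) => cosine(a,b) = 0.5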
+
+ end
+
+ end
+end
diff --git a/lib/unique_descriptors.rb b/lib/unique_descriptors.rb
new file mode 100644
index 0000000..cf9cbf3
--- /dev/null
+++ b/lib/unique_descriptors.rb
@@ -0,0 +1,120 @@
+# set of non redundant descriptors, faster algorithms are preferred
+# TODO:
+# select logP algorithm
+# select l5 algorithm
+# use smarts matcher for atom counts
+# check correlations
+UNIQUEDESCRIPTORS = [
+ "Openbabel.abonds", #Number of aromatic bonds
+ "Openbabel.atoms", #Number of atoms
+ "Openbabel.bonds", #Number of bonds
+ "Openbabel.dbonds", #Number of double bonds
+ "Openbabel.HBA1", #Number of Hydrogen Bond Acceptors 1 (JoelLib)
+ "Openbabel.HBA2", #Number of Hydrogen Bond Acceptors 2 (JoelLib)
+ "Openbabel.HBD", #Number of Hydrogen Bond Donors (JoelLib)
+ #"Openbabel.L5", #Lipinski Rule of Five# TODO Openbabel.L5 returns nil, investigate!!!
+ "Openbabel.logP", #octanol/water partition coefficient
+ "Openbabel.MP", #Melting point
+ "Openbabel.MR", #molar refractivity
+ "Openbabel.MW", #Molecular Weight filter
+ "Openbabel.nF", #Number of Fluorine Atoms
+ "Openbabel.sbonds", #Number of single bonds
+ "Openbabel.tbonds", #Number of triple bonds
+ "Openbabel.TPSA", #topological polar surface area
+ "Cdk.ALOGP", #Calculates atom additive logP and molar refractivity values as described by Ghose and Crippen and
+ "Cdk.APol", #Descriptor that calculates the sum of the atomic polarizabilities (including implicit hydrogens).
+ "Cdk.AcidicGroupCount", #Returns the number of acidic groups.
+ "Cdk.AminoAcidCount", #Returns the number of amino acids found in the system
+ #"Cdk.AromaticAtomsCount", #Descriptor based on the number of aromatic atoms of a molecule.
+ #"Cdk.AromaticBondsCount", #Descriptor based on the number of aromatic bonds of a molecule.
+ #"Cdk.AtomCount", #Descriptor based on the number of atoms of a certain element type.
+ "Cdk.AutocorrelationCharge", #The Moreau-Broto autocorrelation descriptors using partial charges
+ "Cdk.AutocorrelationMass", #The Moreau-Broto autocorrelation descriptors using atomic weight
+ "Cdk.AutocorrelationPolarizability", #The Moreau-Broto autocorrelation descriptors using polarizability
+ "Cdk.BCUT", #Eigenvalue based descriptor noted for its utility in chemical diversity described by Pearlman et al. .
+ "Cdk.BPol", #Descriptor that calculates the sum of the absolute value of the difference between atomic polarizabilities of all bonded atoms in the molecule (including implicit hydrogens).
+ "Cdk.BasicGroupCount", #Returns the number of basic groups.
+ #"Cdk.BondCount", #Descriptor based on the number of bonds of a certain bond order.
+ "Cdk.CPSA", #A variety of descriptors combining surface area and partial charge information
+ "Cdk.CarbonTypes", #Characterizes the carbon connectivity in terms of hybridization
+ "Cdk.ChiChain", #Evaluates the Kier & Hall Chi chain indices of orders 3,4,5 and 6
+ "Cdk.ChiCluster", #Evaluates the Kier & Hall Chi cluster indices of orders 3,4,5,6 and 7
+ "Cdk.ChiPathCluster", #Evaluates the Kier & Hall Chi path cluster indices of orders 4,5 and 6
+ "Cdk.ChiPath", #Evaluates the Kier & Hall Chi path indices of orders 0,1,2,3,4,5,6 and 7
+ "Cdk.EccentricConnectivityIndex", #A topological descriptor combining distance and adjacency information.
+ "Cdk.FMF", #Descriptor characterizing molecular complexity in terms of its Murcko framework
+ "Cdk.FragmentComplexity", #Class that returns the complexity of a system. The complexity is defined as @cdk.cite{Nilakantan06}
+ "Cdk.GravitationalIndex", #Descriptor characterizing the mass distribution of the molecule.
+ #"Cdk.HBondAcceptorCount", #Descriptor that calculates the number of hydrogen bond acceptors.
+ #"Cdk.HBondDonorCount", #Descriptor that calculates the number of hydrogen bond donors.
+ "Cdk.HybridizationRatio", #Characterizes molecular complexity in terms of carbon hybridization states.
+ "Cdk.IPMolecularLearning", #Descriptor that evaluates the ionization potential.
+ "Cdk.KappaShapeIndices", #Descriptor that calculates Kier and Hall kappa molecular shape indices.
+ "Cdk.KierHallSmarts", #Counts the number of occurrences of the E-state fragments
+ "Cdk.LargestChain", #Returns the number of atoms in the largest chain
+ "Cdk.LargestPiSystem", #Returns the number of atoms in the largest pi chain
+ "Cdk.LengthOverBreadth", #Calculates the ratio of length to breadth.
+ "Cdk.LongestAliphaticChain", #Returns the number of atoms in the longest aliphatic chain
+ "Cdk.MDE", #Evaluate molecular distance edge descriptors for C, N and O
+ #"Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms .
+ "Cdk.MomentOfInertia", #Descriptor that calculates the principal moments of inertia and ratios of the principal moments. Als calculates the radius of gyration.
+ "Cdk.PetitjeanNumber", #Descriptor that calculates the Petitjean Number of a molecule.
+ "Cdk.PetitjeanShapeIndex", #The topological and geometric shape indices described Petitjean and Bath et al. respectively. Both measure the anisotropy in a molecule.
+ "Cdk.RotatableBondsCount", #Descriptor that calculates the number of nonrotatable bonds on a molecule.
+ #"Cdk.RuleOfFive", #This Class contains a method that returns the number failures of the Lipinski's Rule Of Five.
+ #"Cdk.TPSA", #Calculation of topological polar surface area based on fragment contributions .
+ "Cdk.VABC", #Describes the volume of a molecule.
+ "Cdk.VAdjMa", #Descriptor that calculates the vertex adjacency information of a molecule.
+ "Cdk.WHIM", #Holistic descriptors described by Todeschini et al .
+ #"Cdk.Weight", #Descriptor based on the weight of atoms of a certain element type. If no element is specified, the returned value is the Molecular Weight
+ "Cdk.WeightedPath", #The weighted path (molecular ID) descriptors described by Randic. They characterize molecular branching.
+ "Cdk.WienerNumbers", #This class calculates Wiener path number and Wiener polarity number.
+ "Cdk.XLogP", #Prediction of logP based on the atom-type method called XLogP.
+ "Cdk.ZagrebIndex", #The sum of the squared atom degrees of all heavy atoms.
+ "Joelib.count.NumberOfS", #no description available
+ "Joelib.count.NumberOfP", #no description available
+ "Joelib.count.NumberOfO", #no description available
+ "Joelib.count.NumberOfN", #no description available
+ #"Joelib.count.AromaticBonds", #no description available
+ "Joelib.count.NumberOfI", #no description available
+ "Joelib.count.NumberOfF", #no description available
+ "Joelib.count.NumberOfC", #no description available
+ "Joelib.count.NumberOfB", #no description available
+ "Joelib.count.HydrophobicGroups", #no description available
+ #"Joelib.KierShape3", #no description available
+ #"Joelib.KierShape2", #no description available
+ #"Joelib.KierShape1", #no description available
+ #"Joelib.count.AcidicGroups", #no description available
+ "Joelib.count.AliphaticOHGroups", #no description available
+ #"Joelib.count.NumberOfAtoms", #no description available
+ "Joelib.TopologicalRadius", #no description available
+ "Joelib.GeometricalShapeCoefficient", #no description available
+ #"Joelib.MolecularWeight", #no description available
+ "Joelib.FractionRotatableBonds", #no description available
+ #"Joelib.count.HBD2", #no description available
+ #"Joelib.count.HBD1", #no description available
+ "Joelib.LogP", #no description available
+ "Joelib.GraphShapeCoefficient", #no description available
+ "Joelib.count.BasicGroups", #no description available
+ #"Joelib.count.RotatableBonds", #no description available
+ "Joelib.count.HeavyBonds", #no description available
+ "Joelib.PolarSurfaceArea", #no description available
+ #"Joelib.ZagrebIndex1", #no description available
+ "Joelib.GeometricalRadius", #no description available
+ "Joelib.count.SO2Groups", #no description available
+ "Joelib.count.AromaticOHGroups", #no description available
+ "Joelib.GeometricalDiameter", #no description available
+ #"Joelib.MolarRefractivity", #no description available
+ "Joelib.count.NumberOfCl", #no description available
+ "Joelib.count.OSOGroups", #no description available
+ "Joelib.count.NumberOfBr", #no description available
+ "Joelib.count.NO2Groups", #no description available
+ "Joelib.count.HeteroCycles", #no description available
+ #"Joelib.count.HBA2", #no description available
+ #"Joelib.count.HBA1", #no description available
+ #"Joelib.count.NumberOfBonds", #no description available
+ "Joelib.count.SOGroups", #no description available
+ "Joelib.TopologicalDiameter", #no description available
+ "Joelib.count.NumberOfHal", #no description available
+
+].sort
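+# Usage sketch (method name assumed from lib/descriptor.rb, verify against the actual API):
+#   OpenTox::Algorithm::Descriptor.physchem compound, UNIQUEDESCRIPTORS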
diff --git a/lib/validation.rb b/lib/validation.rb
new file mode 100644
index 0000000..c52ffc0
--- /dev/null
+++ b/lib/validation.rb
@@ -0,0 +1,68 @@
+module OpenTox
+
+ class Validation
+
+ field :model_id, type: BSON::ObjectId
+ field :prediction_dataset_id, type: BSON::ObjectId
+ field :crossvalidation_id, type: BSON::ObjectId
+ field :test_dataset_id, type: BSON::ObjectId
+ field :nr_instances, type: Integer
+ field :nr_unpredicted, type: Integer
+ field :predictions, type: Array
+
+ def prediction_dataset
+ Dataset.find prediction_dataset_id
+ end
+
+ def test_dataset
+ Dataset.find test_dataset_id
+ end
+
+ def model
+ Model::Lazar.find model_id
+ end
+
+ def self.create model, training_set, test_set, crossvalidation=nil
+
+ atts = model.attributes.dup # do not modify attributes from original model
+ atts["_id"] = BSON::ObjectId.new
+ atts[:training_dataset_id] = training_set.id
+ validation_model = model.class.create training_set, atts
+ validation_model.save
+ test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
+ prediction_dataset = validation_model.predict test_set_without_activities
+ predictions = []
+ nr_unpredicted = 0
+ activities = test_set.data_entries.collect{|de| de.first}
+ prediction_dataset.data_entries.each_with_index do |de,i|
+ if de[0] and de[1] and de[1].numeric?
+ activity = activities[i]
+ prediction = de.first
+ confidence = de[1]
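+        # prediction row format: [compound_id, measured activity, predicted value, confidence]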
+        predictions << [prediction_dataset.compound_ids[i], activity, prediction, confidence]
+ else
+ nr_unpredicted += 1
+ end
+ end
+ validation = self.new(
+ :model_id => validation_model.id,
+ :prediction_dataset_id => prediction_dataset.id,
+ :test_dataset_id => test_set.id,
+ :nr_instances => test_set.compound_ids.size,
+ :nr_unpredicted => nr_unpredicted,
+ :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence
+ )
+ validation.crossvalidation_id = crossvalidation.id if crossvalidation
+ validation.save
+ validation
+ end
+
+ end
+
+ class ClassificationValidation < Validation
+ end
+
+ class RegressionValidation < Validation
+ end
+
+end