diff options
author | Christoph Helma <helma@in-silico.ch> | 2015-10-08 10:43:43 +0200 |
---|---|---|
committer | Christoph Helma <helma@in-silico.ch> | 2015-10-08 10:43:43 +0200 |
commit | 1a56148aadef031c4f487bc23fda43f4ac5b7369 (patch) | |
tree | 3555c5883ed0c292b105c40c185ebba3e5bd4e3e /lib | |
parent | 394d564699756288569169ff3e198d6d7702f092 (diff) | |
parent | e3217075b602a950a0ee995fcfa731d97b5ba3eb (diff) |
new master branch
Diffstat (limited to 'lib')
-rw-r--r-- | lib/SMARTS_InteLigand.txt | 983 | ||||
-rw-r--r-- | lib/algorithm.rb | 21 | ||||
-rw-r--r-- | lib/bbrc.rb | 165 | ||||
-rw-r--r-- | lib/classification.rb | 110 | ||||
-rw-r--r-- | lib/compound.rb | 370 | ||||
-rw-r--r-- | lib/crossvalidation.rb | 302 | ||||
-rw-r--r-- | lib/dataset.rb | 350 | ||||
-rw-r--r-- | lib/descriptor.rb | 248 | ||||
-rw-r--r-- | lib/error.rb | 66 | ||||
-rw-r--r-- | lib/experiment.rb | 99 | ||||
-rw-r--r-- | lib/feature.rb | 95 | ||||
-rw-r--r-- | lib/lazar.rb | 72 | ||||
-rw-r--r-- | lib/model.rb | 265 | ||||
-rw-r--r-- | lib/opentox.rb | 22 | ||||
-rw-r--r-- | lib/overwrite.rb | 148 | ||||
-rw-r--r-- | lib/regression.rb | 262 | ||||
-rw-r--r-- | lib/rest-client-wrapper.rb | 98 | ||||
-rw-r--r-- | lib/similarity.rb | 58 | ||||
-rw-r--r-- | lib/unique_descriptors.rb | 120 | ||||
-rw-r--r-- | lib/validation.rb | 68 |
20 files changed, 3922 insertions, 0 deletions
diff --git a/lib/SMARTS_InteLigand.txt b/lib/SMARTS_InteLigand.txt new file mode 100644 index 0000000..23bc6e2 --- /dev/null +++ b/lib/SMARTS_InteLigand.txt @@ -0,0 +1,983 @@ +# +# SMARTS Patterns for Functional Group Classification +# +# written by Christian Laggner +# Copyright 2005 Inte:Ligand Software-Entwicklungs und Consulting GmbH +# +# Released under the Lesser General Public License (LGPL license) +# see http://www.gnu.org/copyleft/lesser.html +# Modified from Version 221105 +##################################################################################################### + +# General Stuff: +# These patters were written in an attempt to represent the classification of organic compounds +# from the viewpoint of an organic chemist. +# They are often very restrictive. This may be generally a good thing, but it also takes some time +# for filtering/indexing large compound sets. +# For filtering undesired groups (in druglike compounds) one will want to have more general patterns +# (e.g. you don't want *any* halide of *any* acid, *neither* aldehyde *nor* formyl esters and amides, ...). +# + +# Part I: Carbon +# ============== + + +# I.1: Carbon-Carbon Bonds +# ------------------------ + +# I.1.1 Alkanes: + +Primary_carbon: [CX4H3][#6] + +Secondary_carbon: [CX4H2]([#6])[#6] + +Tertiary_carbon: [CX4H1]([#6])([#6])[#6] + +Quaternary_carbon: [CX4]([#6])([#6])([#6])[#6] + + +# I.1.2 C-C double and Triple Bonds + +Alkene: [CX3;$([H2]),$([H1][#6]),$(C([#6])[#6])]=[CX3;$([H2]),$([H1][#6]),$(C([#6])[#6])] +# sp2 C may be substituted only by C or H - +# does not hit ketenes and allenes, nor enamines, enols and the like + +Alkyne: [CX2]#[CX2] +# non-carbon substituents (e.g. 
alkynol ethers) are rather rare, thus no further discrimination + +Allene: [CX3]=[CX2]=[CX3] + + +# I.2: One Carbon-Hetero Bond +# --------------------------- + + +# I.2.1 Alkyl Halogenides + +Alkylchloride: [ClX1][CX4] +# will also hit chloromethylethers and the like, but no chloroalkenes, -alkynes or -aromats +# a more restrictive version can be obtained by modifying the Alcohol string. + +Alkylfluoride: [FX1][CX4] + +Alkylbromide: [BrX1][CX4] + +Alkyliodide: [IX1][CX4] + + +# I.2.2 Alcohols and Ethers + +Alcohol: [OX2H][CX4;!$(C([OX2H])[O,S,#7,#15])] +# nonspecific definition, no acetals, aminals, and the like + +Primary_alcohol: [OX2H][CX4H2;!$(C([OX2H])[O,S,#7,#15])] + +Secondary_alcohol: [OX2H][CX4H;!$(C([OX2H])[O,S,#7,#15])] + +Tertiary_alcohol: [OX2H][CX4D4;!$(C([OX2H])[O,S,#7,#15])] + +Dialkylether: [OX2]([CX4;!$(C([OX2])[O,S,#7,#15,F,Cl,Br,I])])[CX4;!$(C([OX2])[O,S,#7,#15])] +# no acetals and the like; no enolethers + +Dialkylthioether: [SX2]([CX4;!$(C([OX2])[O,S,#7,#15,F,Cl,Br,I])])[CX4;!$(C([OX2])[O,S,#7,#15])] +# no acetals and the like; no enolethers + +Alkylarylether: [OX2](c)[CX4;!$(C([OX2])[O,S,#7,#15,F,Cl,Br,I])] +# no acetals and the like; no enolethers + +Diarylether: [c][OX2][c] + +Alkylarylthioether: [SX2](c)[CX4;!$(C([OX2])[O,S,#7,#15,F,Cl,Br,I])] + +Diarylthioether: [c][SX2][c] + +Oxonium: [O+;!$([O]~[!#6]);!$([S]*~[#7,#8,#15,#16])] +# can't be aromatic, thus O and not #8 + +# I.2.3 Amines + +Amine: [NX3+0,NX4+;!$([N]~[!#6]);!$([N]*~[#7,#8,#15,#16])] +# hits all amines (prim/sec/tert/quart), including ammonium salts, also enamines, but not amides, imides, aminals, ... 
+ +# the following amines include also the protonated forms + +Primary_aliph_amine: [NX3H2+0,NX4H3+;!$([N][!C]);!$([N]*~[#7,#8,#15,#16])] + +Secondary_aliph_amine: [NX3H1+0,NX4H2+;!$([N][!C]);!$([N]*~[#7,#8,#15,#16])] + +Tertiary_aliph_amine: [NX3H0+0,NX4H1+;!$([N][!C]);!$([N]*~[#7,#8,#15,#16])] + +Quaternary_aliph_ammonium: [NX4H0+;!$([N][!C]);!$([N]*~[#7,#8,#15,#16])] + +Primary_arom_amine: [NX3H2+0,NX4H3+]c + +Secondary_arom_amine: [NX3H1+0,NX4H2+;!$([N][!c]);!$([N]*~[#7,#8,#15,#16])] + +Tertiary_arom_amine: [NX3H0+0,NX4H1+;!$([N][!c]);!$([N]*~[#7,#8,#15,#16])] + +Quaternary_arom_ammonium: [NX4H0+;!$([N][!c]);!$([N]*~[#7,#8,#15,#16])] + +Secondary_mixed_amine: [NX3H1+0,NX4H2+;$([N]([c])[C]);!$([N]*~[#7,#8,#15,#16])] + +Tertiary_mixed_amine: [NX3H0+0,NX4H1+;$([N]([c])([C])[#6]);!$([N]*~[#7,#8,#15,#16])] + +Quaternary_mixed_ammonium: [NX4H0+;$([N]([c])([C])[#6][#6]);!$([N]*~[#7,#8,#15,#16])] + +Ammonium: [N+;!$([N]~[!#6]);!$(N=*);!$([N]*~[#7,#8,#15,#16])] +# only C and H substituents allowed. Quaternary or protonated amines +# NX4+ or Nv4+ is not recognized by Daylight's depictmatch if less than four C are present + + +# I.2.4 Others + +Alkylthiol: [SX2H][CX4;!$(C([SX2H])~[O,S,#7,#15])] + +Dialkylthioether: [SX2]([CX4;!$(C([SX2])[O,S,#7,#15,F,Cl,Br,I])])[CX4;!$(C([SX2])[O,S,#7,#15])] + +Alkylarylthioether: [SX2](c)[CX4;!$(C([SX2])[O,S,#7,#15])] + +Disulfide: [SX2D2][SX2D2] + +1,2-Aminoalcohol: [OX2H][CX4;!$(C([OX2H])[O,S,#7,#15,F,Cl,Br,I])][CX4;!$(C([N])[O,S,#7,#15])][NX3;!$(NC=[O,S,N])] +# does not hit alpha-amino acids, enaminoalcohols, 1,2-aminoacetals, o-aminophenols, etc. + +1,2-Diol: [OX2H][CX4;!$(C([OX2H])[O,S,#7,#15])][CX4;!$(C([OX2H])[O,S,#7,#15])][OX2H] +# does not hit alpha-hydroxy acids, enolalcohols, 1,2-hydroxyacetals, 1,2-diphenols, etc. 
+ +1,1-Diol: [OX2H][CX4;!$(C([OX2H])([OX2H])[O,S,#7,#15])][OX2H] + +Hydroperoxide: [OX2H][OX2] +#does not neccessarily have to be connected to a carbon atom, includes also hydrotrioxides + +Peroxo: [OX2D2][OX2D2] + +Organolithium_compounds: [LiX1][#6,#14] + +Organomagnesium_compounds: [MgX2][#6,#14] +# not restricted to Grignard compounds, also dialkyl Mg + +Organometallic_compounds: [!#1;!#5;!#6;!#7;!#8;!#9;!#14;!#15;!#16;!#17;!#33;!#34;!#35;!#52;!#53;!#85]~[#6;!-] +# very general, includes all metals covalently bound to carbon + + +# I.3: Two Carbon-Hetero Bonds (Carbonyl and Derivatives) +# ---------------------------- + +# I.3.1 Double Bond to Hetero + +Aldehyde: [$([CX3H][#6]),$([CX3H2])]=[OX1] +# hits aldehydes including formaldehyde + +Ketone: [#6][CX3](=[OX1])[#6] +# does not include oxo-groups connected to a (hetero-) aromatic ring + +Thioaldehyde: [$([CX3H][#6]),$([CX3H2])]=[SX1] + +Thioketone: [#6][CX3](=[SX1])[#6] +# does not include thioxo-groups connected to a (hetero-) aromatic ring + +Imine: [NX2;$([N][#6]),$([NH]);!$([N][CX3]=[#7,#8,#15,#16])]=[CX3;$([CH2]),$([CH][#6]),$([C]([#6])[#6])] +# nitrogen is not part of an amidelike strukture, nor of an aromatic ring, but can be part of an aminal or similar + +Immonium: [NX3+;!$([N][!#6]);!$([N][CX3]=[#7,#8,#15,#16])] + +Oxime: [NX2](=[CX3;$([CH2]),$([CH][#6]),$([C]([#6])[#6])])[OX2H] + +Oximether: [NX2](=[CX3;$([CH2]),$([CH][#6]),$([C]([#6])[#6])])[OX2][#6;!$(C=[#7,#8])] +# ether, not ester or amide; does not hit isoxazole + + +# I.3.2. Two Single Bonds to Hetero + +Acetal: [OX2]([#6;!$(C=[O,S,N])])[CX4;!$(C(O)(O)[!#6])][OX2][#6;!$(C=[O,S,N])] +# does not hit hydroxy-methylesters, ketenacetals, hemiacetals, orthoesters, etc. + +Hemiacetal: [OX2H][CX4;!$(C(O)(O)[!#6])][OX2][#6;!$(C=[O,S,N])] + +Aminal: [NX3v3;!$(NC=[#7,#8,#15,#16])]([#6])[CX4;!$(C(N)(N)[!#6])][NX3v3;!$(NC=[#7,#8,#15,#16])][#6] +# Ns are not part of an amide or similar. 
v3 ist to exclude nitro and similar groups + +Hemiaminal: [NX3v3;!$(NC=[#7,#8,#15,#16])]([#6])[CX4;!$(C(N)(N)[!#6])][OX2H] + +Thioacetal: [SX2]([#6;!$(C=[O,S,N])])[CX4;!$(C(S)(S)[!#6])][SX2][#6;!$(C=[O,S,N])] + +Thiohemiacetal: [SX2]([#6;!$(C=[O,S,N])])[CX4;!$(C(S)(S)[!#6])][OX2H] + +Halogen_acetal_like: [NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][FX1,ClX1,BrX1,IX1] +# hits chloromethylenethers and other reactive alkylating agents + +Acetal_like: [NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][FX1,ClX1,BrX1,IX1,NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])] +# includes all of the above and other combinations (S-C-N, hydrates, ...), but still no aminomethylenesters and similar + +Halogenmethylen_ester_and_similar: [NX3v3,SX2,OX2;$(**=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][FX1,ClX1,BrX1,IX1] +# also reactive alkylating agents. Acid does not have to be carboxylic acid, also S- and P-based acids allowed + +NOS_methylen_ester_and_similar: [NX3v3,SX2,OX2;$(**=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])] +# Same as above, but N,O or S instead of halogen. Ester/amide allowed only on one side + +Hetero_methylen_ester_and_similar: [NX3v3,SX2,OX2;$(**=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][FX1,ClX1,BrX1,IX1,NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])] +# Combination of the last two patterns + +Cyanhydrine: [NX1]#[CX2][CX4;$([CH2]),$([CH]([CX2])[#6]),$(C([CX2])([#6])[#6])][OX2H] + + +# I.3.3 Single Bond to Hetero, C=C Double Bond (Enols and Similar) + +Chloroalkene: [ClX1][CX3]=[CX3] + +Fluoroalkene: [FX1][CX3]=[CX3] + +Bromoalkene: [BrX1][CX3]=[CX3] + +Iodoalkene: [IX1][CX3]=[CX3] + +Enol: [OX2H][CX3;$([H1]),$(C[#6])]=[CX3] +# no phenols + +Endiol: [OX2H][CX3;$([H1]),$(C[#6])]=[CX3;$([H1]),$(C[#6])][OX2H] +# no 1,2-diphenols, ketenacetals, ... 
+ +Enolether: [OX2]([#6;!$(C=[N,O,S])])[CX3;$([H0][#6]),$([H1])]=[CX3] +# finds also endiodiethers, but not enolesters, no aromats + +Enolester: [OX2]([CX3]=[OX1])[#6X3;$([#6][#6]),$([H1])]=[#6X3;!$(C[OX2H])] + + +Enamine: [NX3;$([NH2][CX3]),$([NH1]([CX3])[#6]),$([N]([CX3])([#6])[#6]);!$([N]*=[#7,#8,#15,#16])][CX3;$([CH]),$([C][#6])]=[CX3] +# does not hit amines attached to aromatic rings, nor may the nitrogen be aromatic + +Thioenol: [SX2H][CX3;$([H1]),$(C[#6])]=[CX3] + +Thioenolether: [SX2]([#6;!$(C=[N,O,S])])[CX3;$(C[#6]),$([CH])]=[CX3] + + +# I.4: Three Carbon-Hetero Bonds (Carboxyl and Derivatives) +# ------------------------------ + +Acylchloride: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[ClX1] + +Acylfluoride: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[FX1] + +Acylbromide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[BrX1] + +Acyliodide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[IX1] + +Acylhalide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[FX1,ClX1,BrX1,IX1] +# all of the above + + +# The following contains all simple carboxylic combinations of O, N, S, & Hal - +# - acids, esters, amides, ... as well as a few extra cases (anhydride, hydrazide...) +# Cyclic structures (including aromats) like lactones, lactames, ... got their own +# definitions. Structures where both heteroatoms are part of an aromatic ring +# (oxazoles, imidazoles, ...) were excluded. + +Carboxylic_acid: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[$([OX2H]),$([OX1-])] +# includes carboxylate anions + +Carboxylic_ester: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[OX2][#6;!$(C=[O,N,S])] +# does not hit anhydrides or lactones + +Lactone: [#6][#6X3R](=[OX1])[#8X2][#6;!$(C=[O,N,S])] +# may also be aromatic + +Carboxylic_anhydride: [CX3;$([H0][#6]),$([H1])](=[OX1])[#8X2][CX3;$([H0][#6]),$([H1])](=[OX1]) +# anhydride formed by two carboxylic acids, no mixed anhydrides (e.g. 
between carboxylic acid and sulfuric acid); may be part of a ring, even aromatic + +Carboxylic_acid_derivative: [$([#6X3H0][#6]),$([#6X3H])](=[!#6])[!#6] +# includes most of the structures of I.4 and many more, also 1,3-heteroaromatics such as isoxazole + +Carbothioic_acid: [CX3;!R;$([C][#6]),$([CH]);$([C](=[OX1])[$([SX2H]),$([SX1-])]),$([C](=[SX1])[$([OX2H]),$([OX1-])])] +# hits both tautomeric forms, as well as anions + +Carbothioic_S_ester: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[SX2][#6;!$(C=[O,N,S])] + +Carbothioic_S_lactone: [#6][#6X3R](=[OX1])[#16X2][#6;!$(C=[O,N,S])] +# may also be aromatic + +Carbothioic_O_ester: [CX3;$([H0][#6]),$([H1])](=[SX1])[OX2][#6;!$(C=[O,N,S])] + +Carbothioic_O_lactone: [#6][#6X3R](=[SX1])[#8X2][#6;!$(C=[O,N,S])] + +Carbothioic_halide: [CX3;$([H0][#6]),$([H1])](=[SX1])[FX1,ClX1,BrX1,IX1] + +Carbodithioic_acid: [CX3;!R;$([C][#6]),$([CH]);$([C](=[SX1])[SX2H])] + +Carbodithioic_ester: [CX3;!R;$([C][#6]),$([CH]);$([C](=[SX1])[SX2][#6;!$(C=[O,N,S])])] + +Carbodithiolactone: [#6][#6X3R](=[SX1])[#16X2][#6;!$(C=[O,N,S])] + + +Amide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] +# does not hit lactames + +Primary_amide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[NX3H2] + +Secondary_amide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H1][#6;!$(C=[O,N,S])] + +Tertiary_amide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H0]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])] + +Lactam: [#6R][#6X3R](=[OX1])[#7X3;$([H1][#6;!$(C=[O,N,S])]),$([H0]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] +# cyclic amides, may also be aromatic + +Alkyl_imide: [#6X3;$([H0][#6]),$([H1])](=[OX1])[#7X3H0]([#6])[#6X3;$([H0][#6]),$([H1])](=[OX1]) +# may be part of a ring, even aromatic. only C allowed at central N. 
May also be triacyl amide + +N_hetero_imide: [#6X3;$([H0][#6]),$([H1])](=[OX1])[#7X3H0]([!#6])[#6X3;$([H0][#6]),$([H1])](=[OX1]) +# everything else than H or C at central N + +Imide_acidic: [#6X3;$([H0][#6]),$([H1])](=[OX1])[#7X3H1][#6X3;$([H0][#6]),$([H1])](=[OX1]) +# can be deprotonated + +Thioamide: [$([CX3;!R][#6]),$([CX3H;!R])](=[SX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] +# does not hit thiolactames + +Thiolactam: [#6R][#6X3R](=[SX1])[#7X3;$([H1][#6;!$(C=[O,N,S])]),$([H0]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] +# cyclic thioamides, may also be aromatic + + +Oximester: [#6X3;$([H0][#6]),$([H1])](=[OX1])[#8X2][#7X2]=,:[#6X3;$([H0]([#6])[#6]),$([H1][#6]),$([H2])] +# may also be part of a ring / aromatic + +Amidine: [NX3;!$(NC=[O,S])][CX3;$([CH]),$([C][#6])]=[NX2;!$(NC=[O,S])] +# only basic amidines, not as part of aromatic ring (e.g. imidazole) + +Hydroxamic_acid: [CX3;$([H0][#6]),$([H1])](=[OX1])[#7X3;$([H1]),$([H0][#6;!$(C=[O,N,S])])][$([OX2H]),$([OX1-])] + +Hydroxamic_acid_ester: [CX3;$([H0][#6]),$([H1])](=[OX1])[#7X3;$([H1]),$([H0][#6;!$(C=[O,N,S])])][OX2][#6;!$(C=[O,N,S])] +# does not hit anhydrides of carboxylic acids withs hydroxamic acids + + +Imidoacid: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[$([OX2H]),$([OX1-])] +# not cyclic + +Imidoacid_cyclic: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[$([OX2H]),$([OX1-])] +# the enamide-form of lactames. may be aromatic like 2-hydroxypyridine + +Imidoester: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[OX2][#6;!$(C=[O,N,S])] +# esters of the above structures. no anhydrides. 
+ +Imidolactone: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[OX2][#6;!$(C=[O,N,S])] +# no oxazoles and similar + +Imidothioacid: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[$([SX2H]),$([SX1-])] +# not cyclic + +Imidothioacid_cyclic: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[$([SX2H]),$([SX1-])] +# the enamide-form of thiolactames. may be aromatic like 2-thiopyridine + +Imidothioester: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[SX2][#6;!$(C=[O,N,S])] +# thioesters of the above structures. no anhydrides. + +Imidothiolactone: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[SX2][#6;!$(C=[O,N,S])] +# no thioxazoles and similar + +Amidine: [#7X3v3;!$(N([#6X3]=[#7X2])C=[O,S])][CX3R0;$([H1]),$([H0][#6])]=[NX2v3;!$(N(=[#6X3][#7X3])C=[O,S])] +# only basic amidines, not substituted by carbonyl or thiocarbonyl, not as part of a ring + +Imidolactam: [#6][#6X3R;$([H0](=[NX2;!$(N(=[#6X3][#7X3])C=[O,S])])[#7X3;!$(N([#6X3]=[#7X2])C=[O,S])]),$([H0](-[NX3;!$(N([#6X3]=[#7X2])C=[O,S])])=,:[#7X2;!$(N(=[#6X3][#7X3])C=[O,S])])] +# one of the two C~N bonds is part of a ring (may be aromatic), but not both - thus no imidazole + +Imidoylhalide: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[FX1,ClX1,BrX1,IX1] +# not cyclic + +Imidoylhalide_cyclic: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[FX1,ClX1,BrX1,IX1] +# may also be aromatic + +# may be ring, aromatic, substituted with carbonyls, hetero, ... +# (everything else would get too complicated) + +Amidrazone: [$([$([#6X3][#6]),$([#6X3H])](=[#7X2v3])[#7X3v3][#7X3v3]),$([$([#6X3][#6]),$([#6X3H])]([#7X3v3])=[#7X2v3][#7X3v3])] +# hits both tautomers. as above, it may be ring, aromatic, substituted with carbonyls, hetero, ... 
+ + +Alpha_aminoacid: [NX3,NX4+;!$([N]~[!#6]);!$([N]*~[#7,#8,#15,#16])][C][CX3](=[OX1])[OX2H,OX1-] +# N may be alkylated, but not part of an amide (as in peptides), ionic forms are included +# includes also non-natural aminoacids with double-bonded or two aliph./arom. substituents at alpha-C +# N may not be aromatic as in 1H-pyrrole-2-carboxylic acid + +Alpha_hydroxyacid: [OX2H][C][CX3](=[OX1])[OX2H,OX1-] + +Peptide_middle: [NX3;$([N][CX3](=[OX1])[C][NX3,NX4+])][C][CX3](=[OX1])[NX3;$([N][C][CX3](=[OX1])[NX3,OX2,OX1-])] +# finds peptidic structures which are neither C- nor N-terminal. Both neighbours must be amino-acids/peptides + +Peptide_C_term: [NX3;$([N][CX3](=[OX1])[C][NX3,NX4+])][C][CX3](=[OX1])[OX2H,OX1-] +# finds C-terminal amino acids + +Peptide_N_term: [NX3,NX4+;!$([N]~[!#6]);!$([N]*~[#7,#8,#15,#16])][C][CX3](=[OX1])[NX3;$([N][C][CX3](=[OX1])[NX3,OX2,OX1-])] +# finds N-terminal amino acids. As above, N may be substituted, but not part of an amide-bond. + + +Carboxylic_orthoester: [#6][OX2][CX4;$(C[#6]),$([CH])]([OX2][#6])[OX2][#6] +# hits also anhydride like struktures (e. g. HC(OMe)2-OC=O residues) + +Ketene: [CX3]=[CX2]=[OX1] + +Ketenacetal: [#7X2,#8X3,#16X2;$(*[#6,#14])][#6X3]([#7X2,#8X3,#16X2;$(*[#6,#14])])=[#6X3] +# includes aminals, silylacetals, ketenesters, etc. 
C=C DB is not aromatic, everything else may be + +Nitrile: [NX1]#[CX2] +# includes cyanhydrines + +Isonitrile: [CX1-]#[NX2+] + + +Vinylogous_carbonyl_or_carboxyl_derivative: [#6X3](=[OX1])[#6X3]=,:[#6X3][#7,#8,#16,F,Cl,Br,I] +# may be part of a ring, even aromatic + +Vinylogous_acid: [#6X3](=[OX1])[#6X3]=,:[#6X3][$([OX2H]),$([OX1-])] + +Vinylogous_ester: [#6X3](=[OX1])[#6X3]=,:[#6X3][#6;!$(C=[O,N,S])] + +Vinylogous_amide: [#6X3](=[OX1])[#6X3]=,:[#6X3][#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Vinylogous_halide: [#6X3](=[OX1])[#6X3]=,:[#6X3][FX1,ClX1,BrX1,IX1] + + + +# I.5: Four Carbon-Hetero Bonds (Carbonic Acid and Derivatives) +# ----------------------------- + +Carbonic_acid_dieester: [#6;!$(C=[O,N,S])][#8X2][#6X3](=[OX1])[#8X2][#6;!$(C=[O,N,S])] +# may be part of a ring, even aromatic + +Carbonic_acid_esterhalide: [#6;!$(C=[O,N,S])][OX2;!R][CX3](=[OX1])[OX2][FX1,ClX1,BrX1,IX1] + +Carbonic_acid_monoester: [#6;!$(C=[O,N,S])][OX2;!R][CX3](=[OX1])[$([OX2H]),$([OX1-])] +# unstable + +Carbonic_acid_derivatives: [!#6][#6X3](=[!#6])[!#6] + + +Thiocarbonic_acid_dieester: [#6;!$(C=[O,N,S])][#8X2][#6X3](=[SX1])[#8X2][#6;!$(C=[O,N,S])] +# may be part of a ring, even aromatic + +Thiocarbonic_acid_esterhalide: [#6;!$(C=[O,N,S])][OX2;!R][CX3](=[SX1])[OX2][FX1,ClX1,BrX1,IX1] + +Thiocarbonic_acid_monoester: [#6;!$(C=[O,N,S])][OX2;!R][CX3](=[SX1])[$([OX2H]),$([OX1-])] + + +Urea:[#7X3;!$([#7][!#6])][#6X3](=[OX1])[#7X3;!$([#7][!#6])] +# no check whether part of imide, biuret, etc. Aromatic structures are only hit if +# both N share no double bonds, like in the dioxo-form of uracil + +Thiourea: [#7X3;!$([#7][!#6])][#6X3](=[SX1])[#7X3;!$([#7][!#6])] + +Isourea: [#7X2;!$([#7][!#6])]=,:[#6X3]([#8X2&!$([#8][!#6]),OX1-])[#7X3;!$([#7][!#6])] +# O may be substituted. no check whether further amide-like bonds are present. 
Aromatic +# structures are only hit if single bonded N shares no additional double bond, like in +# the 1-hydroxy-3-oxo form of uracil + +Isothiourea: [#7X2;!$([#7][!#6])]=,:[#6X3]([#16X2&!$([#16][!#6]),SX1-])[#7X3;!$([#7][!#6])] + +Guanidine: [N;v3X3,v4X4+][CX3](=[N;v3X2,v4X3+])[N;v3X3,v4X4+] +# also hits guanidinium salts. v3 and v4 to avoid nitroamidines + +Carbaminic_acid: [NX3]C(=[OX1])[O;X2H,X1-] +# quite unstable, unlikely to be found. Also hits salts + +Urethan: [#7X3][#6](=[OX1])[#8X2][#6] +# also hits when part of a ring, no check whether the last C is part of carbonyl + +Biuret: [#7X3][#6](=[OX1])[#7X3][#6](=[OX1])[#7X3] + +Semicarbazide: [#7X3][#7X3][#6X3]([#7X3;!$([#7][#7])])=[OX1] + +Carbazide: [#7X3][#7X3][#6X3]([#7X3][#7X3])=[OX1] + +Semicarbazone: [#7X2](=[#6])[#7X3][#6X3]([#7X3;!$([#7][#7])])=[OX1] + +Carbazone: [#7X2](=[#6])[#7X3][#6X3]([#7X3][#7X3])=[OX1] + +Thiosemicarbazide: [#7X3][#7X3][#6X3]([#7X3;!$([#7][#7])])=[SX1] + +Thiocarbazide: [#7X3][#7X3][#6X3]([#7X3][#7X3])=[SX1] + +Thiosemicarbazone: [#7X2](=[#6])[#7X3][#6X3]([#7X3;!$([#7][#7])])=[SX1] + +Thiocarbazone: [#7X2](=[#6])[#7X3][#6X3]([#7X3][#7X3])=[SX1] + + +Isocyanate: [NX2]=[CX2]=[OX1] + +Cyanate: [OX2][CX2]#[NX1] + +Isothiocyanate: [NX2]=[CX2]=[SX1] + +Thiocyanate: [SX2][CX2]#[NX1] + +Carbodiimide: [NX2]=[CX2]=[NX2] + +Orthocarbonic_derivatives: [CX4H0]([O,S,#7])([O,S,#7])([O,S,#7])[O,S,#7,F,Cl,Br,I] +# halogen allowed just once, to avoid mapping to -OCF3 and similar groups (much more +# stable as for example C(OCH3)4) + + +# I.6 Aromatics +# ------------- + +# I know that this classification is not very logical, arylamines are found under I.2 ... 
+ +Phenol: [OX2H][c] + +1,2-Diphenol: [OX2H][c][c][OX2H] + +Arylchloride: [Cl][c] + +Arylfluoride: [F][c] + +Arylbromide: [Br][c] + +Aryliodide: [I][c] + +Arylthiol: [SX2H][c] + +Iminoarene: [c]=[NX2;$([H1]),$([H0][#6;!$([C]=[N,S,O])])] +# N may be substituted with H or C, but not carbonyl or similar +# aromatic atom is always C, not S or P (these are not planar when substituted) + +Oxoarene: [c]=[OX1] + +Thioarene: [c]=[SX1] + +Hetero_N_basic_H: [nX3H1+0] +# as in pyrole. uncharged to exclude pyridinium ions + +Hetero_N_basic_no_H: [nX3H0+0] +# as in N-methylpyrole. uncharged to exclude pyridinium ions + +Hetero_N_nonbasic: [nX2,nX3+] +# as in pyridine, pyridinium + +Hetero_O: [o] + +Hetero_S: [sX2] +# X2 because Daylight's depictmatch falsely describes C1=CS(=O)C=C1 as aromatic +# (is not planar because of lonepair at S) + +Heteroaromatic: [a;!c] + + +# Part II: N, S, P, Si, B +# ======================= + + +# II.1 Nitrogen +# ------------- + +Nitrite: [NX2](=[OX1])[O;$([X2]),$([X1-])] +# hits nitrous acid, its anion, esters, and other O-substituted derivatives + +Thionitrite: [SX2][NX2]=[OX1] + +Nitrate: [$([NX3](=[OX1])(=[OX1])[O;$([X2]),$([X1-])]),$([NX3+]([OX1-])(=[OX1])[O;$([X2]),$([X1-])])] +# hits nitric acid, its anion, esters, and other O-substituted derivatives + +Nitro: [$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8] +# hits nitro groups attached to C,N, ... but not nitrates + +Nitroso: [NX2](=[OX1])[!#7;!#8] +# no nitrites, no nitrosamines + +Azide: [NX1]~[NX2]~[NX2,NX1] +# hits both mesomeric forms, also anion + +Acylazide: [CX3](=[OX1])[NX2]~[NX2]~[NX1] + +Diazo: [$([#6]=[NX2+]=[NX1-]),$([#6-]-[NX2+]#[NX1])] + +Diazonium: [#6][NX2+]#[NX1] + +Nitrosamine: [#7;!$(N*=O)][NX2]=[OX1] + +Nitrosamide: [NX2](=[OX1])N-*=O +# includes nitrososulfonamides + +N-Oxide: [$([#7+][OX1-]),$([#7v5]=[OX1]);!$([#7](~[O])~[O]);!$([#7]=[#7])] +# Hits both forms. Won't hit azoxy, nitro, nitroso, or nitrate. 
+ + +Hydrazine: [NX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6]);!$(NC=[O,N,S])][NX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6]);!$(NC=[O,N,S])] +# no hydrazides + +Hydrazone: [NX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6]);!$(NC=[O,N,S])][NX2]=[#6] + +Hydroxylamine: [NX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6]);!$(NC=[O,N,S])][OX2;$([H1]),$(O[#6;!$(C=[N,O,S])])] +# no discrimination between O-, N-, and O,N-substitution + + +# II.2 Sulfur +# ----------- + +Sulfon: [$([SX4](=[OX1])(=[OX1])([#6])[#6]),$([SX4+2]([OX1-])([OX1-])([#6])[#6])] +# can't be aromatic, thus S and not #16 + +Sulfoxide: [$([SX3](=[OX1])([#6])[#6]),$([SX3+]([OX1-])([#6])[#6])] + +Sulfonium: [S+;!$([S]~[!#6]);!$([S]*~[#7,#8,#15,#16])] +# can't be aromatic, thus S and not #16 + +Sulfuric_acid: [SX4](=[OX1])(=[OX1])([$([OX2H]),$([OX1-])])[$([OX2H]),$([OX1-])] +# includes anions + +Sulfuric_monoester: [SX4](=[OX1])(=[OX1])([$([OX2H]),$([OX1-])])[OX2][#6;!$(C=[O,N,S])] + +Sulfuric_diester: [SX4](=[OX1])(=[OX1])([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])] + +Sulfuric_monoamide: [SX4](=[OX1])(=[OX1])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[$([OX2H]),$([OX1-])] + +Sulfuric_diamide: [SX4](=[OX1])(=[OX1])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Sulfuric_esteramide: [SX4](=[OX1])(=[OX1])([#7X3][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])] + +Sulfuric_derivative: [SX4D4](=[!#6])(=[!#6])([!#6])[!#6] +# everything else (would not be a "true" derivative of sulfuric acid, if one of the substituents were less electronegative +# than sulfur, but this should be very very rare, anyway) + + + +#### sulfurous acid and derivatives missing!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+ + + + +Sulfonic_acid: [SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[$([OX2H]),$([OX1-])] + +Sulfonamide: [SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Sulfonic_ester: [SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[OX2][#6;!$(C=[O,N,S])] + +Sulfonic_halide: [SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[FX1,ClX1,BrX1,IX1] + +Sulfonic_derivative: [SX4;$([H1]),$([H0][#6])](=[!#6])(=[!#6])[!#6] +# includes all of the above and many more +# for comparison: this is what "all sulfonic derivatives but not the ones above" would look like: +# [$([SX4;$([H1]),$([H0][#6])](=[!#6])(=[!#6;!O])[!#6]),$([SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[!$([FX1,ClX1,BrX1,IX1]);!$([#6]);!$([OX2H]);!$([OX1-]);!$([OX2][#6;!$(C=[O,N,S])]);!$([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])])] + + +Sulfinic_acid: [SX3;$([H1]),$([H0][#6])](=[OX1])[$([OX2H]),$([OX1-])] + +Sulfinic_amide: [SX3;$([H1]),$([H0][#6])](=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Sulfinic_ester: [SX3;$([H1]),$([H0][#6])](=[OX1])[OX2][#6;!$(C=[O,N,S])] + +Sulfinic_halide: [SX3;$([H1]),$([H0][#6])](=[OX1])[FX1,ClX1,BrX1,IX1] + +Sulfinic_derivative: [SX3;$([H1]),$([H0][#6])](=[!#6])[!#6] + +Sulfenic_acid: [SX2;$([H1]),$([H0][#6])][$([OX2H]),$([OX1-])] + +Sulfenic_amide: [SX2;$([H1]),$([H0][#6])][#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Sulfenic_ester: [SX2;$([H1]),$([H0][#6])][OX2][#6;!$(C=[O,N,S])] + +Sulfenic_halide: [SX2;$([H1]),$([H0][#6])][FX1,ClX1,BrX1,IX1] + +Sulfenic_derivative: [SX2;$([H1]),$([H0][#6])][!#6] + + +# II.3 Phosphorous +# ---------------- + +Phosphine: [PX3;$([H3]),$([H2][#6]),$([H1]([#6])[#6]),$([H0]([#6])([#6])[#6])] +# similar to amine, but less restrictive: includes also amide- and aminal-analogues + +Phosphine_oxide: 
[PX4;$([H3]=[OX1]),$([H2](=[OX1])[#6]),$([H1](=[OX1])([#6])[#6]),$([H0](=[OX1])([#6])([#6])[#6])] + +Phosphonium: [P+;!$([P]~[!#6]);!$([P]*~[#7,#8,#15,#16])] +# similar to Ammonium + +Phosphorylen: [PX4;$([H3]=[CX3]),$([H2](=[CX3])[#6]),$([H1](=[CX3])([#6])[#6]),$([H0](=[CX3])([#6])([#6])[#6])] + + +# conventions for the following acids and derivatives: +# acids find protonated and deprotonated acids +# esters do not find mixed anhydrides ( ...P-O-C(=O)) +# derivatives: subtituents which go in place of the OH and =O are not H or C (may also be O, +# thus including acids and esters) + +Phosphonic_acid: [PX4;$([H1]),$([H0][#6])](=[OX1])([$([OX2H]),$([OX1-])])[$([OX2H]),$([OX1-])] +# includes anions + +Phosphonic_monoester: [PX4;$([H1]),$([H0][#6])](=[OX1])([$([OX2H]),$([OX1-])])[OX2][#6;!$(C=[O,N,S])] + +Phosphonic_diester: [PX4;$([H1]),$([H0][#6])](=[OX1])([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])] + +Phosphonic_monoamide: [PX4;$([H1]),$([H0][#6])](=[OX1])([$([OX2H]),$([OX1-])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphonic_diamide: [PX4;$([H1]),$([H0][#6])](=[OX1])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphonic_esteramide: [PX4;$([H1]),$([H0][#6])](=[OX1])([OX2][#6;!$(C=[O,N,S])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphonic_acid_derivative: [PX4;$([H1]),$([H0][#6])](=[!#6])([!#6])[!#6] +# all of the above and much more + + +Phosphoric_acid: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([$([OX2H]),$([OX1-])])[$([OX2H]),$([OX1-])] +# includes anions + +Phosphoric_monoester: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([$([OX2H]),$([OX1-])])[OX2][#6;!$(C=[O,N,S])] + +Phosphoric_diester: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])] + +Phosphoric_triester: 
[PX4D4](=[OX1])([OX2][#6;!$(C=[O,N,S])])([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])] + +Phosphoric_monoamide: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([$([OX2H]),$([OX1-])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphoric_diamide: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphoric_triamide: [PX4D4](=[OX1])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphoric_monoestermonoamide: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([OX2][#6;!$(C=[O,N,S])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphoric_diestermonoamide: [PX4D4](=[OX1])([OX2][#6;!$(C=[O,N,S])])([OX2][#6;!$(C=[O,N,S])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphoric_monoesterdiamide: [PX4D4](=[OX1])([OX2][#6;!$(C=[O,N,S])])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphoric_acid_derivative: [PX4D4](=[!#6])([!#6])([!#6])[!#6] + + +Phosphinic_acid: [PX4;$([H2]),$([H1][#6]),$([H0]([#6])[#6])](=[OX1])[$([OX2H]),$([OX1-])] + +Phosphinic_ester: [PX4;$([H2]),$([H1][#6]),$([H0]([#6])[#6])](=[OX1])[OX2][#6;!$(C=[O,N,S])] + +Phosphinic_amide: [PX4;$([H2]),$([H1][#6]),$([H0]([#6])[#6])](=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphinic_acid_derivative: [PX4;$([H2]),$([H1][#6]),$([H0]([#6])[#6])](=[!#6])[!#6] + + +Phosphonous_acid: 
[PX3;$([H1]),$([H0][#6])]([$([OX2H]),$([OX1-])])[$([OX2H]),$([OX1-])] + +Phosphonous_monoester: [PX3;$([H1]),$([H0][#6])]([$([OX2H]),$([OX1-])])[OX2][#6;!$(C=[O,N,S])] + +Phosphonous_diester: [PX3;$([H1]),$([H0][#6])]([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])] + +Phosphonous_monoamide: [PX3;$([H1]),$([H0][#6])]([$([OX2H]),$([OX1-])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphonous_diamide: [PX3;$([H1]),$([H0][#6])]([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphonous_esteramide: [PX3;$([H1]),$([H0][#6])]([OX2][#6;!$(C=[O,N,S])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphonous_derivatives: [PX3;$([D2]),$([D3][#6])]([!#6])[!#6] + + +Phosphinous_acid: [PX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6])][$([OX2H]),$([OX1-])] + +Phosphinous_ester: [PX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6])][OX2][#6;!$(C=[O,N,S])] + +Phosphinous_amide: [PX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6])][#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphinous_derivatives: [PX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6])][!#6] + + +# II.4 Silicon +# ------------ + +Quart_silane: [SiX4]([#6])([#6])([#6])[#6] +# four C-substituents. non-reactive, non-toxic, in experimental phase for drug development + +Non-quart_silane: [SiX4;$([H1]([#6])([#6])[#6]),$([H2]([#6])[#6]),$([H3][#6]),$([H4])] +# has 1-4 hydride(s), reactive. 
Daylight's depictmatch does not add hydrogens automatically to +# the free positions at Si, thus Hs had to be added implicitly + +Silylmonohalide: [SiX4]([FX1,ClX1,BrX1,IX1])([#6])([#6])[#6] +# reagents for inserting protection groups + +Het_trialkylsilane: [SiX4]([!#6])([#6])([#6])[#6] +# mostly acid-labile protection groups such as trimethylsilyl-ethers + +Dihet_dialkylsilane: [SiX4]([!#6])([!#6])([#6])[#6] + +Trihet_alkylsilane: [SiX4]([!#6])([!#6])([!#6])[#6] + +Silicic_acid_derivative: [SiX4]([!#6])([!#6])([!#6])[!#6] +# four substituent which are neither C nor H + + +# II.5 Boron +# ---------- + +Trialkylborane: [BX3]([#6])([#6])[#6] +# also carbonyls allowed + +Boric_acid_derivatives: [BX3]([!#6])([!#6])[!#6] +# includes acids, esters, amides, ... H-substituent at B is very rare. + +Boronic_acid_derivative: [BX3]([!#6])([!#6])[!#6] +# # includes acids, esters, amides, ... + +Borohydride: [BH1,BH2,BH3,BH4] +# at least one H attached to B + +Quaternary_boron: [BX4] +# mostly borates (negative charge), in complex with Lewis-base + + + +# Part III: Some Special Patterns +# =============================== + + +# III.1 Chains +# ------------ + +# some simple chains + + + +# III.2 Rings +# ----------- + +Aromatic: a + +Heterocyclic: [!#6;!R0] +# may be aromatic or not + +Epoxide: [OX2r3]1[#6r3][#6r3]1 +# toxic/reactive. may be annelated to aromat, but must not be aromatic itself (oxirane-2,3-dione) + +NH_aziridine: [NX3H1r3]1[#6r3][#6r3]1 +# toxic/reactive according to Maybridge's garbage filter + +Spiro: [D4R;$(*(@*)(@*)(@*)@*)] +# at least two different rings can be found which are sharing just one atom. 
+# these two rings can be connected by a third ring, so it matches also some +# bridged systems, like morphine + +Annelated_rings: [R;$(*(@*)(@*)@*);!$([R2;$(*(@*)(@*)(@*)@*)])]@[R;$(*(@*)(@*)@*);!$([R2;$(*(@*)(@*)(@*)@*)])] +# two different rings sharing exactly two atoms + +Bridged_rings: [R;$(*(@*)(@*)@*);!$([D4R;$(*(@*)(@*)(@*)@*)]);!$([R;$(*(@*)(@*)@*);!$([R2;$(*(@*)(@*)(@*)@*)])]@[R;$(*(@*)(@*)@*);!$([R2;$(*(@*)(@*)(@*)@*)])])] +# part of two or more rings, not spiro, not annelated -> finds bridgehead atoms, +# but only if they are not annelated at the same time - otherwise impossible (?) +# to distinguish from non-bridgehead annelated atoms + +# some basic ring-patterns (just size, no other information): + + + + + +# III.3 Sugars and Nucleosides/Nucleotides, Steroids +# -------------------------------------------------- + +# because of the large variety of sugar derivatives, different patterns can be applied. +# The choice of patterns and their combinations will depend on the contents of the database +# e.g. natural products, nucleoside analoges with modified sugars, ... as well as on the +# desired restriction + + +Sugar_pattern_1: [OX2;$([r5]1@C@C@C(O)@C1),$([r6]1@C@C@C(O)@C(O)@C1)] +# 5 or 6-membered ring containing one O and at least one (r5) or two (r6) oxygen-substituents. + +Sugar_pattern_2: [OX2;$([r5]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)] +# 5 or 6-membered ring containing one O and an acetal-like bond at postion 2. 
+ +Sugar_pattern_combi: [OX2;$([r5]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C(O)@C1),$([r6]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C(O)@C(O)@C1)] +# combination of the two above + +Sugar_pattern_2_reducing: [OX2;$([r5]1@C(!@[OX2H1])@C@C@C1),$([r6]1@C(!@[OX2H1])@C@C@C@C1)] +# 5 or 6-membered cyclic hemi-acetal + +Sugar_pattern_2_alpha: [OX2;$([r5]1@[C@@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@[C@@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)] +# 5 or 6-membered cyclic hemi-acetal + +Sugar_pattern_2_beta: [OX2;$([r5]1@[C@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@[C@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)] +# 5 or 6-membered cyclic hemi-acetal + +##Poly_sugar_1: ([OX2;$([r5]1@C@C@C(O)@C1),$([r6]1@C@C@C(O)@C(O)@C1)].[OX2;$([r5]1@C@C@C(O)@C1),$([r6]1@C@C@C(O)@C(O)@C1)]) +# pattern1 occours more than once (in same molecule, but moieties don't have to be adjacent!) + +##Poly_sugar_2: ([OX2;$([r5]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)].[OX2;$([r5]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)]) +# pattern2 occours more than once (in same molecule, but moieties don't have to be adjacent!) + + +# III.4 Everything else... +# ------------------------ + +Conjugated_double_bond: *=*[*]=,#,:[*] + +Conjugated_tripple_bond: *#*[*]=,#,:[*] + +Cis_double_bond: */[D2]=[D2]\* +# only one single-bonded substituent on each DB-atom. no aromats. +# only found when character of DB is explicitely stated. + +Trans_double_bond: */[D2]=[D2]/* +# analog + +Mixed_anhydrides: [$(*=O),$([#16,#14,#5]),$([#7]([#6]=[OX1]))][#8X2][$(*=O),$([#16,#14,#5]),$([#7]([#6]=[OX1]))] +# should hits all combinations of two acids + +Halogen_on_hetero: [FX1,ClX1,BrX1,IX1][!#6] + +Halogen_multi_subst: [F,Cl,Br,I;!$([X1]);!$([X0-])] +# Halogen which is not mono-substituted nor an anion, e.g. chlorate. +# Most of these cases should be also filtered by Halogen_on_hetero. 
+ +Trifluoromethyl: [FX1][CX4;!$([H0][Cl,Br,I]);!$([F][C]([F])([F])[F])]([FX1])([FX1]) +# C with three F attached, connected to anything which is not another halogen + +C_ONS_bond: [#6]~[#7,#8,#16] +# probably all drug-like molecules have at least one O, N, or S connected to a C -> nice filter + +## Mixture: (*).(*) +# two or more seperate parts, may also be salt +# component-level grouping is not yet supported in Open Babel Version 2.0 + + +Charged: [!+0] + +Anion: [-1,-2,-3,-4,-5,-6,-7] + +Kation: [+1,+2,+3,+4,+5,+6,+7] + +Salt: ([-1,-2,-3,-4,-5,-6,-7]).([+1,+2,+3,+4,+5,+6,+7]) +# two or more seperate components with opposite charges + +##Zwitterion: ([-1,-2,-3,-4,-5,-6,-7].[+1,+2,+3,+4,+5,+6,+7]) +# both negative and positive charges somewhere within the same molecule. + +1,3-Tautomerizable: [$([#7X2,OX1,SX1]=*[!H0;!$([a;!n])]),$([#7X3,OX2,SX2;!H0]*=*),$([#7X3,OX2,SX2;!H0]*:n)] +# 1,3 migration of H allowed. Includes keto/enol and amide/enamide. +# Aromatic rings must stay aromatic - no keto form of phenol + +1,5-Tautomerizable: [$([#7X2,OX1,SX1]=,:**=,:*[!H0;!$([a;!n])]),$([#7X3,OX2,SX2;!H0]*=**=*),$([#7X3,OX2,SX2;!H0]*=,:**:n)] + +Rotatable_bond: [!$(*#*)&!D1]-!@[!$(*#*)&!D1] +# taken from http://www.daylight.com/support/contrib/smarts/content.html + +Michael_acceptor: [CX3]=[CX3][$([CX3]=[O,N,S]),$(C#[N]),$([S,P]=[OX1]),$([NX3]=O),$([NX3+](=O)[O-])] +# the classical case: C=C near carbonyl, nitrile, nitro, or similar +# Oxo-heteroaromats and similar are not included. + +Dicarbodiazene: [CX3](=[OX1])[NX2]=[NX2][CX3](=[OX1]) +# Michael-like acceptor, see Mitsunobu reaction + +# H-Bond_donor: + +# H-Bond_acceptor: + +# Pos_ionizable: + +# Neg_ionizable: + +# Unlikely_ions: +# O+,N-,C+,C-, ... 
+ +CH-acidic: [$([CX4;!$([H0]);!$(C[!#6;!$([P,S]=O);!$(N(~O)~O)])][$([CX3]=[O,N,S]),$(C#[N]),$([S,P]=[OX1]),$([NX3]=O),$([NX3+](=O)[O-]);!$(*[S,O,N;H1,H2]);!$([*+0][S,O;X1-])]),$([CX4;!$([H0])]1[CX3]=[CX3][CX3]=[CX3]1)] +# C-H alpha to carbony, nitro or similar, C is not double-bonded, only C, H, S,P=O and nitro substituents allowed. +# pentadiene is included. acids, their salts, prim./sec. amides, and imides are excluded. +# hits also CH-acidic_strong + +CH-acidic_strong: [CX4;!$([H0]);!$(C[!#6;!$([P,S]=O);!$(N(~O)~O)])]([$([CX3]=[O,N,S]),$(C#[N]),$([S,P]=[OX1]),$([NX3]=O),$([NX3+](=O)[O-]);!$(*[S,O,N;H1,H2]);!$([*+0][S,O;X1-])])[$([CX3]=[O,N,S]),$(C#[N]),$([S,P]=[OX1]),$([NX3]=O),$([NX3+](=O)[O-]);!$(*[S,O,N;H1,H2]);!$([*+0][S,O;X1-])] +# same as above (without pentadiene), but carbonyl or similar on two or three sides + +Chiral_center_specified: [$([*@](~*)(~*)(*)*),$([*@H](*)(*)*),$([*@](~*)(*)*),$([*@H](~*)~*)] +# Hits atoms with tetrahedral chirality, if chiral center is specified in the SMILES string +# depictmach does not find oxonium, sulfonium, or sulfoxides! + +# Chiral_center_unspecified: [$([*@?](~*)(~*)(*)*),$([*@?H](*)(*)*),$([*@?](~*)(*)*),$([*@?H](~*)~*)] +# Hits atoms with tetrahedral chirality, if chiral center is not specified in the SMILES string +# "@?" (unspecified chirality) is not yet supported in Open Babel Version 2.0 +
\ No newline at end of file diff --git a/lib/algorithm.rb b/lib/algorithm.rb new file mode 100644 index 0000000..113f847 --- /dev/null +++ b/lib/algorithm.rb @@ -0,0 +1,21 @@ +module OpenTox + + module Algorithm + + # Generic method to execute algorithms + # Algorithms should: + # - accept a Compound, an Array of Compounds or a Dataset as first argument + # - optional parameters as second argument + # - return an object corresponding to the input type as result (eg. Compound -> value, Array of Compounds -> Array of values, Dataset -> Dataset with values + # @param [OpenTox::Compound,Array,OpenTox::Dataset] Input object + # @param [Hash] Algorithm parameters + # @return Algorithm result + def self.run algorithm, object, parameters=nil + bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm." unless algorithm =~ /^OpenTox::Algorithm/ + klass,method = algorithm.split('.') + parameters.nil? ? Object.const_get(klass).send(method,object) : Object.const_get(klass).send(method,object, parameters) + end + + end +end + diff --git a/lib/bbrc.rb b/lib/bbrc.rb new file mode 100644 index 0000000..c83b9b3 --- /dev/null +++ b/lib/bbrc.rb @@ -0,0 +1,165 @@ +module OpenTox + module Algorithm + class Fminer + TABLE_OF_ELEMENTS = [ +"H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo"] + + # + # Run bbrc algorithm 
on dataset + # + # @param [OpenTox::Dataset] training dataset + # @param [optional] parameters BBRC parameters, accepted parameters are + # - min_frequency Minimum frequency (default 5) + # - feature_type Feature type, can be 'paths' or 'trees' (default "trees") + # - backbone BBRC classes, pass 'false' to switch off mining for BBRC representatives. (default "true") + # - min_chisq_significance Significance threshold (between 0 and 1) + # - nr_hits Set to "true" to get hit count instead of presence + # - get_target Set to "true" to obtain target variable as feature + # @return [OpenTox::Dataset] Fminer Dataset + def self.bbrc training_dataset, params={} + + time = Time.now + bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 + + prediction_feature = training_dataset.features.first + if params[:min_frequency] + minfreq = params[:min_frequency] + else + per_mil = 5 # value from latest version + per_mil = 8 # as suggested below + i = training_dataset.feature_ids.index prediction_feature.id + nr_labeled_cmpds = training_dataset.data_entries.select{|de| !de[i].nil?}.size + minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST + minfreq = 2 unless minfreq > 2 + minfreq = minfreq.round + end + + @bbrc ||= Bbrc::Bbrc.new + @bbrc.Reset + if prediction_feature.numeric + @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations! 
+ else + bad_request_error "No accept values for "\ + "dataset '#{training_dataset.id}' and "\ + "feature '#{prediction_feature.id}'" unless prediction_feature.accept_values + value2act = Hash[[*prediction_feature.accept_values.map.with_index]] + end + @bbrc.SetMinfreq(minfreq) + @bbrc.SetType(1) if params[:feature_type] == "paths" + @bbrc.SetBackbone(false) if params[:backbone] == "false" + @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance] + @bbrc.SetConsoleOut(false) + + params[:nr_hits] ? nr_hits = params[:nr_hits] : nr_hits = false + feature_dataset = FminerDataset.new( + :training_dataset_id => training_dataset.id, + :training_algorithm => "#{self.to_s}.bbrc", + :training_feature_id => prediction_feature.id , + :training_parameters => { + :min_frequency => minfreq, + :nr_hits => nr_hits, + :backbone => (params[:backbone] == false ? false : true) + } + + ) + feature_dataset.compounds = training_dataset.compounds + + # add data + training_dataset.compounds.each_with_index do |compound,i| + act = value2act[training_dataset.data_entries[i].first] + if act # TODO check if this works + @bbrc.AddCompound(compound.smiles,i+1) + @bbrc.AddActivity(act,i+1) + end + end + #g_median=@fminer.all_activities.values.to_scale.median + + #task.progress 10 + #step_width = 80 / @bbrc.GetNoRootNodes().to_f + + $logger.debug "BBRC setup: #{Time.now-time}" + time = Time.now + ftime = 0 + itime = 0 + rtime = 0 + + # run @bbrc + (0 .. @bbrc.GetNoRootNodes()-1).each do |j| + results = @bbrc.MineRoot(j) + results.each do |result| + rt = Time.now + f = YAML.load(result)[0] + smarts = f.shift + # convert fminer SMARTS representation into a more human readable format + smarts.gsub!(%r{\[#(\d+)&(\w)\]}) do + element = TABLE_OF_ELEMENTS[$1.to_i-1] + $2 == "a" ? element.downcase : element + end + p_value = f.shift + f.flatten! 
+ compound_idxs = f.collect{|e| e.first.first-1} + # majority class + effect = compound_idxs.collect{|i| training_dataset.data_entries[i].first}.mode + +=begin + if (!@bbrc.GetRegression) + id_arrs = f[2..-1].flatten + max = OpenTox::Algorithm::Fminer.effect(f[2..-1].reverse, @fminer.db_class_sizes) # f needs reversal for bbrc + effect = max+1 + else #regression part + id_arrs = f[2] + # DV: effect calculation + f_arr=Array.new + f[2].each do |id| + id=id.keys[0] # extract id from hit count hash + f_arr.push(@fminer.all_activities[id]) + end + f_median=f_arr.to_scale.median + if g_median >= f_median + effect = 'activating' + else + effect = 'deactivating' + end + end +=end + rtime += Time.now - rt + + ft = Time.now + feature = OpenTox::FminerSmarts.find_or_create_by({ + "smarts" => smarts, + "p_value" => p_value.to_f.abs.round(5), + "effect" => effect, + "dataset_id" => feature_dataset.id + }) + feature_dataset.feature_ids << feature.id + ftime += Time.now - ft + + it = Time.now + f.each do |id_count_hash| + id_count_hash.each do |id,count| + nr_hits ? 
count = count.to_i : count = 1 + feature_dataset.data_entries[id-1] ||= [] + feature_dataset.data_entries[id-1][feature_dataset.feature_ids.size-1] = count + end + end + itime += Time.now - it + + end + end + + $logger.debug "Fminer: #{Time.now-time} (read: #{rtime}, iterate: #{itime}, find/create Features: #{ftime})" + time = Time.now + + feature_dataset.fill_nil_with 0 + + $logger.debug "Prepare save: #{Time.now-time}" + time = Time.now + feature_dataset.save_all + + $logger.debug "Save: #{Time.now-time}" + feature_dataset + + end + end + end +end diff --git a/lib/classification.rb b/lib/classification.rb new file mode 100644 index 0000000..b4b2e59 --- /dev/null +++ b/lib/classification.rb @@ -0,0 +1,110 @@ +module OpenTox + module Algorithm + + class Classification + + def self.weighted_majority_vote compound, params + neighbors = params[:neighbors] + return {:value => nil,:confidence => nil,:warning => "Cound not find similar compounds."} if neighbors.empty? + weighted_sum = {} + sim_sum = 0.0 + confidence = 0.0 + neighbors.each do |row| + n,sim,acts = row + #confidence = sim if sim > confidence # distance to nearest neighbor + acts.each do |act| + weighted_sum[act] ||= 0 + weighted_sum[act] += sim + end + end + case weighted_sum.size + when 1 + return {:value => weighted_sum.keys.first, :confidence => weighted_sum.values.first/neighbors.size.abs} + when 2 + sim_sum = weighted_sum[weighted_sum.keys[0]] + sim_sum -= weighted_sum[weighted_sum.keys[1]] + sim_sum > 0 ? prediction = weighted_sum.keys[0] : prediction = weighted_sum.keys[1] + confidence = (sim_sum/neighbors.size).abs + return {:value => prediction,:confidence => confidence} + else + bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. 
Received classes were: '#{weighted.sum.keys}'" + end + end + + # Classification with majority vote from neighbors weighted by similarity + # @param [Hash] params Keys `:activities, :sims, :value_map` are required + # @return [Numeric] A prediction value. + def self.fminer_weighted_majority_vote neighbors, training_dataset + + neighbor_contribution = 0.0 + confidence_sum = 0.0 + + $logger.debug "Weighted Majority Vote Classification." + + values = neighbors.collect{|n| n[2]}.uniq + neighbors.each do |neighbor| + i = training_dataset.compound_ids.index n.id + neighbor_weight = neighbor[1] + activity = values.index(neighbor[2]) + 1 # map values to integers > 1 + neighbor_contribution += activity * neighbor_weight + if values.size == 2 # AM: provide compat to binary classification: 1=>false 2=>true + case activity + when 1 + confidence_sum -= neighbor_weight + when 2 + confidence_sum += neighbor_weight + end + else + confidence_sum += neighbor_weight + end + end + if values.size == 2 + if confidence_sum >= 0.0 + prediction = values[1] + elsif confidence_sum < 0.0 + prediction = values[0] + end + elsif values.size == 1 # all neighbors have the same value + prediction = values[0] + else + prediction = (neighbor_contribution/confidence_sum).round # AM: new multinomial prediction + end + + confidence = (confidence_sum/neighbors.size).abs + {:value => prediction, :confidence => confidence.abs} + end + + # Local support vector regression from neighbors + # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required + # @return [Numeric] A prediction value. + def self.local_svm_classification(params) + + confidence = 0.0 + prediction = nil + + $logger.debug "Local SVM." 
+ if params[:activities].size>0 + if params[:props] + n_prop = params[:props][0].collect.to_a + q_prop = params[:props][1].collect.to_a + props = [ n_prop, q_prop ] + end + activities = params[:activities].collect.to_a + activities = activities.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification + prediction = local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting + prediction = prediction.sub(/Val/,"") if prediction # Convert back + confidence = 0.0 if prediction.nil? + #$logger.debug "Prediction: '" + prediction.to_s + "' ('#{prediction.class}')." + confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]}) + end + {:value => prediction, :confidence => confidence} + + end + + + + end + + end +end + diff --git a/lib/compound.rb b/lib/compound.rb new file mode 100644 index 0000000..7a3dc5c --- /dev/null +++ b/lib/compound.rb @@ -0,0 +1,370 @@ +# TODO: check +# *** Open Babel Error in ParseFile +# Could not find contribution data file. 
+ +CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/" + +module OpenTox + + class Compound + include OpenTox + + DEFAULT_FINGERPRINT = "MP2D" + + field :inchi, type: String + field :smiles, type: String + field :inchikey, type: String + field :names, type: Array + field :warning, type: String + field :cid, type: String + field :chemblid, type: String + field :png_id, type: BSON::ObjectId + field :svg_id, type: BSON::ObjectId + field :sdf_id, type: BSON::ObjectId + field :fingerprints, type: Hash, default: {} + field :default_fingerprint_size, type: Integer + + index({smiles: 1}, {unique: true}) + + # Overwrites standard Mongoid method to create fingerprints before database insertion + def self.find_or_create_by params + compound = self.find_or_initialize_by params + compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT) + compound.save + compound + end + + def fingerprint type="MP2D" + unless fingerprints[type] + return [] unless self.smiles + #http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format + if type == "MP2D" + fp = obconversion(smiles,"smi","mpd").strip.split("\t") + name = fp.shift # remove Title + fingerprints[type] = fp + #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html + elsif type== "MNA" + level = 2 # TODO: level as parameter, evaluate level 1, see paper + fp = obconversion(smiles,"smi","mna","xL\"#{level}\"").split("\n") + fp.shift # remove Title + fingerprints[type] = fp + else # standard fingerprints + fp = OpenBabel::OBFingerprint.find_fingerprint(type) + obmol = OpenBabel::OBMol.new + obconversion = OpenBabel::OBConversion.new + obconversion.set_in_format "smi" + obconversion.read_string obmol, self.smiles + result = OpenBabel::VectorUnsignedInt.new + fp.get_fingerprint(obmol,result) + # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i + #p OpenBabel::OBFingerprint.describe_bits(result) + # convert result to a list of the 
bits that are set + # from openbabel/scripts/python/pybel.py line 830 + # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints + result = result.to_a + bitsperint = OpenBabel::OBFingerprint.getbitsperint() + bits_set = [] + start = 1 + result.each do |x| + i = start + while x > 0 do + bits_set << i if (x % 2) == 1 + x >>= 1 + i += 1 + end + start += bitsperint + end + fingerprints[type] = bits_set + end + save + end + fingerprints[type] + end + + # Create a compound from smiles string + # @example + # compound = OpenTox::Compound.from_smiles("c1ccccc1") + # @param [String] smiles Smiles string + # @return [OpenTox::Compound] Compound + def self.from_smiles smiles + smiles = obconversion(smiles,"smi","can") + if smiles.empty? + return nil + #Compound.find_or_create_by(:warning => "SMILES parsing failed for '#{smiles}', this may be caused by an incorrect SMILES string.") + else + Compound.find_or_create_by :smiles => obconversion(smiles,"smi","can") + end + end + + # Create a compound from inchi string + # @param inchi [String] smiles InChI string + # @return [OpenTox::Compound] Compound + def self.from_inchi inchi + # Temporary workaround for OpenBabels Inchi bug + # http://sourceforge.net/p/openbabel/bugs/957/ + # bug has not been fixed in latest git/development version + #smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip + smiles = obconversion(inchi,"inchi","can") + if smiles.empty? 
+ Compound.find_or_create_by(:warning => "InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries.") + else + Compound.find_or_create_by(:smiles => smiles, :inchi => inchi) + end + end + + # Create a compound from sdf string + # @param sdf [String] smiles SDF string + # @return [OpenTox::Compound] Compound + def self.from_sdf sdf + # do not store sdf because it might be 2D + Compound.from_smiles obconversion(sdf,"sdf","can") + end + + # Create a compound from name. Relies on an external service for name lookups. + # @example + # compound = OpenTox::Compound.from_name("Benzene") + # @param name [String] can be also an InChI/InChiKey, CAS number, etc + # @return [OpenTox::Compound] Compound + def self.from_name name + Compound.from_smiles RestClientWrapper.get(File.join(CACTUS_URI,URI.escape(name),"smiles")) + end + + # Get InChI + # @return [String] InChI string + def inchi + unless self["inchi"] + + result = obconversion(smiles,"smi","inchi") + #result = `echo "#{self.smiles}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -ismi - -oinchi`.chomp + update(:inchi => result.chomp) if result and !result.empty? + end + self["inchi"] + end + + # Get InChIKey + # @return [String] InChIKey string + def inchikey + update(:inchikey => obconversion(smiles,"smi","inchikey")) unless self["inchikey"] + self["inchikey"] + end + + # Get (canonical) smiles + # @return [String] Smiles string + def smiles + update(:smiles => obconversion(self["smiles"],"smi","can")) unless self["smiles"] + self["smiles"] + end + + # Get sdf + # @return [String] SDF string + def sdf + if self.sdf_id.nil? 
+ sdf = obconversion(smiles,"smi","sdf") + file = Mongo::Grid::File.new(sdf, :filename => "#{id}.sdf",:content_type => "chemical/x-mdl-sdfile") + sdf_id = $gridfs.insert_one file + update :sdf_id => sdf_id + end + $gridfs.find_one(_id: self.sdf_id).data + end + + # Get SVG image + # @return [image/svg] Image data + def svg + if self.svg_id.nil? + svg = obconversion(smiles,"smi","svg") + file = Mongo::Grid::File.new(svg, :filename => "#{id}.svg", :content_type => "image/svg") + update(:svg_id => $gridfs.insert_one(file)) + end + $gridfs.find_one(_id: self.svg_id).data + + end + + # Get png image + # @example + # image = compound.png + # @return [image/png] Image data + def png + if self.png_id.nil? + png = obconversion(smiles,"smi","_png2") + file = Mongo::Grid::File.new(Base64.encode64(png), :filename => "#{id}.png", :content_type => "image/png") + update(:png_id => $gridfs.insert_one(file)) + end + Base64.decode64($gridfs.find_one(_id: self.png_id).data) + + end + + # Get all known compound names. Relies on an external service for name lookups. 
+ # @example + # names = compound.names + # @return [String] Compound names + def names + update(:names => RestClientWrapper.get("#{CACTUS_URI}#{inchi}/names").split("\n")) unless self["names"] + self["names"] + end + + # @return [String] PubChem Compound Identifier (CID), derieved via restcall to pubchem + def cid + pug_uri = "http://pubchem.ncbi.nlm.nih.gov/rest/pug/" + update(:cid => RestClientWrapper.post(File.join(pug_uri, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"] + self["cid"] + end + + # @return [String] ChEMBL database compound id, derieved via restcall to chembl + def chemblid + # https://www.ebi.ac.uk/chembldb/ws#individualCompoundByInChiKey + uri = "http://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json" + update(:chemblid => JSON.parse(RestClientWrapper.get(uri))["compounds"].first["chemblId"]) unless self["chemblid"] + self["chemblid"] + end + + def fingerprint_count_neighbors params + # TODO fix + neighbors = [] + query_fingerprint = self.fingerprint params[:type] + training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound| + unless self == compound + candidate_fingerprint = compound.fingerprint params[:type] + features = (query_fingerprint + candidate_fingerprint).uniq + min_sum = 0 + max_sum = 0 + features.each do |f| + min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax + min_sum += min + max_sum += max + end + max_sum == 0 ? sim = 0 : sim = min_sum/max_sum.to_f + neighbors << [compound.id, sim] if sim and sim >= params[:min_sim] + end + end + neighbors.sort{|a,b| b.last <=> a.last} + end + + def fingerprint_neighbors params + bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." 
unless params[:type] and params[:training_dataset_id] and params[:min_sim] + neighbors = [] + #if params[:type] == DEFAULT_FINGERPRINT + #neighbors = db_neighbors params + #p neighbors + #else + query_fingerprint = self.fingerprint params[:type] + training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound| + unless self == compound + candidate_fingerprint = compound.fingerprint params[:type] + sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f + neighbors << [compound.id, sim] if sim >= params[:min_sim] + end + end + #end + neighbors.sort{|a,b| b.last <=> a.last} + end + + def fminer_neighbors params + bad_request_error "Incorrect parameters for Compound#fminer_neighbors. Please provide :feature_dataset_id, :min_sim." unless params[:feature_dataset_id] and params[:min_sim] + feature_dataset = Dataset.find params[:feature_dataset_id] + query_fingerprint = Algorithm::Descriptor.smarts_match(self, feature_dataset.features) + neighbors = [] + + # find neighbors + feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i| + sim = Algorithm::Similarity.tanimoto candidate_fingerprint, query_fingerprint + if sim >= params[:min_sim] + neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming + end + end + neighbors + end + + def physchem_neighbors params + feature_dataset = Dataset.find params[:feature_dataset_id] + query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors] + neighbors = [] + feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i| + # TODO implement pearson and cosine similarity separatly + R.assign "x", query_fingerprint + R.assign "y", candidate_fingerprint + # pearson r + #sim = R.eval("cor(x,y,use='complete.obs',method='pearson')").to_ruby + #p "pearson" + #p sim + #p "cosine" + sim = R.eval("x %*% y / sqrt(x%*%x * 
y%*%y)").to_ruby.first + #p sim + if sim >= params[:min_sim] + neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming + end + end + neighbors + end + + def db_neighbors params + p "DB NEIGHBORS" + p params + # TODO restrict to dataset + # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb + qn = fingerprint(params[:type]).size + #qmin = qn * threshold + #qmax = qn / threshold + #not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...) + #reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)] + aggregate = [ + #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}}, + {'$match' => {'_id' => {'$ne' => self.id}}}, # remove self + {'$project' => { + 'tanimoto' => {'$let' => { + 'vars' => {'common' => {'$size' => {'$setIntersection' => ["'$#{DEFAULT_FINGERPRINT}'", DEFAULT_FINGERPRINT]}}}, + 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [qn, '$default_fingerprint_size']}, '$$common']}]} + }}, + '_id' => 1 + }}, + {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}}, + {'$sort' => {'tanimoto' => -1}} + ] + + $mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] } + + end + + private + + def self.obconversion(identifier,input_format,output_format,option=nil) + obconversion = OpenBabel::OBConversion.new + obconversion.set_options(option, OpenBabel::OBConversion::OUTOPTIONS) if option + obmol = OpenBabel::OBMol.new + obconversion.set_in_and_out_formats input_format, output_format + return nil if identifier.nil? 
+ obconversion.read_string obmol, identifier + case output_format + when /smi|can|inchi/ + obconversion.write_string(obmol).gsub(/\s/,'').chomp + when /sdf/ + # TODO: find disconnected structures + # strip_salts + # separate + obmol.add_hydrogens + builder = OpenBabel::OBBuilder.new + builder.build(obmol) + + sdf = obconversion.write_string(obmol) +print sdf + if sdf.match(/.nan/) + + $logger.warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure" + obconversion.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS) + sdf = obconversion.write_string(obmol) + if sdf.match(/.nan/) + $logger.warn "2D generation failed for compound #{identifier}, rendering without coordinates." + obconversion.remove_option("gen2D", OpenBabel::OBConversion::GENOPTIONS) + sdf = obconversion.write_string(obmol) + end + end + sdf + else + obconversion.write_string(obmol) + end + end + + def obconversion(identifier,input_format,output_format,option=nil) + self.class.obconversion(identifier,input_format,output_format,option) + end + end +end diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb new file mode 100644 index 0000000..cbffb7c --- /dev/null +++ b/lib/crossvalidation.rb @@ -0,0 +1,302 @@ +module OpenTox + + class CrossValidation + field :validation_ids, type: Array, default: [] + field :model_id, type: BSON::ObjectId + field :folds, type: Integer + field :nr_instances, type: Integer + field :nr_unpredicted, type: Integer + field :predictions, type: Array, default: [] + field :finished_at, type: Time + + def time + finished_at - created_at + end + + def validations + validation_ids.collect{|vid| Validation.find vid} + end + + def model + Model::Lazar.find model_id + end + + def self.create model, n=10 + model.training_dataset.features.first.nominal? ? klass = ClassificationCrossValidation : klass = RegressionCrossValidation + bad_request_error "#{dataset.features.first} is neither nominal nor numeric." 
unless klass
      # NOTE(review): `klass` is assigned by the ternary above and is always
      # truthy, so this bad_request_error can never fire; `dataset` is also
      # undefined in this scope (presumably model.training_dataset) - verify.
      cv = klass.new(
        name: model.name,
        model_id: model.id,
        folds: n
      )
      cv.save # set created_at
      nr_instances = 0
      nr_unpredicted = 0
      predictions = []
      training_dataset = Dataset.find model.training_dataset_id
      # one child process per fold; each Validation persists its results, the
      # parent collects them afterwards via the crossvalidation_id query below
      training_dataset.folds(n).each_with_index do |fold,fold_nr|
        fork do # parallel execution of validations
          $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
          t = Time.now
          validation = Validation.create(model, fold[0], fold[1],cv)
          $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
        end
      end
      Process.waitall
      cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id)
      # aggregate per-fold counts and predictions
      cv.validations.each do |validation|
        nr_instances += validation.nr_instances
        nr_unpredicted += validation.nr_unpredicted
        predictions += validation.predictions
      end
      cv.update_attributes(
        nr_instances: nr_instances,
        nr_unpredicted: nr_unpredicted,
        predictions: predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
      )
      $logger.debug "Nr unpredicted: #{nr_unpredicted}"
      cv.statistics
      cv
    end
  end

  # Crossvalidation of classification models: accuracy, (confidence-weighted)
  # confusion matrices, per-class true rates and predictivities.
  class ClassificationCrossValidation < CrossValidation

    field :accept_values, type: Array
    field :confusion_matrix, type: Array
    field :weighted_confusion_matrix, type: Array
    field :accuracy, type: Float
    field :weighted_accuracy, type: Float
    field :true_rate, type: Hash
    field :predictivity, type: Hash
    field :confidence_plot_id, type: BSON::ObjectId
    # TODO auc, f-measure (usability??)

    # Compute and persist classification statistics from self.predictions;
    # each entry is [compound_id, activity, prediction, confidence].
    # NOTE(review): the explicit [0]/[1] indexing assumes exactly two
    # accept_values (binary classification) - confirm for multi-class features.
    def statistics
      accept_values = Feature.find(model.prediction_feature_id).accept_values
      # NOTE(review): Array.new(size, 0) { ... } - the block supersedes the
      # default-value argument (Ruby emits a warning); the `,0` should be dropped.
      confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
      weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
      true_rate = {}
      predictivity = {}
      predictions.each do |pred|
        compound_id,activity,prediction,confidence = pred
        if activity and prediction and confidence.numeric?
          if prediction == activity
            if prediction == accept_values[0]
              confusion_matrix[0][0] += 1
              weighted_confusion_matrix[0][0] += confidence
            elsif prediction == accept_values[1]
              confusion_matrix[1][1] += 1
              weighted_confusion_matrix[1][1] += confidence
            end
          elsif prediction != activity
            if prediction == accept_values[0]
              confusion_matrix[0][1] += 1
              weighted_confusion_matrix[0][1] += confidence
            elsif prediction == accept_values[1]
              confusion_matrix[1][0] += 1
              weighted_confusion_matrix[1][0] += confidence
            end
          end
        else
          # NOTE(review): `nr_unpredicted += 1` creates a new block-local that
          # shadows the Mongoid field reader, so the right-hand side is nil and
          # this line raises NoMethodError when reached - should probably be
          # `self.nr_unpredicted += 1` or a local counter; verify.
          nr_unpredicted += 1 if prediction.nil?
        end
      end
      true_rate = {}
      predictivity = {}
      # per-class recall (true_rate) and precision (predictivity)
      accept_values.each_with_index do |v,i|
        true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
        predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
      end
      # total confidence mass, used to normalise the weighted accuracy
      confidence_sum = 0
      weighted_confusion_matrix.each do |r|
        r.each do |c|
          confidence_sum += c
        end
      end
      update_attributes(
        accept_values: accept_values,
        confusion_matrix: confusion_matrix,
        weighted_confusion_matrix: weighted_confusion_matrix,
        accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
        weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
        true_rate: true_rate,
        predictivity: predictivity,
        finished_at: Time.now
      )
      $logger.debug "Accuracy #{accuracy}"
    end

    # Plot accumulated accuracy over predictions sorted by confidence and
    # store the SVG in GridFS. (Definition continues past this chunk.)
    def confidence_plot
      tmpfile = "/tmp/#{id.to_s}_confidence.svg"
      accuracies = []
      confidences = []
      correct_predictions = 0
      incorrect_predictions = 0
      predictions.each do |p|
        if p[1] and p[2]
          p[1] == p [2] ?
correct_predictions += 1 : incorrect_predictions += 1
          accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
          confidences << p[3]

        end
      end
      R.assign "accuracy", accuracies
      R.assign "confidence", confidences
      R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
      R.eval "ggsave(file='#{tmpfile}', plot=image)"
      file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
      plot_id = $gridfs.insert_one(file)
      update(:confidence_plot_id => plot_id)
      $gridfs.find_one(_id: confidence_plot_id).data
    end

    #Average area under roc 0.646
    #Area under roc 0.646
    #F measure carcinogen: 0.769, noncarcinogen: 0.348
  end

  # Crossvalidation of regression models: (weighted) RMSE/MAE on log10-scale
  # errors and squared Pearson correlation of -log-transformed values.
  class RegressionCrossValidation < CrossValidation

    field :rmse, type: Float
    field :mae, type: Float
    field :weighted_rmse, type: Float
    field :weighted_mae, type: Float
    field :r_squared, type: Float
    field :correlation_plot_id, type: BSON::ObjectId
    field :confidence_plot_id, type: BSON::ObjectId

    # Compute and persist regression statistics from self.predictions;
    # each entry is [compound_id, activity, prediction, confidence].
    def statistics
      rmse = 0
      weighted_rmse = 0
      rse = 0          # NOTE(review): rse/weighted_rse/rae/weighted_rae are
      weighted_rse = 0 # initialised but never used below - dead accumulators.
      mae = 0
      weighted_mae = 0
      rae = 0
      weighted_rae = 0
      confidence_sum = 0
      predictions.each do |pred|
        compound_id,activity,prediction,confidence = pred
        if activity and prediction
          # error on the log10 scale; assumes strictly positive activities and
          # predictions (log10 of <= 0 yields NaN/-Infinity) - TODO confirm
          error = Math.log10(prediction)-Math.log10(activity)
          rmse += error**2
          weighted_rmse += confidence*error**2
          mae += error.abs
          weighted_mae += confidence*error.abs
          confidence_sum += confidence
        else
          # NOTE(review): no `warnings` field/accessor is visible on
          # CrossValidation in this file - confirm it exists upstream.
          warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
          $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
        end
      end
      x = predictions.collect{|p| p[1]}
      y = predictions.collect{|p| p[2]}
      R.assign "measurement", x
      R.assign "prediction", y
      R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
      r = R.eval("r").to_ruby

      # NOTE(review): the divisors use the full predictions.size although the
      # error sums skip unpredicted entries, biasing mae/rmse low when
      # predictions are missing - verify this is intended.
      mae = mae/predictions.size
      weighted_mae = weighted_mae/confidence_sum
      rmse = Math.sqrt(rmse/predictions.size)
      weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
      update_attributes(
        mae: mae,
        rmse: rmse,
        weighted_mae: weighted_mae,
        weighted_rmse: weighted_rmse,
        r_squared: r**2,
        finished_at: Time.now
      )
      $logger.debug "R^2 #{r**2}"
      $logger.debug "RMSE #{rmse}"
      $logger.debug "MAE #{mae}"
    end

    # Return the n predictions with the largest relative error, together with
    # their neighbors. (Definition continues past this chunk.)
    # @param n [Integer,nil] number of entries to return (default 10)
    def misclassifications n=nil
      #n = predictions.size unless n
      n ||= 10
      model = Model::Lazar.find(self.model_id)
      training_dataset = Dataset.find(model.training_dataset_id)
      prediction_feature = training_dataset.features.first
      predictions.collect do |p|
        unless p.include? nil
          compound = Compound.find(p[0])
          neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters)
          neighbors.collect!
do |n| + neighbor = Compound.find(n[0]) + values = training_dataset.values(neighbor,prediction_feature) + { :smiles => neighbor.smiles, :similarity => n[1], :measurements => values} + end + { + :smiles => compound.smiles, + #:fingerprint => compound.fp4.collect{|id| Smarts.find(id).name}, + :measured => p[1], + :predicted => p[2], + #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs, + :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs, + :relative_error => (p[1]-p[2]).abs/p[1], + :confidence => p[3], + :neighbors => neighbors + } + end + end.compact.sort{|a,b| p a; b[:relative_error] <=> a[:relative_error]}[0..n-1] + end + + def confidence_plot + tmpfile = "/tmp/#{id.to_s}_confidence.svg" + sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact + R.assign "error", sorted_predictions.collect{|p| p[0]} + R.assign "confidence", sorted_predictions.collect{|p| p[1]} + # TODO fix axis names + R.eval "image = qplot(confidence,error)" + R.eval "image = image + stat_smooth(method='lm', se=FALSE)" + R.eval "ggsave(file='#{tmpfile}', plot=image)" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") + plot_id = $gridfs.insert_one(file) + update(:confidence_plot_id => plot_id) + $gridfs.find_one(_id: confidence_plot_id).data + end + + def correlation_plot + unless correlation_plot_id + tmpfile = "/tmp/#{id.to_s}_correlation.svg" + x = predictions.collect{|p| p[1]} + y = predictions.collect{|p| p[2]} + attributes = Model::Lazar.find(self.model_id).attributes + attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key} + attributes = attributes.values.collect{|v| v.is_a?(String) ? 
v.sub(/OpenTox::/,'') : v}.join("\n") + R.assign "measurement", x + R.assign "prediction", y + R.eval "all = c(-log(measurement),-log(prediction))" + R.eval "range = c(min(all), max(all))" + R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)" + R.eval "image = image + geom_abline(intercept=0, slope=1)" + R.eval "ggsave(file='#{tmpfile}', plot=image)" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg") + plot_id = $gridfs.insert_one(file) + update(:correlation_plot_id => plot_id) + end + $gridfs.find_one(_id: correlation_plot_id).data + end + end + + class RepeatedCrossValidation + field :crossvalidation_ids, type: Array, default: [] + def self.create model, folds=10, repeats=3 + repeated_cross_validation = self.new + repeats.times do |n| + $logger.debug "Crossvalidation #{n+1} for #{model.name}" + repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id + end + repeated_cross_validation.save + repeated_cross_validation + end + def crossvalidations + crossvalidation_ids.collect{|id| CrossValidation.find(id)} + end + end + + +end diff --git a/lib/dataset.rb b/lib/dataset.rb new file mode 100644 index 0000000..60f3bb5 --- /dev/null +++ b/lib/dataset.rb @@ -0,0 +1,350 @@ +require 'csv' +require 'tempfile' + +module OpenTox + + class Dataset + + attr_writer :data_entries + + # associations like has_many, belongs_to deteriorate performance + field :feature_ids, type: Array, default: [] + field :compound_ids, type: Array, default: [] + field :data_entries_id, type: BSON::ObjectId#, default: [] + field :source, type: String + + # Save all data including data_entries + # Should be used instead of save + def save_all + dump = Marshal.dump(@data_entries) + file = Mongo::Grid::File.new(dump, :filename => "#{self.id.to_s}.data_entries") + entries_id = $gridfs.insert_one(file) + update(:data_entries_id => entries_id) + end + + # Readers + + 
    # Get all compounds
    # Lazily instantiates and caches the Compound objects for compound_ids.
    # @return [Array<OpenTox::Compound>]
    def compounds
      @compounds ||= self.compound_ids.collect{|id| OpenTox::Compound.find id}
      @compounds
    end

    # Get all features
    # Lazily instantiates and caches the Feature objects for feature_ids.
    # @return [Array<OpenTox::Feature>]
    def features
      @features ||= self.feature_ids.collect{|id| OpenTox::Feature.find(id)}
      @features
    end

    # Get all data_entries
    # Loads the Marshal-dumped 2D value array from GridFS (see save_all) and
    # caches it; validates that its dimensions match compound_ids/feature_ids.
    # @return [Array<Array>] compounds x features value matrix
    def data_entries
      unless @data_entries
        t = Time.now
        data_entry_file = $gridfs.find_one(_id: data_entries_id)
        if data_entry_file.nil?
          @data_entries = []
        else
          @data_entries = Marshal.load(data_entry_file.data)
          bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array
          unless @data_entries.first.size == feature_ids.size
            # TODO: fix (unknown) source of empty data_entries
            # NOTE(review): sleep-and-reload is a workaround for a suspected
            # GridFS race; the retried load is not re-validated afterwards.
            sleep 1
            data_entry_file = $gridfs.find_one(_id: data_entries_id)
            @data_entries = Marshal.load(data_entry_file.data)
          end
          bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size
          # TODO: data_entries can be empty, poorly reproducible, mongo problem?
          bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size
          #$logger.debug "Retrieving data: #{Time.now-t}"
        end
      end
      @data_entries
    end

    # Find data entry values for a given compound and feature
    # A compound may occur on several rows (duplicate measurements), hence an
    # Array is returned.
    # @param compound [OpenTox::Compound] OpenTox Compound object
    # @param feature [OpenTox::Feature] OpenTox Feature object
    # @return [Array] Data entry values
    def values(compound, feature)
      rows = compound_ids.each_index.select{|r| compound_ids[r] == compound.id }
      col = feature_ids.index feature.id
      rows.collect{|row| data_entries[row][col]}
    end

    # Writers

    # Set compounds
    # @param compounds [Array<OpenTox::Compound>]
    def compounds=(compounds)
      self.compound_ids = compounds.collect{|c| c.id}
    end

    # Set features
    # @param features [Array<OpenTox::Feature>]
    def features=(features)
      self.feature_ids = features.collect{|f| f.id}
    end

    # Dataset operations

    # Split a dataset into n folds
    # Rows are shuffled, then partitioned into n disjoint test sets; each fold
    # persists both datasets via save_all. (Definition continues past this chunk.)
    # @param [Integer] number of folds
    # @return [Array] Array with folds [training_dataset,test_dataset]
    def folds n
      len = self.compound_ids.size
      indices = (0..len-1).to_a.shuffle
      mid = (len/n)
      chunks = []
      start = 0
      1.upto(n) do |i|
        last = start+mid
        last = last-1 unless len%n >= i # distribute the len % n remainder rows
        test_idxs = indices[start..last] || []
        test_cids = test_idxs.collect{|i| self.compound_ids[i]}
        test_data_entries = test_idxs.collect{|i| self.data_entries[i]}
        test_dataset = self.class.new(:compound_ids => test_cids, :feature_ids => self.feature_ids, :data_entries => test_data_entries)
        training_idxs = indices-test_idxs
        training_cids = training_idxs.collect{|i| self.compound_ids[i]}
        training_data_entries = training_idxs.collect{|i| self.data_entries[i]}
        training_dataset = self.class.new(:compound_ids => training_cids, :feature_ids => self.feature_ids, :data_entries => training_data_entries)
        test_dataset.save_all
        training_dataset.save_all
        chunks << [training_dataset,test_dataset]
start = last+1 + end + chunks + end + + # Diagnostics + + def correlation_plot training_dataset + # TODO: create/store svg + R.assign "features", data_entries + R.assign "activities", training_dataset.data_entries.collect{|de| de.first} + R.eval "featurePlot(features,activities)" + end + + def density_plot + # TODO: create/store svg + R.assign "acts", data_entries.collect{|r| r.first }#.compact + R.eval "plot(density(-log(acts),na.rm= TRUE), main='-log(#{features.first.name})')" + end + + # Serialisation + + # converts dataset to csv format including compound smiles as first column, other column headers are feature names + # @return [String] + def to_csv(inchi=false) + CSV.generate() do |csv| #{:force_quotes=>true} + csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} + compounds.each_with_index do |c,i| + csv << [inchi ? c.inchi : c.smiles] + data_entries[i] + end + end + end + + + # Parsers + + # Create a dataset from file (csv,sdf,...) + # @param filename [String] + # @return [String] dataset uri + # TODO + #def self.from_sdf_file + #end + + # Create a dataset from CSV file + # TODO: document structure + def self.from_csv_file file, source=nil, bioassay=true + source ||= file + name = File.basename(file,".*") + dataset = self.find_by(:source => source, :name => name) + if dataset + $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})." + else + $logger.debug "Parsing #{file}." + table = CSV.read file, :skip_blanks => true + dataset = self.new(:source => source, :name => name) + dataset.parse_table table, bioassay + end + dataset + end + + # parse data in tabular format (e.g. from csv) + # does a lot of guesswork in order to determine feature types + def parse_table table, bioassay=true + + time = Time.now + + # features + feature_names = table.shift.collect{|f| f.strip} + warnings << "Duplicate features in table header." 
unless feature_names.size == feature_names.uniq.size + compound_format = feature_names.shift.strip + bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i + + numeric = [] + # guess feature types + feature_names.each_with_index do |f,i| + metadata = {:name => f} + values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact + types = values.collect{|v| v.numeric? ? true : false}.uniq + if values.size == 0 # empty feature + elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes + metadata["numeric"] = true + numeric[i] = true + else + metadata["nominal"] = true + metadata["accept_values"] = values + numeric[i] = false + end + if bioassay + if metadata["numeric"] + feature = NumericBioAssay.find_or_create_by(metadata) + elsif metadata["nominal"] + feature = NominalBioAssay.find_or_create_by(metadata) + end + else + metadata.merge({:measured => false, :calculated => true}) + if metadata["numeric"] + feature = NumericFeature.find_or_create_by(metadata) + elsif metadata["nominal"] + feature = NominalFeature.find_or_create_by(metadata) + end + end + feature_ids << feature.id if feature + end + + $logger.debug "Feature values: #{Time.now-time}" + time = Time.now + + r = -1 + compound_time = 0 + value_time = 0 + + # compounds and values + @data_entries = [] #Array.new(table.size){Array.new(table.first.size-1)} + + table.each_with_index do |vals,i| + ct = Time.now + identifier = vals.shift + warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty? + begin + case compound_format + when /SMILES/i + compound = OpenTox::Compound.from_smiles(identifier) + when /InChI/i + compound = OpenTox::Compound.from_inchi(identifier) + end + rescue + compound = nil + end + if compound.nil? 
+ # compound parsers may return nil + warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored." + next + end + # TODO insert empty compounds to keep positions? + compound_time += Time.now-ct + + r += 1 + unless vals.size == feature_ids.size # way cheaper than accessing features + warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored." + next + end + + compound_ids << compound.id + table.first.size == 0 ? @data_entries << Array.new(0) : @data_entries << Array.new(table.first.size-1) + + vals.each_with_index do |v,j| + if v.blank? + warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})." + next + elsif numeric[j] + @data_entries.last[j] = v.to_f + else + @data_entries.last[j] = v.strip + end + end + end + compounds.duplicates.each do |compound| + positions = [] + compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi} + warnings << "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." + end + + $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})" + time = Time.now + save_all + $logger.debug "Saving: #{Time.now-time}" + + end + + # Fill unset data entries + # @param any value + def fill_nil_with n + (0 .. compound_ids.size-1).each do |i| + @data_entries[i] ||= [] + (0 .. 
feature_ids.size-1).each do |j| + @data_entries[i][j] ||= n + end + end + end + + def scale + scaled_data_entries = Array.new(data_entries.size){Array.new(data_entries.first.size)} + centers = [] + scales = [] + feature_ids.each_with_index do |feature_id,col| + R.assign "x", data_entries.collect{|de| de[col]} + R.eval "scaled = scale(x,center=T,scale=T)" + centers[col] = R.eval("attr(scaled, 'scaled:center')").to_ruby + scales[col] = R.eval("attr(scaled, 'scaled:scale')").to_ruby + R.eval("scaled").to_ruby.each_with_index do |value,row| + scaled_data_entries[row][col] = value + end + end + scaled_dataset = ScaledDataset.new(attributes) + scaled_dataset["_id"] = BSON::ObjectId.new + scaled_dataset["_type"] = "OpenTox::ScaledDataset" + scaled_dataset.centers = centers + scaled_dataset.scales = scales + scaled_dataset.data_entries = scaled_data_entries + scaled_dataset.save_all + scaled_dataset + end + end + + # Dataset for lazar predictions + class LazarPrediction < Dataset + field :creator, type: String + field :prediction_feature_id, type: String + + def prediction_feature + Feature.find prediction_feature_id + end + + end + + # Dataset for descriptors (physchem) + class DescriptorDataset < Dataset + field :feature_calculation_algorithm, type: String + + end + + class ScaledDataset < DescriptorDataset + + field :centers, type: Array, default: [] + field :scales, type: Array, default: [] + + def original_value value, i + value * scales[i] + centers[i] + end + end + + # Dataset for fminer descriptors + class FminerDataset < DescriptorDataset + field :training_algorithm, type: String + field :training_dataset_id, type: BSON::ObjectId + field :training_feature_id, type: BSON::ObjectId + field :training_parameters, type: Hash + end + +end diff --git a/lib/descriptor.rb b/lib/descriptor.rb new file mode 100644 index 0000000..9733bde --- /dev/null +++ b/lib/descriptor.rb @@ -0,0 +1,248 @@ +require 'digest/md5' +ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk" +# TODO 
store descriptors in mongodb + +module OpenTox + + module Algorithm + + # Class for descriptor calculations + class Descriptor + include OpenTox + + JAVA_DIR = File.join(File.dirname(__FILE__),"..","java") + CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last + JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar") + LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar") + JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar") + + obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"] + OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| + name,description = d.split(/\s+/,2) + ["Openbabel."+name,description] unless obexclude.include? name + end.compact.sort{|a,b| a[0] <=> b[0]}] + + cdk_desc = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`) + CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| ["Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}] + CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''); d[:names].collect{ |name| prefix+"."+name } }.flatten + + # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug) + joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"] + # strip Joelib messages from stdout + JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d| + name = d[:java_class].sub(/^joelib2.feature.types./,'') + # impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java + ["Joelib."+name, "no description available"] unless joelibexclude.include? 
name + end.compact.sort{|a,b| a[0] <=> b[0]}] + + DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS)) + DESCRIPTOR_VALUES = OBDESCRIPTORS.keys + CDKDESCRIPTOR_VALUES + JOELIBDESCRIPTORS.keys + + require_relative "unique_descriptors.rb" + + # Description of available descriptors + def self.description descriptor + lib = descriptor.split('.').first + case lib + when "Openbabel" + OBDESCRIPTORS[descriptor] + when "Cdk" + name = descriptor.split('.')[0..-2].join('.') + CDKDESCRIPTORS[name] + when "Joelib" + JOELIBDESCRIPTORS[descriptor] + when "lookup" + "Read feature values from a dataset" + end + end + + # Match an array of smarts features + def self.smarts_match compounds, smarts_features, count=false + bad_request_error "Compounds for smarts_match are empty" unless compounds + bad_request_error "Smarts features for smarts_match are empty" unless smarts_features + parse compounds + @count = count + obconversion = OpenBabel::OBConversion.new + obmol = OpenBabel::OBMol.new + obconversion.set_in_format('smi') + smarts_pattern = OpenBabel::OBSmartsPattern.new + smarts_features = [smarts_features] if smarts_features.is_a?(Feature) + @smarts = smarts_features.collect{|f| f.smarts} + @physchem_descriptors = nil + @data_entries = Array.new(@compounds.size){Array.new(@smarts.size,false)} + @compounds.each_with_index do |compound,c| + obconversion.read_string(obmol,compound.smiles) + @smarts.each_with_index do |smart,s| + smarts_pattern.init(smart) + if smarts_pattern.match(obmol) + count ? value = smarts_pattern.get_map_list.to_a.size : value = 1 + else + value = 0 + end + @data_entries[c][s] = value + end + end + serialize + end + + # Count matches of an array with smarts features + def self.smarts_count compounds, smarts + # TODO: non-overlapping matches? 
+ smarts_match compounds,smarts,true + end + + # Calculate physchem descriptors + # @param [OpenTox::Compound,Array,OpenTox::Dataset] input object, either a compound, an array of compounds or a dataset + def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS + parse compounds + @data_entries = Array.new(@compounds.size){[]} + @descriptors = descriptors + @smarts = nil + @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features + des = {} + @descriptors.each do |d| + lib, descriptor = d.split(".",2) + lib = lib.downcase.to_sym + des[lib] ||= [] + des[lib] << descriptor + end + des.each do |lib,descriptors| + p lib, descriptors + send(lib, descriptors) + end + serialize + end + + def self.openbabel descriptors + $logger.debug "compute #{descriptors.size} openbabel descriptors for #{@compounds.size} compounds" + obdescriptors = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d} + obmol = OpenBabel::OBMol.new + obconversion = OpenBabel::OBConversion.new + obconversion.set_in_format 'smi' + last_feature_idx = @physchem_descriptors.size + @compounds.each_with_index do |compound,c| + obconversion.read_string obmol, compound.smiles + obdescriptors.each_with_index do |descriptor,d| + @data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol)) + end + end + @physchem_descriptors += descriptors.collect{|d| "Openbabel.#{d}"} + end + + def self.java_descriptors descriptors, lib + $logger.debug "compute #{descriptors.size} cdk descriptors for #{@compounds.size} compounds" + sdf = sdf_3d + # use java system call (rjb blocks within tasks) + # use Tempfiles to avoid "Argument list too long" error + case lib + when "cdk" + run_cmd "java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}" + when "joelib" + run_cmd "java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}" + end + last_feature_idx = 
@physchem_descriptors.size + YAML.load_file("#{sdf}#{lib}.yaml").each_with_index do |calculation,i| + # TODO create warnings + #$logger.error "Descriptor calculation failed for compound #{@compounds[i].inchi}." if calculation.empty? + # CDK Descriptors may calculate multiple values, they are stored in separate features + @physchem_descriptors += calculation.keys if i == 0 + calculation.keys.each_with_index do |name,j| + @data_entries[i][j+last_feature_idx] = fix_value(calculation[name]) + end + end + FileUtils.rm "#{sdf}#{lib}.yaml" + end + + def self.cdk descriptors + java_descriptors descriptors, "cdk" + end + + def self.joelib descriptors + java_descriptors descriptors, "joelib" + end + + def self.lookup compounds, features, dataset + parse compounds + fingerprint = [] + compounds.each do |compound| + fingerprint << [] + features.each do |feature| + end + end + end + + def self.run_cmd cmd + cmd = "#{cmd} 2>&1" + $logger.debug "running external cmd: '#{cmd}'" + p = IO.popen(cmd) do |io| + while line = io.gets + $logger.debug "> #{line.chomp}" + end + io.close + raise "external cmd failed '#{cmd}' (see log file for error msg)" unless $?.to_i == 0 + end + end + + def self.sdf_3d + # TODO check if 3d sdfs are stored in GridFS + sdf = "" + @compounds.each do |compound| + sdf << compound.sdf + end + sdf_file = "/tmp/#{SecureRandom.uuid}.sdf" + File.open(sdf_file,"w+"){|f| f.print sdf} + sdf_file + end + + def self.parse compounds + @input_class = compounds.class.to_s + case @input_class + when "OpenTox::Compound" + @compounds = [compounds] + when "Array" + @compounds = compounds + when "OpenTox::Dataset" + @compounds = compounds.compounds + else + bad_request_error "Cannot calculate descriptors for #{compounds.class} objects." 
+ end + end + + def self.serialize + @data_entries.collect!{|de| de.collect{|v| v.round(5) unless v.nil?}} + case @input_class + when "OpenTox::Compound" + @data_entries.first + when "Array" + @data_entries + when "OpenTox::Dataset" + dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect{|c| c.id}) + if @smarts + dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id} + @count ? algo = "count" : algo = "match" + dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}" + + elsif @physchem_descriptors + dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id} + dataset.data_entries = @data_entries + dataset.feature_calculation_algorithm = "#{self}.physchem" + #TODO params? + end + dataset.save_all + dataset + end + end + + def self.fix_value val + val = val.first if val.is_a? Array and val.size == 1 + val = nil if val == "NaN" + if val.numeric? + val = Float(val) + val = nil if val.nan? or val.infinite? 
+ end + val + end + private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize + end + end +end diff --git a/lib/error.rb b/lib/error.rb new file mode 100644 index 0000000..39b3c76 --- /dev/null +++ b/lib/error.rb @@ -0,0 +1,66 @@ +module OpenToxError + attr_accessor :http_code, :message, :cause + def initialize message=nil + message = message.to_s.gsub(/\A"|"\Z/, '') if message # remove quotes + super message + @http_code ||= 500 + @message = message.to_s + @cause = cut_backtrace(caller) + $logger.error("\n"+JSON.pretty_generate({ + :http_code => @http_code, + :message => @message, + :cause => @cause + })) + end + + def cut_backtrace(trace) + if trace.is_a?(Array) + cut_index = trace.find_index{|line| line.match(/sinatra|minitest/)} + cut_index ||= trace.size + cut_index -= 1 + cut_index = trace.size-1 if cut_index < 0 + trace[0..cut_index] + else + trace + end + end + +end + +class RuntimeError + include OpenToxError +end + +# clutters log file with library errors +#class NoMethodError + #include OpenToxError +#end + +module OpenTox + + class Error < RuntimeError + include OpenToxError + + def initialize(code, message=nil) + @http_code = code + super message + end + end + + # OpenTox errors + RestClientWrapper.known_errors.each do |error| + # create error classes + c = Class.new Error do + define_method :initialize do |message=nil| + super error[:code], message + end + end + OpenTox.const_set error[:class],c + + # define global methods for raising errors, eg. 
bad_request_error + Object.send(:define_method, error[:method]) do |message| + raise c.new(message) + end + end + +end diff --git a/lib/experiment.rb b/lib/experiment.rb new file mode 100644 index 0000000..0dfdf86 --- /dev/null +++ b/lib/experiment.rb @@ -0,0 +1,99 @@ +module OpenTox + + class Experiment + field :dataset_ids, type: Array + field :model_settings, type: Array, default: [] + field :results, type: Hash, default: {} + + def run + dataset_ids.each do |dataset_id| + dataset = Dataset.find(dataset_id) + results[dataset_id.to_s] = [] + model_settings.each do |setting| + setting = setting.dup + model_algorithm = setting.delete :model_algorithm #if setting[:model_algorithm] + model = Object.const_get(model_algorithm).create dataset, setting + $logger.debug model + model.save + repeated_crossvalidation = RepeatedCrossValidation.create model + results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id} + end + end + save + end + + def report + # statistical significances http://www.r-bloggers.com/anova-and-tukeys-test-on-r/ + report = {} + report[:name] = name + report[:experiment_id] = self.id.to_s + report[:results] = {} + parameters = [] + dataset_ids.each do |dataset_id| + dataset_name = Dataset.find(dataset_id).name + report[:results][dataset_name] = {} + report[:results][dataset_name][:anova] = {} + report[:results][dataset_name][:data] = [] + # TODO results[dataset_id.to_s] does not exist + results[dataset_id.to_s].each do |result| + model = Model::Lazar.find(result[:model_id]) + repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id]) + crossvalidations = repeated_cv.crossvalidations + if crossvalidations.first.is_a? ClassificationCrossValidation + parameters = [:accuracy,:true_rate,:predictivity] + elsif crossvalidations.first.is_a? 
RegressionCrossValidation + parameters = [:rmse,:mae,:r_squared] + end + summary = {} + [:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key| + summary[key] = model[key] + end + summary[:nr_instances] = crossvalidations.first.nr_instances + summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted} + summary[:time] = crossvalidations.collect{|cv| cv.time} + parameters.each do |param| + summary[param] = crossvalidations.collect{|cv| cv.send(param)} + end + report[:results][dataset_name][:data] << summary + end + end + report[:results].each do |dataset,results| + ([:time,:nr_unpredicted]+parameters).each do |param| + experiments = [] + outcome = [] + results[:data].each_with_index do |result,i| + result[param].each do |p| + experiments << i + p = nil if p.kind_of? Float and p.infinite? # TODO fix @ division by 0 + outcome << p + end + end + begin + R.assign "experiment_nr",experiments.collect{|i| "Experiment #{i}"} + R.eval "experiment_nr = factor(experiment_nr)" + R.assign "outcome", outcome + R.eval "data = data.frame(experiment_nr,outcome)" + # one-way ANOVA + R.eval "fit = aov(outcome ~ experiment_nr, data=data,na.action='na.omit')" + # http://stackoverflow.com/questions/3366506/extract-p-value-from-aov + p_value = R.eval("summary(fit)[[1]][['Pr(>F)']][[1]]").to_ruby + # aequivalent + # sum = R.eval("summary(fit)") + #p_value = sum.to_ruby.first.last.first + rescue + p_value = nil + end + report[:results][dataset][:anova][param] = p_value +=begin +=end + end + end + report + end + + def summary + report[:results].collect{|dataset,data| {dataset => data[:anova].select{|param,p_val| p_val < 0.1}}} + end + end + +end diff --git a/lib/feature.rb b/lib/feature.rb new file mode 100644 index 0000000..13fa6d1 --- /dev/null +++ b/lib/feature.rb @@ -0,0 +1,95 @@ +module OpenTox + + # Basic feature class + class Feature + field :nominal, type: Boolean + field :numeric, type: Boolean + field :measured, type: Boolean + end 
+ + # Feature for categorical variables + class NominalFeature < Feature + # TODO check if accept_values are still needed + field :accept_values, type: Array + def initialize params + super params + nominal = true + end + end + + # Feature for quantitative variables + class NumericFeature < Feature + def initialize params + super params + numeric = true + end + end + + # Feature for SMARTS fragments + class Smarts < NominalFeature + field :smarts, type: String + index "smarts" => 1 + def self.from_smarts smarts + self.find_or_create_by :smarts => smarts + end + end + + # Feature for supervised fragments from Fminer algorithm + class FminerSmarts < Smarts + field :p_value, type: Float + # TODO check if effect is used + field :effect, type: String + field :dataset_id + end + + # Feature for database fingerprints + # needs count for efficient retrieval (see compound.rb) + class FingerprintSmarts < Smarts + field :count, type: Integer + def self.fingerprint +=begin + @@fp4 ||= OpenTox::FingerprintSmarts.all + unless @@fp4.size == 306 + @@fp4 = [] + # OpenBabel FP4 fingerprints + # OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html + # TODO investigate other types of fingerprints (MACCS) + # OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html + # http://www.dalkescientific.com/writings/diary/archive/2008/06/26/fingerprint_background.html + # OpenBabel MNA http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html#multilevel-neighborhoods-of-atoms-mna + # Morgan ECFP, FCFP + # http://cdk.github.io/cdk/1.5/docs/api/org/openscience/cdk/fingerprint/CircularFingerprinter.html + # http://www.rdkit.org/docs/GettingStartedInPython.html + # Chemfp + # https://chemfp.readthedocs.org/en/latest/using-tools.html + # CACTVS/PubChem + + File.open(File.join(File.dirname(__FILE__),"SMARTS_InteLigand.txt")).each do |l| + l.strip! + unless l.empty? 
or l.match /^#/ + name,smarts = l.split(': ') + @@fp4 << OpenTox::FingerprintSmarts.find_or_create_by(:name => name, :smarts => smarts) unless smarts.nil? + end + end + end + @@fp4 +=end + end + end + + # Feature for physico-chemical descriptors + class PhysChemDescriptor < NumericFeature + field :algorithm, type: String, default: "OpenTox::Algorithm::Descriptor.physchem" + field :parameters, type: Hash + field :creator, type: String + end + + # Feature for categorical bioassay results + class NominalBioAssay < NominalFeature + end + + # Feature for quantitative bioassay results + class NumericBioAssay < NumericFeature + end + +end diff --git a/lib/lazar.rb b/lib/lazar.rb new file mode 100644 index 0000000..f801062 --- /dev/null +++ b/lib/lazar.rb @@ -0,0 +1,72 @@ +require 'rubygems' +require "bundler/setup" +require "rest-client" +require 'yaml' +require 'json' +require 'logger' +require 'mongoid' +require 'rserve' +require "nokogiri" +require "base64" + +# Mongo setup +# TODO retrieve correct environment from Rack/Sinatra +ENV["MONGOID_ENV"] ||= "development" +# TODO remove config files, change default via ENV or directly in Mongoid class +Mongoid.load!("#{File.expand_path(File.join(File.dirname(__FILE__),'..','mongoid.yml'))}") +Mongoid.raise_not_found_error = false # return nil if no document is found +$mongo = Mongo::Client.new('mongodb://127.0.0.1:27017/opentox') +#$mongo = Mongoid.default_client +$gridfs = $mongo.database.fs + +# R setup +R = Rserve::Connection.new +R.eval "library(ggplot2)" +R.eval "library(grid)" +R.eval "library(gridExtra)" + +# Logger setup +STDOUT.sync = true # for redirection, etc see http://stackoverflow.com/questions/8549443/why-doesnt-logger-output-to-stdout-get-redirected-to-files +$logger = Logger.new STDOUT # STDERR did not work on my development machine (CH) +$logger.level = Logger::DEBUG +Mongo::Logger.level = Logger::WARN +#Mongo::Logger.logger = $logger + +# Require sub-Repositories +require_relative 
'../libfminer/libbbrc/bbrc' # include before openbabel +require_relative '../libfminer/liblast/last' # +require_relative '../last-utils/lu.rb' +require_relative '../openbabel/lib/openbabel' + +# Fminer environment variables +ENV['FMINER_SMARTS'] = 'true' +ENV['FMINER_NO_AROMATIC'] = 'true' +ENV['FMINER_PVALUES'] = 'true' +ENV['FMINER_SILENT'] = 'true' +ENV['FMINER_NR_HITS'] = 'true' + +# OpenTox classes and includes +CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules + +[ # be aware of the require sequence as it affects class/method overwrites + "overwrite.rb", + "rest-client-wrapper.rb", + "error.rb", + "opentox.rb", + "feature.rb", + "compound.rb", + "dataset.rb", + "descriptor.rb", + "algorithm.rb", + "descriptor.rb", + "bbrc.rb", + "model.rb", + "similarity.rb", + #"neighbor.rb", + "classification.rb", + "regression.rb", + "validation.rb", + "crossvalidation.rb", + "experiment.rb", +].each{ |f| require_relative f } + diff --git a/lib/model.rb b/lib/model.rb new file mode 100644 index 0000000..98433d0 --- /dev/null +++ b/lib/model.rb @@ -0,0 +1,265 @@ +module OpenTox + + module Model + + class Model + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + store_in collection: "models" + + field :name, type: String + field :creator, type: String, default: __FILE__ + # datasets + field :training_dataset_id, type: BSON::ObjectId + # algorithms + field :prediction_algorithm, type: String + # prediction feature + field :prediction_feature_id, type: BSON::ObjectId + + def training_dataset + Dataset.find(training_dataset_id) + end + end + + class Lazar < Model + + # algorithms + field :neighbor_algorithm, type: String + field :neighbor_algorithm_parameters, type: Hash, default: {} + + # Create a lazar model from a training_dataset and a feature_dataset + # @param [OpenTox::Dataset] training_dataset + # @return [OpenTox::Model::Lazar] Regression or 
classification model + def initialize training_dataset, params={} + + super params + bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 + + # TODO document convention + prediction_feature = training_dataset.features.first + # set defaults for empty parameters + self.prediction_feature_id ||= prediction_feature.id + self.training_dataset_id ||= training_dataset.id + self.name ||= "#{training_dataset.name} #{prediction_feature.name}" + self.neighbor_algorithm_parameters ||= {} + self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id + save + self + end + + def predict object, use_database_values=true + + t = Time.now + at = Time.now + + training_dataset = Dataset.find training_dataset_id + prediction_feature = Feature.find prediction_feature_id + + # parse data + compounds = [] + case object.class.to_s + when "OpenTox::Compound" + compounds = [object] + when "Array" + compounds = object + when "OpenTox::Dataset" + compounds = object.compounds + else + bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter." + end + + # make predictions + predictions = [] + neighbors = [] + compounds.each_with_index do |compound,c| + t = Time.new + database_activities = training_dataset.values(compound,prediction_feature) + if use_database_values and database_activities and !database_activities.empty? + database_activities = database_activities.first if database_activities.size == 1 + predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."} + next + end + neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) + + # add activities + # TODO: improve efficiency, takes 3 times longer than previous version + neighbors.collect! 
do |n| + rows = training_dataset.compound_ids.each_index.select{|i| training_dataset.compound_ids[i] == n.first} + acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact + acts.empty? ? nil : n << acts + end + neighbors.compact! # remove neighbors without training activities + predictions << Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_size => training_dataset.data_entries.size}) +=begin +# TODO scaled dataset for physchem + p neighbor_algorithm_parameters + p (neighbor_algorithm_parameters["feature_dataset_id"]) + d = Dataset.find(neighbor_algorithm_parameters["feature_dataset_id"]) + p d + p d.class + if neighbor_algorithm_parameters["feature_dataset_id"] and Dataset.find(neighbor_algorithm_parameters["feature_dataset_id"]).kind_of? ScaledDataset + p "SCALED" + end +=end + end + + # serialize result + case object.class.to_s + when "OpenTox::Compound" + prediction = predictions.first + prediction[:neighbors] = neighbors.sort{|a,b| b[1] <=> a[1]} # sort according to similarity + return prediction + when "Array" + return predictions + when "OpenTox::Dataset" + # prepare prediction dataset + prediction_dataset = LazarPrediction.new( + :name => "Lazar prediction for #{prediction_feature.name}", + :creator => __FILE__, + :prediction_feature_id => prediction_feature.id + + ) + confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Prediction confidence" ) + # TODO move into warnings field + warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings") + prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ] + prediction_dataset.compounds = compounds + prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence], p[:warning]]} + prediction_dataset.save_all + return prediction_dataset + end + + end + + def training_activities + i = training_dataset.feature_ids.index prediction_feature_id + 
training_dataset.data_entries.collect{|de| de[i]} + end + + end + + class LazarClassification < Lazar + + def self.create training_dataset, params={} + model = self.new training_dataset, params + model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm + model.neighbor_algorithm ||= "fingerprint_neighbors" + model.neighbor_algorithm_parameters ||= {} + { + :type => "MP2D", + :training_dataset_id => training_dataset.id, + :min_sim => 0.1 + }.each do |key,value| + model.neighbor_algorithm_parameters[key] ||= value + end + model.save + model + end + end + + class LazarRegression < Lazar + + def self.create training_dataset, params={} + model = self.new training_dataset, params + model.neighbor_algorithm ||= "fingerprint_neighbors" + model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.weighted_average" + model.neighbor_algorithm_parameters ||= {} + { + :type => "MP2D", + :training_dataset_id => training_dataset.id, + :min_sim => 0.1 + #:type => "FP4", + #:training_dataset_id => training_dataset.id, + #:min_sim => 0.7 + }.each do |key,value| + model.neighbor_algorithm_parameters[key] ||= value + end + model.save + model + end + end + + class LazarFminerClassification < LazarClassification + field :feature_calculation_parameters, type: Hash + + def self.create training_dataset, fminer_params={} + model = super(training_dataset) + model.update "_type" => self.to_s # adjust class + model = self.find model.id # adjust class + model.neighbor_algorithm = "fminer_neighbors" + model.neighbor_algorithm_parameters = { + :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match", + :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset,fminer_params).id, + :min_sim => 0.3 + } + model.feature_calculation_parameters = fminer_params + model.save + model + end + end + + class Prediction + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + + # TODO cv -> 
repeated cv + # TODO field Validations + field :endpoint, type: String + field :species, type: String + field :source, type: String + field :unit, type: String + field :model_id, type: BSON::ObjectId + field :repeated_crossvalidation_id, type: BSON::ObjectId + + def predict object + Lazar.find(model_id).predict object + end + + def training_dataset + model.training_dataset + end + + def model + Lazar.find model_id + end + + def repeated_crossvalidation + RepeatedCrossValidation.find repeated_crossvalidation_id + end + + def crossvalidations + repeated_crossvalidation.crossvalidations + end + + def regression? + training_dataset.features.first.numeric? + end + + def classification? + training_dataset.features.first.nominal? + end + + def self.from_csv_file file + metadata_file = file.sub(/csv$/,"json") + bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file + prediction_model = self.new JSON.parse(File.read(metadata_file)) + training_dataset = Dataset.from_csv_file file + model = nil + if training_dataset.features.first.nominal? + #model = LazarFminerClassification.create training_dataset + model = LazarClassification.create training_dataset + elsif training_dataset.features.first.numeric? 
+ model = LazarRegression.create training_dataset + end + prediction_model[:model_id] = model.id + prediction_model[:repeated_crossvalidation_id] = RepeatedCrossValidation.create(model).id + prediction_model.save + prediction_model + end + end + + end + +end diff --git a/lib/opentox.rb b/lib/opentox.rb new file mode 100644 index 0000000..186c87a --- /dev/null +++ b/lib/opentox.rb @@ -0,0 +1,22 @@ +module OpenTox + + # Ruby interface + + # create default OpenTox classes (defined in opentox-client.rb) + # provides Mongoid's query and persistence methods + # http://mongoid.org/en/mongoid/docs/persistence.html + # http://mongoid.org/en/mongoid/docs/querying.html + CLASSES.each do |klass| + c = Class.new do + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + store_in collection: klass.downcase.pluralize + field :name, type: String + field :warnings, type: Array, default: [] + end + OpenTox.const_set klass,c + end + +end + diff --git a/lib/overwrite.rb b/lib/overwrite.rb new file mode 100644 index 0000000..c92ad2b --- /dev/null +++ b/lib/overwrite.rb @@ -0,0 +1,148 @@ +require "base64" +class Object + # An object is blank if it's false, empty, or a whitespace string. + # For example, "", " ", +nil+, [], and {} are all blank. + def blank? + respond_to?(:empty?) ? empty? : !self + end + + def numeric? + true if Float(self) rescue false + end + + # Returns dimension of nested arrays + def dimension + self.class == Array ? 
1 + self[0].dimension : 0 + end +end + +class Numeric + def percent_of(n) + self.to_f / n.to_f * 100.0 + end +end + +module Enumerable + # @return [Array] only the duplicates of an enumerable + def duplicates + inject({}) {|h,v| h[v]=h[v].to_i+1; h}.reject{|k,v| v==1}.keys + end + # http://stackoverflow.com/questions/2562256/find-most-common-string-in-an-array + Enumerable.class_eval do + def mode + group_by do |e| + e + end.values.max_by(&:size).first + end + end +end + +class String + # @return [String] converts camel-case to underscore-case (OpenTox::SuperModel -> open_tox/super_model) + def underscore + self.gsub(/::/, '/'). + gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2'). + gsub(/([a-z\d])([A-Z])/,'\1_\2'). + tr("-", "_"). + downcase + end + + # convert strings to boolean values + # @return [TrueClass,FalseClass] true or false + def to_boolean + return true if self == true || self =~ (/(true|t|yes|y|1)$/i) + return false if self == false || self.nil? || self =~ (/(false|f|no|n|0)$/i) + bad_request_error "invalid value for Boolean: \"#{self}\"" + end + +end + +class File + # @return [String] mime_type including charset using linux cmd command + def mime_type + `file -ib '#{self.path}'`.chomp + end +end + +class Array + + # Sum up the size of single arrays in an array of arrays + # @param [Array] Array of arrays + # @return [Integer] Sum of size of array elements + def sum_size + self.inject(0) { |s,a| + if a.respond_to?('size') + s+=a.size + else + internal_server_error "No size available: #{a.inspect}" + end + } + end + + # For symbolic features + # @param [Array] Array to test. + # @return [Boolean] Whether the array has just one unique value. + def zero_variance? 
+ return self.uniq.size == 1 + end + + def median + sorted = self.sort + len = sorted.length + (sorted[(len - 1) / 2] + sorted[len / 2]) / 2.0 + end + + def mean + self.inject{ |sum, el| sum + el }.to_f / self.size + end + + def sample_variance + m = self.mean + sum = self.inject(0){|accum, i| accum +(i-m)**2 } + sum/(self.length - 1).to_f + end + + def standard_deviation + Math.sqrt(self.sample_variance) + end + +end + +module URI + + def self.ssl? uri + URI.parse(uri).instance_of? URI::HTTPS + end + + # @return [Boolean] checks if resource exists by making a HEAD-request + def self.accessible?(uri) + parsed_uri = URI.parse(uri + (OpenTox::RestClientWrapper.subjectid ? "?subjectid=#{CGI.escape OpenTox::RestClientWrapper.subjectid}" : "")) + http_code = URI.task?(uri) ? 600 : 400 + http = Net::HTTP.new(parsed_uri.host, parsed_uri.port) + unless (URI.ssl? uri) == true + http = Net::HTTP.new(parsed_uri.host, parsed_uri.port) + request = Net::HTTP::Head.new(parsed_uri.request_uri) + http.request(request).code.to_i < http_code + else + http = Net::HTTP.new(parsed_uri.host, parsed_uri.port) + http.use_ssl = true + http.verify_mode = OpenSSL::SSL::VERIFY_NONE + request = Net::HTTP::Head.new(parsed_uri.request_uri) + http.request(request).code.to_i < http_code + end + rescue + false + end + + def self.valid? uri + u = URI.parse(uri) + u.scheme!=nil and u.host!=nil + rescue URI::InvalidURIError + false + end + + def self.task? uri + uri =~ /task/ and URI.valid? 
uri + end + +end diff --git a/lib/regression.rb b/lib/regression.rb new file mode 100644 index 0000000..868c25f --- /dev/null +++ b/lib/regression.rb @@ -0,0 +1,262 @@ +# TODO install R packages kernlab, caret, doMC, class, e1071 + + + # log transform activities (create new dataset) + # scale, normalize features, might not be necessary + # http://stats.stackexchange.com/questions/19216/variables-are-often-adjusted-e-g-standardised-before-making-a-model-when-is + # http://stats.stackexchange.com/questions/7112/when-and-how-to-use-standardized-explanatory-variables-in-linear-regression + # zero-order correlation and the semi-partial correlation + # seems to be necessary for svm + # http://stats.stackexchange.com/questions/77876/why-would-scaling-features-decrease-svm-performance?lq=1 + # http://stackoverflow.com/questions/15436367/svm-scaling-input-values + # use lasso or elastic net?? + # select relevant features + # remove features with a single value + # remove correlated features + # remove features not correlated with endpoint +module OpenTox + module Algorithm + + class Regression + + def self.weighted_average compound, params + weighted_sum = 0.0 + sim_sum = 0.0 + confidence = 0.0 + neighbors = params[:neighbors] + activities = [] + neighbors.each do |row| + n,sim,acts = row + confidence = sim if sim > confidence # distance to nearest neighbor + # TODO add LOO errors + acts.each do |act| + weighted_sum += sim*Math.log10(act) + activities << act + sim_sum += sim + end + end + #R.assign "activities", activities + #R.eval "cv = cv(activities)" + #confidence /= activities.standard_deviation#/activities.mean + #confidence = sim_sum*neighbors.size.to_f/params[:training_dataset_size] + #confidence = sim_sum/neighbors.size.to_f + #confidence = neighbors.size.to_f + confidence = 0 if confidence.nan? + sim_sum == 0 ? 
prediction = nil : prediction = 10**(weighted_sum/sim_sum) + {:value => prediction,:confidence => confidence} + end + + def self.local_linear_regression compound, neighbors + p neighbors.size + return nil unless neighbors.size > 0 + features = neighbors.collect{|n| Compound.find(n.first).fp4}.flatten.uniq + p features + training_data = Array.new(neighbors.size){Array.new(features.size,0)} + neighbors.each_with_index do |n,i| + #p n.first + neighbor = Compound.find n.first + features.each_with_index do |f,j| + training_data[i][j] = 1 if neighbor.fp4.include? f + end + end + p training_data + + R.assign "activities", neighbors.collect{|n| n[2].median} + R.assign "features", training_data + R.eval "model <- lm(activities ~ features)" + R.eval "summary <- summary(model)" + p R.summary + compound_features = features.collect{|f| compound.fp4.include? f ? 1 : 0} + R.assign "compound_features", compound_features + R.eval "prediction <- predict(model,compound_features)" + p R.prediction + + end + + def self.weighted_average_with_relevant_fingerprints neighbors + weighted_sum = 0.0 + sim_sum = 0.0 + fingerprint_features = [] + neighbors.each do |row| + n,sim,acts = row + neighbor = Compound.find n + fingerprint_features += neighbor.fp4 + end + fingerprint_features.uniq! + p fingerprint_features +=begin + p n + acts.each do |act| + weighted_sum += sim*Math.log10(act) + sim_sum += sim + end + end +=end + confidence = sim_sum/neighbors.size.to_f + sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) + {:value => prediction,:confidence => confidence} + end + + # Local support vector regression from neighbors + # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required + # @return [Numeric] A prediction value. + def self.local_svm_regression neighbors, params={:min_train_performance => 0.1} + + confidence = 0.0 + prediction = nil + + $logger.debug "Local SVM." 
+ props = neighbors.collect{|row| row[3] } + neighbors.shift + activities = neighbors.collect{|n| n[2]} + prediction = self.local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting + prediction = nil if (!prediction.nil? && prediction.infinite?) + $logger.debug "Prediction: '#{prediction}' ('#{prediction.class}')." + if prediction + confidence = get_confidence({:sims => neighbors.collect{|n| n[1]}, :activities => activities}) + else + confidence = nil if prediction.nil? + end + [prediction, confidence] + + end + + + # Local support vector prediction from neighbors. + # Uses propositionalized setting. + # Not to be called directly (use local_svm_regression or local_svm_classification). + # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ] + # @param [Array] activities, activities for neighbors. + # @param [Float] min_train_performance, parameter to control censoring + # @return [Numeric] A prediction value. + def self.local_svm_prop(props, activities, min_train_performance) + + $logger.debug "Local SVM (Propositionalization / Kernlab Kernel)." + n_prop = props[1..-1] # is a matrix, i.e. two nested Arrays. + q_prop = props[0] # is an Array. 
+ + prediction = nil + if activities.uniq.size == 1 + prediction = activities[0] + else + t = Time.now + #$logger.debug gram_matrix.to_yaml + #@r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests + @r = Rserve::Connection.new#(true,false) # global R instance leads to Socket errors after a large number of requests + rs = [] + ["caret", "doMC", "class"].each do |lib| + #raise "failed to load R-package #{lib}" unless @r.void_eval "suppressPackageStartupMessages(library('#{lib}'))" + rs << "suppressPackageStartupMessages(library('#{lib}'))" + end + #@r.eval "registerDoMC()" # switch on parallel processing + rs << "registerDoMC()" # switch on parallel processing + #@r.eval "set.seed(1)" + rs << "set.seed(1)" + $logger.debug "Loading R packages: #{Time.now-t}" + t = Time.now + p n_prop + begin + + # set data + rs << "n_prop <- c(#{n_prop.flatten.join(',')})" + rs << "n_prop <- c(#{n_prop.flatten.join(',')})" + rs << "n_prop_x_size <- c(#{n_prop.size})" + rs << "n_prop_y_size <- c(#{n_prop[0].size})" + rs << "y <- c(#{activities.join(',')})" + rs << "q_prop <- c(#{q_prop.join(',')})" + rs << "y = matrix(y)" + rs << "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)" + rs << "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)" + + $logger.debug "Setting R data: #{Time.now-t}" + t = Time.now + # prepare data + rs << " + weights=NULL + if (!(class(y) == 'numeric')) { + y = factor(y) + weights=unlist(as.list(prop.table(table(y)))) + weights=(weights-1)^2 + } + " + + rs << " + rem = nearZeroVar(prop_matrix) + if (length(rem) > 0) { + prop_matrix = prop_matrix[,-rem,drop=F] + q_prop = q_prop[,-rem,drop=F] + } + rem = findCorrelation(cor(prop_matrix)) + if (length(rem) > 0) { + prop_matrix = prop_matrix[,-rem,drop=F] + q_prop = q_prop[,-rem,drop=F] + } + " + + #p @r.eval("y").to_ruby + #p "weights" + #p @r.eval("weights").to_ruby + $logger.debug "Preparing R data: #{Time.now-t}" + t = Time.now + # model + 
support vectors
        # NOTE(review): R commands are accumulated in `rs` and executed via an
        # external `Rscript` call below; the commented-out @r.eval/void_eval
        # lines are remains of the previous in-process Rserve approach.
        #train_success = @r.eval <<-EOR
        rs << '
          model = train(prop_matrix,y,
                        method="svmRadial",
                        preProcess=c("center", "scale"),
                        class.weights=weights,
                        trControl=trainControl(method="LGOCV",number=10),
                        tuneLength=8
                       )
          perf = ifelse ( class(y)!="numeric", max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
        '
        # Debug aids: dump the accumulated R script to /tmp and run it out of process.
        File.open("/tmp/r.r","w+"){|f| f.puts rs.join("\n")}
        p rs.join("\n")
        p `Rscript /tmp/r.r`
=begin
        @r.void_eval <<-EOR
          model = train(prop_matrix,y,
                        method="svmRadial",
                        #preProcess=c("center", "scale"),
                        #class.weights=weights,
                        #trControl=trainControl(method="LGOCV",number=10),
                        #tuneLength=8
                       )
          perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
        EOR
=end

        $logger.debug "Creating R SVM model: #{Time.now-t}"
        t = Time.now
        # FIXME(review): `train_success` is never assigned in this version (its
        # assignment above is commented out), so this line raises NameError,
        # which is swallowed by the `rescue Exception` clause below — the
        # prediction therefore always stays nil on this path.
        if train_success
          # prediction
          @r.eval "predict(model,q_prop); p = predict(model,q_prop)" # kernlab bug: predict twice
          #@r.eval "p = predict(model,q_prop)" # kernlab bug: predict twice
          @r.eval "if (class(y)!='numeric') p = as.character(p)"
          prediction = @r.p

          # censoring: discard the prediction when training performance is NaN
          # or below the caller-supplied min_train_performance threshold
          prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance.to_f )
          prediction = nil if prediction =~ /NA/
          $logger.debug "Performance: '#{sprintf("%.2f", @r.perf)}'"
        else
          $logger.debug "Model creation failed."
+ prediction = nil + end + $logger.debug "R Prediction: #{Time.now-t}" + rescue Exception => e + $logger.debug "#{e.class}: #{e.message}" + $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + ensure + #puts @r.inspect + #TODO: broken pipe + #@r.quit # free R + end + end + prediction + end + end + + end +end + diff --git a/lib/rest-client-wrapper.rb b/lib/rest-client-wrapper.rb new file mode 100644 index 0000000..de1b74f --- /dev/null +++ b/lib/rest-client-wrapper.rb @@ -0,0 +1,98 @@ +module OpenTox + + class RestClientWrapper + + attr_accessor :request, :response + + @@subjectid = nil + + def self.subjectid=(subjectid) + @@subjectid = subjectid + end + + def self.subjectid + @@subjectid + end + + # REST methods + # Raises OpenTox::Error if call fails (rescued in overwrite.rb -> halt 502) + # Does not wait for task to finish and returns task uri + # @param [String] destination URI + # @param [optional,Hash|String] Payload data posted to the service + # @param [optional,Hash] Headers with params like :accept, :content_type, :subjectid, :verify_ssl + # @return [RestClient::Response] REST call response + [:head,:get,:post,:put,:delete].each do |method| + + define_singleton_method method do |uri,payload={},headers={},waiting_task=nil| + + # check input + bad_request_error "Headers are not a hash: #{headers.inspect}", uri unless headers==nil or headers.is_a?(Hash) + headers[:subjectid] ||= @@subjectid + bad_request_error "Invalid URI: '#{uri}'", uri unless URI.valid? uri + #resource_not_found_error "URI '#{uri}' not found.", uri unless URI.accessible?(uri, @subjectid) unless URI.ssl?(uri) + # make sure that no header parameters are set in the payload + [:accept,:content_type,:subjectid].each do |header| + if defined? 
$aa || URI(uri).host == URI($aa[:uri]).host + else + bad_request_error "#{header} should be submitted in the headers", uri if payload and payload.is_a?(Hash) and payload[header] + end + end + + # create request + args={} + args[:method] = method + args[:url] = uri + args[:verify_ssl] = 0 if headers[:verify_ssl].nil? || headers[:verify_ssl].empty? + args[:timeout] = 1800 + args[:payload] = payload + headers.each{ |k,v| headers.delete(k) if v==nil } if headers #remove keys with empty values, as this can cause problems + args[:headers] = headers + + $logger.debug "post to #{uri} with params #{payload.inspect.to_s[0..1000]}" if method.to_s=="post" + + @request = RestClient::Request.new(args) + # ignore error codes from Task services (may return error codes >= 400 according to API, which causes exceptions in RestClient and RDF::Reader) + @response = @request.execute do |response, request, result| + if [301, 302, 307].include? response.code and request.method == :get + response.follow_redirection(request, result) + elsif response.code >= 400 and !URI.task?(uri) + #TODO add parameters to error-report + #parameters = request.args + #parameters[:headers][:subjectid] = "REMOVED" if parameters[:headers] and parameters[:headers][:subjectid] + #parameters[:url] = parameters[:url].gsub(/(http|https|)\:\/\/[a-zA-Z0-9\-]+\:[a-zA-Z0-9]+\@/, "REMOVED@") if parameters[:url] + #message += "\nREST parameters:\n#{parameters.inspect}" + error = known_errors.collect{|e| e if e[:code] == response.code}.compact.first + begin # errors are returned as error reports in json, try to parse + # TODO: may be the reason for failure of task.rb -n test_11_wait_for_error_task + content = JSON.parse(response) + msg = content["message"].to_s + cause = content["errorCause"].to_s + raise if msg.size==0 && cause.size==0 # parsing failed + rescue # parsing error failed, use complete content as message + msg = "Could not parse error response from rest call '#{method}' to '#{uri}':\n#{response}" + cause = nil 
          end
          # Dispatch to the matching OpenTox error helper built by known_errors
          # (e.g. bad_request_error) with the parsed message and cause.
          Object.method(error[:method]).call msg, uri, cause # call error method
        else
          response
        end
      end
    end
  end

  #@return [Array] of hashes with error code, method and class
  # Builds the table of known HTTP error statuses (code >= 400) from
  # RestClient::STATUSES; each entry maps the numeric code to the OpenTox
  # error helper method name (snake_case, "_error" suffix) and the
  # CamelCase error class name derived from the status phrase.
  def self.known_errors
    errors = []
    RestClient::STATUSES.each do |code,k|
      if code >= 400
        method = k.underscore.gsub(/ |'/,'_')
        method += "_error" unless method.match(/_error$/)
        klass = method.split("_").collect{|s| s.capitalize}.join("")
        errors << {:code => code, :method => method.to_sym, :class => klass}
      end
    end
    errors
  end

  end
end
diff --git a/lib/similarity.rb b/lib/similarity.rb
new file mode 100644
index 0000000..91e18db
--- /dev/null
+++ b/lib/similarity.rb
@@ -0,0 +1,58 @@
=begin
* Name: similarity.rb
* Description: Similarity algorithms
* Author: Andreas Maunz <andreas@maunz.de>
* Date: 10/2012
=end

module OpenTox
  module Algorithm

    class Similarity

      #TODO weighted tanimoto

      # Tanimoto similarity
      # Computed as sum(min(a_i,b_i)) / sum(max(a_i,b_i)), i.e. the
      # generalized (real-valued) Tanimoto/Jaccard coefficient, which
      # reduces to the classical Tanimoto index for 0/1 fingerprints.
      # @param [Array] a fingerprints of first compound
      # @param [Array] b fingerprints of second compound
      # @return [Float] Tanimoto similarity
      def self.tanimoto(a,b)
        bad_request_error "fingerprints #{a} and #{b} don't have equal size" unless a.size == b.size
        #common = 0.0
        #a.each_with_index do |n,i|
        #common += 1 if n == b[i]
        #end
        #common/a.size
        # TODO check if calculation speed can be improved
        common_p_sum = 0.0
        all_p_sum = 0.0
        (0...a.size).each { |idx|
          common_p_sum += [ a[idx], b[idx] ].min
          all_p_sum += [ a[idx], b[idx] ].max
        }
        common_p_sum/all_p_sum
      end


      # Cosine similarity
      # NOTE(review): when both vectors are longer than 12 elements, only the
      # first 12 components are compared — presumably a fixed descriptor
      # subset; confirm against callers before relying on full-length input.
      # @param [Array] a fingerprints of first compound
      # @param [Array] b fingerprints of second compound
      # @return [Float] Cosine similarity, the cosine of angle enclosed between vectors a and b
      def self.cosine(a, b)
        val = 0.0
        if a.size>0 and b.size>0
          if a.size>12 && b.size>12
            a = a[0..11]
            b = b[0..11]
          end
          # to_gv: conversion to a GSL vector — assumed to be provided by an
          # Array extension elsewhere in the project (TODO confirm)
          a_vec = a.to_gv
          b_vec = b.to_gv
          val = a_vec.dot(b_vec) / (a_vec.norm * b_vec.norm)
        end
        val
      end
+ end + + end +end diff --git a/lib/unique_descriptors.rb b/lib/unique_descriptors.rb new file mode 100644 index 0000000..cf9cbf3 --- /dev/null +++ b/lib/unique_descriptors.rb @@ -0,0 +1,120 @@ +# set of non redundant descriptors, faster algorithms are preferred +# TODO: +# select logP algorithm +# select l5 algorithm +# use smarts matcher for atom counts +# check correlations +UNIQUEDESCRIPTORS = [ + "Openbabel.abonds", #Number of aromatic bonds + "Openbabel.atoms", #Number of atoms + "Openbabel.bonds", #Number of bonds + "Openbabel.dbonds", #Number of double bonds + "Openbabel.HBA1", #Number of Hydrogen Bond Acceptors 1 (JoelLib) + "Openbabel.HBA2", #Number of Hydrogen Bond Acceptors 2 (JoelLib) + "Openbabel.HBD", #Number of Hydrogen Bond Donors (JoelLib) + #"Openbabel.L5", #Lipinski Rule of Five# TODO Openbabel.L5 returns nil, investigate!!! + "Openbabel.logP", #octanol/water partition coefficient + "Openbabel.MP", #Melting point + "Openbabel.MR", #molar refractivity + "Openbabel.MW", #Molecular Weight filter + "Openbabel.nF", #Number of Fluorine Atoms + "Openbabel.sbonds", #Number of single bonds + "Openbabel.tbonds", #Number of triple bonds + "Openbabel.TPSA", #topological polar surface area + "Cdk.ALOGP", #Calculates atom additive logP and molar refractivity values as described by Ghose and Crippen and + "Cdk.APol", #Descriptor that calculates the sum of the atomic polarizabilities (including implicit hydrogens). + "Cdk.AcidicGroupCount", #Returns the number of acidic groups. + "Cdk.AminoAcidCount", #Returns the number of amino acids found in the system + #"Cdk.AromaticAtomsCount", #Descriptor based on the number of aromatic atoms of a molecule. + #"Cdk.AromaticBondsCount", #Descriptor based on the number of aromatic bonds of a molecule. + #"Cdk.AtomCount", #Descriptor based on the number of atoms of a certain element type. 
+ "Cdk.AutocorrelationCharge", #The Moreau-Broto autocorrelation descriptors using partial charges + "Cdk.AutocorrelationMass", #The Moreau-Broto autocorrelation descriptors using atomic weight + "Cdk.AutocorrelationPolarizability", #The Moreau-Broto autocorrelation descriptors using polarizability + "Cdk.BCUT", #Eigenvalue based descriptor noted for its utility in chemical diversity described by Pearlman et al. . + "Cdk.BPol", #Descriptor that calculates the sum of the absolute value of the difference between atomic polarizabilities of all bonded atoms in the molecule (including implicit hydrogens). + "Cdk.BasicGroupCount", #Returns the number of basic groups. + #"Cdk.BondCount", #Descriptor based on the number of bonds of a certain bond order. + "Cdk.CPSA", #A variety of descriptors combining surface area and partial charge information + "Cdk.CarbonTypes", #Characterizes the carbon connectivity in terms of hybridization + "Cdk.ChiChain", #Evaluates the Kier & Hall Chi chain indices of orders 3,4,5 and 6 + "Cdk.ChiCluster", #Evaluates the Kier & Hall Chi cluster indices of orders 3,4,5,6 and 7 + "Cdk.ChiPathCluster", #Evaluates the Kier & Hall Chi path cluster indices of orders 4,5 and 6 + "Cdk.ChiPath", #Evaluates the Kier & Hall Chi path indices of orders 0,1,2,3,4,5,6 and 7 + "Cdk.EccentricConnectivityIndex", #A topological descriptor combining distance and adjacency information. + "Cdk.FMF", #Descriptor characterizing molecular complexity in terms of its Murcko framework + "Cdk.FragmentComplexity", #Class that returns the complexity of a system. The complexity is defined as @cdk.cite{Nilakantan06} + "Cdk.GravitationalIndex", #Descriptor characterizing the mass distribution of the molecule. + #"Cdk.HBondAcceptorCount", #Descriptor that calculates the number of hydrogen bond acceptors. + #"Cdk.HBondDonorCount", #Descriptor that calculates the number of hydrogen bond donors. 
+ "Cdk.HybridizationRatio", #Characterizes molecular complexity in terms of carbon hybridization states. + "Cdk.IPMolecularLearning", #Descriptor that evaluates the ionization potential. + "Cdk.KappaShapeIndices", #Descriptor that calculates Kier and Hall kappa molecular shape indices. + "Cdk.KierHallSmarts", #Counts the number of occurrences of the E-state fragments + "Cdk.LargestChain", #Returns the number of atoms in the largest chain + "Cdk.LargestPiSystem", #Returns the number of atoms in the largest pi chain + "Cdk.LengthOverBreadth", #Calculates the ratio of length to breadth. + "Cdk.LongestAliphaticChain", #Returns the number of atoms in the longest aliphatic chain + "Cdk.MDE", #Evaluate molecular distance edge descriptors for C, N and O + #"Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms . + "Cdk.MomentOfInertia", #Descriptor that calculates the principal moments of inertia and ratios of the principal moments. Als calculates the radius of gyration. + "Cdk.PetitjeanNumber", #Descriptor that calculates the Petitjean Number of a molecule. + "Cdk.PetitjeanShapeIndex", #The topological and geometric shape indices described Petitjean and Bath et al. respectively. Both measure the anisotropy in a molecule. + "Cdk.RotatableBondsCount", #Descriptor that calculates the number of nonrotatable bonds on a molecule. + #"Cdk.RuleOfFive", #This Class contains a method that returns the number failures of the Lipinski's Rule Of Five. + #"Cdk.TPSA", #Calculation of topological polar surface area based on fragment contributions . + "Cdk.VABC", #Describes the volume of a molecule. + "Cdk.VAdjMa", #Descriptor that calculates the vertex adjacency information of a molecule. + "Cdk.WHIM", #Holistic descriptors described by Todeschini et al . + #"Cdk.Weight", #Descriptor based on the weight of atoms of a certain element type. 
If no element is specified, the returned value is the Molecular Weight + "Cdk.WeightedPath", #The weighted path (molecular ID) descriptors described by Randic. They characterize molecular branching. + "Cdk.WienerNumbers", #This class calculates Wiener path number and Wiener polarity number. + "Cdk.XLogP", #Prediction of logP based on the atom-type method called XLogP. + "Cdk.ZagrebIndex", #The sum of the squared atom degrees of all heavy atoms. + "Joelib.count.NumberOfS", #no description available + "Joelib.count.NumberOfP", #no description available + "Joelib.count.NumberOfO", #no description available + "Joelib.count.NumberOfN", #no description available + #"Joelib.count.AromaticBonds", #no description available + "Joelib.count.NumberOfI", #no description available + "Joelib.count.NumberOfF", #no description available + "Joelib.count.NumberOfC", #no description available + "Joelib.count.NumberOfB", #no description available + "Joelib.count.HydrophobicGroups", #no description available + #"Joelib.KierShape3", #no description available + #"Joelib.KierShape2", #no description available + #"Joelib.KierShape1", #no description available + #"Joelib.count.AcidicGroups", #no description available + "Joelib.count.AliphaticOHGroups", #no description available + #"Joelib.count.NumberOfAtoms", #no description available + "Joelib.TopologicalRadius", #no description available + "Joelib.GeometricalShapeCoefficient", #no description available + #"Joelib.MolecularWeight", #no description available + "Joelib.FractionRotatableBonds", #no description available + #"Joelib.count.HBD2", #no description available + #"Joelib.count.HBD1", #no description available + "Joelib.LogP", #no description available + "Joelib.GraphShapeCoefficient", #no description available + "Joelib.count.BasicGroups", #no description available + #"Joelib.count.RotatableBonds", #no description available + "Joelib.count.HeavyBonds", #no description available + "Joelib.PolarSurfaceArea", #no description available + 
  #"Joelib.ZagrebIndex1", #no description available
  "Joelib.GeometricalRadius", #no description available
  "Joelib.count.SO2Groups", #no description available
  "Joelib.count.AromaticOHGroups", #no description available
  "Joelib.GeometricalDiameter", #no description available
  #"Joelib.MolarRefractivity", #no description available
  "Joelib.count.NumberOfCl", #no description available
  "Joelib.count.OSOGroups", #no description available
  "Joelib.count.NumberOfBr", #no description available
  "Joelib.count.NO2Groups", #no description available
  "Joelib.count.HeteroCycles", #no description available
  #"Joelib.count.HBA2", #no description available
  #"Joelib.count.HBA1", #no description available
  #"Joelib.count.NumberOfBonds", #no description available
  "Joelib.count.SOGroups", #no description available
  "Joelib.TopologicalDiameter", #no description available
  "Joelib.count.NumberOfHal", #no description available

].sort
diff --git a/lib/validation.rb b/lib/validation.rb
new file mode 100644
index 0000000..c52ffc0
--- /dev/null
+++ b/lib/validation.rb
@@ -0,0 +1,68 @@
module OpenTox

  # Result of validating a model against an independent test dataset.
  # NOTE(review): uses Mongoid-style `field` macros — the persistence mixin is
  # presumably provided by a base class/inclusion not visible here; confirm.
  class Validation

    field :model_id, type: BSON::ObjectId
    field :prediction_dataset_id, type: BSON::ObjectId
    field :crossvalidation_id, type: BSON::ObjectId
    field :test_dataset_id, type: BSON::ObjectId
    field :nr_instances, type: Integer
    field :nr_unpredicted, type: Integer
    field :predictions, type: Array

    # @return [OpenTox::Dataset] dataset holding the model's predictions
    def prediction_dataset
      Dataset.find prediction_dataset_id
    end

    # @return [OpenTox::Dataset] test dataset (with measured activities)
    def test_dataset
      Dataset.find test_dataset_id
    end

    # @return [OpenTox::Model::Lazar] the validated model
    def model
      Model::Lazar.find model_id
    end

    # Clones `model`, trains the clone on `training_set`, predicts `test_set`
    # (with activities stripped) and stores one
    # [compound_id, measured_activity, prediction, confidence] row per
    # successfully predicted compound.
    # @param model [Model::Lazar] template model (its attributes are copied, not modified)
    # @param training_set [Dataset] training data for the validation model
    # @param test_set [Dataset] test data with measured activities
    # @param crossvalidation [CrossValidation, nil] optional parent crossvalidation
    # @return [Validation] the saved validation record
    def self.create model, training_set, test_set, crossvalidation=nil

      atts = model.attributes.dup # do not modify attributes from original model
      atts["_id"] = BSON::ObjectId.new
      atts[:training_dataset_id] = training_set.id
      validation_model = model.class.create training_set, atts
      validation_model.save
      test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
      prediction_dataset = validation_model.predict test_set_without_activities
      predictions = []
      nr_unpredicted = 0
      activities = test_set.data_entries.collect{|de| de.first}
      prediction_dataset.data_entries.each_with_index do |de,i|
        # de[0]: prediction value, de[1]: confidence — assumed layout of
        # prediction data_entries; TODO confirm against Model#predict
        if de[0] and de[1] and de[1].numeric?
          activity = activities[i]
          prediction = de.first
          confidence = de[1] # NOTE(review): unused local — the row below reads de[1] directly
          predictions << [prediction_dataset.compound_ids[i], activity, prediction, de[1]]
        else
          # missing prediction or non-numeric confidence counts as unpredicted
          nr_unpredicted += 1
        end
      end
      validation = self.new(
        :model_id => validation_model.id,
        :prediction_dataset_id => prediction_dataset.id,
        :test_dataset_id => test_set.id,
        :nr_instances => test_set.compound_ids.size,
        :nr_unpredicted => nr_unpredicted,
        :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence
      )
      validation.crossvalidation_id = crossvalidation.id if crossvalidation
      validation.save
      validation
    end

  end

  class ClassificationValidation < Validation
  end

  class RegressionValidation < Validation
  end

end