diff options
author | Christoph Helma <helma@in-silico.ch> | 2015-10-08 10:43:43 +0200 |
---|---|---|
committer | Christoph Helma <helma@in-silico.ch> | 2015-10-08 10:43:43 +0200 |
commit | 1a56148aadef031c4f487bc23fda43f4ac5b7369 (patch) | |
tree | 3555c5883ed0c292b105c40c185ebba3e5bd4e3e /lib | |
parent | 394d564699756288569169ff3e198d6d7702f092 (diff) | |
parent | e3217075b602a950a0ee995fcfa731d97b5ba3eb (diff) |
new master branch
Diffstat (limited to 'lib')
-rw-r--r-- | lib/SMARTS_InteLigand.txt | 983 | ||||
-rw-r--r-- | lib/algorithm.rb | 21 | ||||
-rw-r--r-- | lib/bbrc.rb | 165 | ||||
-rw-r--r-- | lib/classification.rb | 110 | ||||
-rw-r--r-- | lib/compound.rb | 370 | ||||
-rw-r--r-- | lib/crossvalidation.rb | 302 | ||||
-rw-r--r-- | lib/dataset.rb | 350 | ||||
-rw-r--r-- | lib/descriptor.rb | 248 | ||||
-rw-r--r-- | lib/error.rb | 66 | ||||
-rw-r--r-- | lib/experiment.rb | 99 | ||||
-rw-r--r-- | lib/feature.rb | 95 | ||||
-rw-r--r-- | lib/lazar.rb | 72 | ||||
-rw-r--r-- | lib/model.rb | 265 | ||||
-rw-r--r-- | lib/opentox.rb | 22 | ||||
-rw-r--r-- | lib/overwrite.rb | 148 | ||||
-rw-r--r-- | lib/regression.rb | 262 | ||||
-rw-r--r-- | lib/rest-client-wrapper.rb | 98 | ||||
-rw-r--r-- | lib/similarity.rb | 58 | ||||
-rw-r--r-- | lib/unique_descriptors.rb | 120 | ||||
-rw-r--r-- | lib/validation.rb | 68 |
20 files changed, 3922 insertions, 0 deletions
diff --git a/lib/SMARTS_InteLigand.txt b/lib/SMARTS_InteLigand.txt new file mode 100644 index 0000000..23bc6e2 --- /dev/null +++ b/lib/SMARTS_InteLigand.txt @@ -0,0 +1,983 @@ +# +# SMARTS Patterns for Functional Group Classification +# +# written by Christian Laggner +# Copyright 2005 Inte:Ligand Software-Entwicklungs und Consulting GmbH +# +# Released under the Lesser General Public License (LGPL license) +# see http://www.gnu.org/copyleft/lesser.html +# Modified from Version 221105 +##################################################################################################### + +# General Stuff: +# These patters were written in an attempt to represent the classification of organic compounds +# from the viewpoint of an organic chemist. +# They are often very restrictive. This may be generally a good thing, but it also takes some time +# for filtering/indexing large compound sets. +# For filtering undesired groups (in druglike compounds) one will want to have more general patterns +# (e.g. you don't want *any* halide of *any* acid, *neither* aldehyde *nor* formyl esters and amides, ...). +# + +# Part I: Carbon +# ============== + + +# I.1: Carbon-Carbon Bonds +# ------------------------ + +# I.1.1 Alkanes: + +Primary_carbon: [CX4H3][#6] + +Secondary_carbon: [CX4H2]([#6])[#6] + +Tertiary_carbon: [CX4H1]([#6])([#6])[#6] + +Quaternary_carbon: [CX4]([#6])([#6])([#6])[#6] + + +# I.1.2 C-C double and Triple Bonds + +Alkene: [CX3;$([H2]),$([H1][#6]),$(C([#6])[#6])]=[CX3;$([H2]),$([H1][#6]),$(C([#6])[#6])] +# sp2 C may be substituted only by C or H - +# does not hit ketenes and allenes, nor enamines, enols and the like + +Alkyne: [CX2]#[CX2] +# non-carbon substituents (e.g. 
alkynol ethers) are rather rare, thus no further discrimination + +Allene: [CX3]=[CX2]=[CX3] + + +# I.2: One Carbon-Hetero Bond +# --------------------------- + + +# I.2.1 Alkyl Halogenides + +Alkylchloride: [ClX1][CX4] +# will also hit chloromethylethers and the like, but no chloroalkenes, -alkynes or -aromats +# a more restrictive version can be obtained by modifying the Alcohol string. + +Alkylfluoride: [FX1][CX4] + +Alkylbromide: [BrX1][CX4] + +Alkyliodide: [IX1][CX4] + + +# I.2.2 Alcohols and Ethers + +Alcohol: [OX2H][CX4;!$(C([OX2H])[O,S,#7,#15])] +# nonspecific definition, no acetals, aminals, and the like + +Primary_alcohol: [OX2H][CX4H2;!$(C([OX2H])[O,S,#7,#15])] + +Secondary_alcohol: [OX2H][CX4H;!$(C([OX2H])[O,S,#7,#15])] + +Tertiary_alcohol: [OX2H][CX4D4;!$(C([OX2H])[O,S,#7,#15])] + +Dialkylether: [OX2]([CX4;!$(C([OX2])[O,S,#7,#15,F,Cl,Br,I])])[CX4;!$(C([OX2])[O,S,#7,#15])] +# no acetals and the like; no enolethers + +Dialkylthioether: [SX2]([CX4;!$(C([OX2])[O,S,#7,#15,F,Cl,Br,I])])[CX4;!$(C([OX2])[O,S,#7,#15])] +# no acetals and the like; no enolethers + +Alkylarylether: [OX2](c)[CX4;!$(C([OX2])[O,S,#7,#15,F,Cl,Br,I])] +# no acetals and the like; no enolethers + +Diarylether: [c][OX2][c] + +Alkylarylthioether: [SX2](c)[CX4;!$(C([OX2])[O,S,#7,#15,F,Cl,Br,I])] + +Diarylthioether: [c][SX2][c] + +Oxonium: [O+;!$([O]~[!#6]);!$([S]*~[#7,#8,#15,#16])] +# can't be aromatic, thus O and not #8 + +# I.2.3 Amines + +Amine: [NX3+0,NX4+;!$([N]~[!#6]);!$([N]*~[#7,#8,#15,#16])] +# hits all amines (prim/sec/tert/quart), including ammonium salts, also enamines, but not amides, imides, aminals, ... 
+ +# the following amines include also the protonated forms + +Primary_aliph_amine: [NX3H2+0,NX4H3+;!$([N][!C]);!$([N]*~[#7,#8,#15,#16])] + +Secondary_aliph_amine: [NX3H1+0,NX4H2+;!$([N][!C]);!$([N]*~[#7,#8,#15,#16])] + +Tertiary_aliph_amine: [NX3H0+0,NX4H1+;!$([N][!C]);!$([N]*~[#7,#8,#15,#16])] + +Quaternary_aliph_ammonium: [NX4H0+;!$([N][!C]);!$([N]*~[#7,#8,#15,#16])] + +Primary_arom_amine: [NX3H2+0,NX4H3+]c + +Secondary_arom_amine: [NX3H1+0,NX4H2+;!$([N][!c]);!$([N]*~[#7,#8,#15,#16])] + +Tertiary_arom_amine: [NX3H0+0,NX4H1+;!$([N][!c]);!$([N]*~[#7,#8,#15,#16])] + +Quaternary_arom_ammonium: [NX4H0+;!$([N][!c]);!$([N]*~[#7,#8,#15,#16])] + +Secondary_mixed_amine: [NX3H1+0,NX4H2+;$([N]([c])[C]);!$([N]*~[#7,#8,#15,#16])] + +Tertiary_mixed_amine: [NX3H0+0,NX4H1+;$([N]([c])([C])[#6]);!$([N]*~[#7,#8,#15,#16])] + +Quaternary_mixed_ammonium: [NX4H0+;$([N]([c])([C])[#6][#6]);!$([N]*~[#7,#8,#15,#16])] + +Ammonium: [N+;!$([N]~[!#6]);!$(N=*);!$([N]*~[#7,#8,#15,#16])] +# only C and H substituents allowed. Quaternary or protonated amines +# NX4+ or Nv4+ is not recognized by Daylight's depictmatch if less than four C are present + + +# I.2.4 Others + +Alkylthiol: [SX2H][CX4;!$(C([SX2H])~[O,S,#7,#15])] + +Dialkylthioether: [SX2]([CX4;!$(C([SX2])[O,S,#7,#15,F,Cl,Br,I])])[CX4;!$(C([SX2])[O,S,#7,#15])] + +Alkylarylthioether: [SX2](c)[CX4;!$(C([SX2])[O,S,#7,#15])] + +Disulfide: [SX2D2][SX2D2] + +1,2-Aminoalcohol: [OX2H][CX4;!$(C([OX2H])[O,S,#7,#15,F,Cl,Br,I])][CX4;!$(C([N])[O,S,#7,#15])][NX3;!$(NC=[O,S,N])] +# does not hit alpha-amino acids, enaminoalcohols, 1,2-aminoacetals, o-aminophenols, etc. + +1,2-Diol: [OX2H][CX4;!$(C([OX2H])[O,S,#7,#15])][CX4;!$(C([OX2H])[O,S,#7,#15])][OX2H] +# does not hit alpha-hydroxy acids, enolalcohols, 1,2-hydroxyacetals, 1,2-diphenols, etc. 
+ +1,1-Diol: [OX2H][CX4;!$(C([OX2H])([OX2H])[O,S,#7,#15])][OX2H] + +Hydroperoxide: [OX2H][OX2] +#does not neccessarily have to be connected to a carbon atom, includes also hydrotrioxides + +Peroxo: [OX2D2][OX2D2] + +Organolithium_compounds: [LiX1][#6,#14] + +Organomagnesium_compounds: [MgX2][#6,#14] +# not restricted to Grignard compounds, also dialkyl Mg + +Organometallic_compounds: [!#1;!#5;!#6;!#7;!#8;!#9;!#14;!#15;!#16;!#17;!#33;!#34;!#35;!#52;!#53;!#85]~[#6;!-] +# very general, includes all metals covalently bound to carbon + + +# I.3: Two Carbon-Hetero Bonds (Carbonyl and Derivatives) +# ---------------------------- + +# I.3.1 Double Bond to Hetero + +Aldehyde: [$([CX3H][#6]),$([CX3H2])]=[OX1] +# hits aldehydes including formaldehyde + +Ketone: [#6][CX3](=[OX1])[#6] +# does not include oxo-groups connected to a (hetero-) aromatic ring + +Thioaldehyde: [$([CX3H][#6]),$([CX3H2])]=[SX1] + +Thioketone: [#6][CX3](=[SX1])[#6] +# does not include thioxo-groups connected to a (hetero-) aromatic ring + +Imine: [NX2;$([N][#6]),$([NH]);!$([N][CX3]=[#7,#8,#15,#16])]=[CX3;$([CH2]),$([CH][#6]),$([C]([#6])[#6])] +# nitrogen is not part of an amidelike strukture, nor of an aromatic ring, but can be part of an aminal or similar + +Immonium: [NX3+;!$([N][!#6]);!$([N][CX3]=[#7,#8,#15,#16])] + +Oxime: [NX2](=[CX3;$([CH2]),$([CH][#6]),$([C]([#6])[#6])])[OX2H] + +Oximether: [NX2](=[CX3;$([CH2]),$([CH][#6]),$([C]([#6])[#6])])[OX2][#6;!$(C=[#7,#8])] +# ether, not ester or amide; does not hit isoxazole + + +# I.3.2. Two Single Bonds to Hetero + +Acetal: [OX2]([#6;!$(C=[O,S,N])])[CX4;!$(C(O)(O)[!#6])][OX2][#6;!$(C=[O,S,N])] +# does not hit hydroxy-methylesters, ketenacetals, hemiacetals, orthoesters, etc. + +Hemiacetal: [OX2H][CX4;!$(C(O)(O)[!#6])][OX2][#6;!$(C=[O,S,N])] + +Aminal: [NX3v3;!$(NC=[#7,#8,#15,#16])]([#6])[CX4;!$(C(N)(N)[!#6])][NX3v3;!$(NC=[#7,#8,#15,#16])][#6] +# Ns are not part of an amide or similar. 
v3 ist to exclude nitro and similar groups + +Hemiaminal: [NX3v3;!$(NC=[#7,#8,#15,#16])]([#6])[CX4;!$(C(N)(N)[!#6])][OX2H] + +Thioacetal: [SX2]([#6;!$(C=[O,S,N])])[CX4;!$(C(S)(S)[!#6])][SX2][#6;!$(C=[O,S,N])] + +Thiohemiacetal: [SX2]([#6;!$(C=[O,S,N])])[CX4;!$(C(S)(S)[!#6])][OX2H] + +Halogen_acetal_like: [NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][FX1,ClX1,BrX1,IX1] +# hits chloromethylenethers and other reactive alkylating agents + +Acetal_like: [NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][FX1,ClX1,BrX1,IX1,NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])] +# includes all of the above and other combinations (S-C-N, hydrates, ...), but still no aminomethylenesters and similar + +Halogenmethylen_ester_and_similar: [NX3v3,SX2,OX2;$(**=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][FX1,ClX1,BrX1,IX1] +# also reactive alkylating agents. Acid does not have to be carboxylic acid, also S- and P-based acids allowed + +NOS_methylen_ester_and_similar: [NX3v3,SX2,OX2;$(**=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])] +# Same as above, but N,O or S instead of halogen. Ester/amide allowed only on one side + +Hetero_methylen_ester_and_similar: [NX3v3,SX2,OX2;$(**=[#7,#8,#15,#16])][CX4;!$(C([N,S,O])([N,S,O])[!#6])][FX1,ClX1,BrX1,IX1,NX3v3,SX2,OX2;!$(*C=[#7,#8,#15,#16])] +# Combination of the last two patterns + +Cyanhydrine: [NX1]#[CX2][CX4;$([CH2]),$([CH]([CX2])[#6]),$(C([CX2])([#6])[#6])][OX2H] + + +# I.3.3 Single Bond to Hetero, C=C Double Bond (Enols and Similar) + +Chloroalkene: [ClX1][CX3]=[CX3] + +Fluoroalkene: [FX1][CX3]=[CX3] + +Bromoalkene: [BrX1][CX3]=[CX3] + +Iodoalkene: [IX1][CX3]=[CX3] + +Enol: [OX2H][CX3;$([H1]),$(C[#6])]=[CX3] +# no phenols + +Endiol: [OX2H][CX3;$([H1]),$(C[#6])]=[CX3;$([H1]),$(C[#6])][OX2H] +# no 1,2-diphenols, ketenacetals, ... 
+ +Enolether: [OX2]([#6;!$(C=[N,O,S])])[CX3;$([H0][#6]),$([H1])]=[CX3] +# finds also endiodiethers, but not enolesters, no aromats + +Enolester: [OX2]([CX3]=[OX1])[#6X3;$([#6][#6]),$([H1])]=[#6X3;!$(C[OX2H])] + + +Enamine: [NX3;$([NH2][CX3]),$([NH1]([CX3])[#6]),$([N]([CX3])([#6])[#6]);!$([N]*=[#7,#8,#15,#16])][CX3;$([CH]),$([C][#6])]=[CX3] +# does not hit amines attached to aromatic rings, nor may the nitrogen be aromatic + +Thioenol: [SX2H][CX3;$([H1]),$(C[#6])]=[CX3] + +Thioenolether: [SX2]([#6;!$(C=[N,O,S])])[CX3;$(C[#6]),$([CH])]=[CX3] + + +# I.4: Three Carbon-Hetero Bonds (Carboxyl and Derivatives) +# ------------------------------ + +Acylchloride: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[ClX1] + +Acylfluoride: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[FX1] + +Acylbromide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[BrX1] + +Acyliodide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[IX1] + +Acylhalide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[FX1,ClX1,BrX1,IX1] +# all of the above + + +# The following contains all simple carboxylic combinations of O, N, S, & Hal - +# - acids, esters, amides, ... as well as a few extra cases (anhydride, hydrazide...) +# Cyclic structures (including aromats) like lactones, lactames, ... got their own +# definitions. Structures where both heteroatoms are part of an aromatic ring +# (oxazoles, imidazoles, ...) were excluded. + +Carboxylic_acid: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[$([OX2H]),$([OX1-])] +# includes carboxylate anions + +Carboxylic_ester: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[OX2][#6;!$(C=[O,N,S])] +# does not hit anhydrides or lactones + +Lactone: [#6][#6X3R](=[OX1])[#8X2][#6;!$(C=[O,N,S])] +# may also be aromatic + +Carboxylic_anhydride: [CX3;$([H0][#6]),$([H1])](=[OX1])[#8X2][CX3;$([H0][#6]),$([H1])](=[OX1]) +# anhydride formed by two carboxylic acids, no mixed anhydrides (e.g. 
between carboxylic acid and sulfuric acid); may be part of a ring, even aromatic + +Carboxylic_acid_derivative: [$([#6X3H0][#6]),$([#6X3H])](=[!#6])[!#6] +# includes most of the structures of I.4 and many more, also 1,3-heteroaromatics such as isoxazole + +Carbothioic_acid: [CX3;!R;$([C][#6]),$([CH]);$([C](=[OX1])[$([SX2H]),$([SX1-])]),$([C](=[SX1])[$([OX2H]),$([OX1-])])] +# hits both tautomeric forms, as well as anions + +Carbothioic_S_ester: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[SX2][#6;!$(C=[O,N,S])] + +Carbothioic_S_lactone: [#6][#6X3R](=[OX1])[#16X2][#6;!$(C=[O,N,S])] +# may also be aromatic + +Carbothioic_O_ester: [CX3;$([H0][#6]),$([H1])](=[SX1])[OX2][#6;!$(C=[O,N,S])] + +Carbothioic_O_lactone: [#6][#6X3R](=[SX1])[#8X2][#6;!$(C=[O,N,S])] + +Carbothioic_halide: [CX3;$([H0][#6]),$([H1])](=[SX1])[FX1,ClX1,BrX1,IX1] + +Carbodithioic_acid: [CX3;!R;$([C][#6]),$([CH]);$([C](=[SX1])[SX2H])] + +Carbodithioic_ester: [CX3;!R;$([C][#6]),$([CH]);$([C](=[SX1])[SX2][#6;!$(C=[O,N,S])])] + +Carbodithiolactone: [#6][#6X3R](=[SX1])[#16X2][#6;!$(C=[O,N,S])] + + +Amide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] +# does not hit lactames + +Primary_amide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[NX3H2] + +Secondary_amide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H1][#6;!$(C=[O,N,S])] + +Tertiary_amide: [CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H0]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])] + +Lactam: [#6R][#6X3R](=[OX1])[#7X3;$([H1][#6;!$(C=[O,N,S])]),$([H0]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] +# cyclic amides, may also be aromatic + +Alkyl_imide: [#6X3;$([H0][#6]),$([H1])](=[OX1])[#7X3H0]([#6])[#6X3;$([H0][#6]),$([H1])](=[OX1]) +# may be part of a ring, even aromatic. only C allowed at central N. 
May also be triacyl amide + +N_hetero_imide: [#6X3;$([H0][#6]),$([H1])](=[OX1])[#7X3H0]([!#6])[#6X3;$([H0][#6]),$([H1])](=[OX1]) +# everything else than H or C at central N + +Imide_acidic: [#6X3;$([H0][#6]),$([H1])](=[OX1])[#7X3H1][#6X3;$([H0][#6]),$([H1])](=[OX1]) +# can be deprotonated + +Thioamide: [$([CX3;!R][#6]),$([CX3H;!R])](=[SX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] +# does not hit thiolactames + +Thiolactam: [#6R][#6X3R](=[SX1])[#7X3;$([H1][#6;!$(C=[O,N,S])]),$([H0]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] +# cyclic thioamides, may also be aromatic + + +Oximester: [#6X3;$([H0][#6]),$([H1])](=[OX1])[#8X2][#7X2]=,:[#6X3;$([H0]([#6])[#6]),$([H1][#6]),$([H2])] +# may also be part of a ring / aromatic + +Amidine: [NX3;!$(NC=[O,S])][CX3;$([CH]),$([C][#6])]=[NX2;!$(NC=[O,S])] +# only basic amidines, not as part of aromatic ring (e.g. imidazole) + +Hydroxamic_acid: [CX3;$([H0][#6]),$([H1])](=[OX1])[#7X3;$([H1]),$([H0][#6;!$(C=[O,N,S])])][$([OX2H]),$([OX1-])] + +Hydroxamic_acid_ester: [CX3;$([H0][#6]),$([H1])](=[OX1])[#7X3;$([H1]),$([H0][#6;!$(C=[O,N,S])])][OX2][#6;!$(C=[O,N,S])] +# does not hit anhydrides of carboxylic acids withs hydroxamic acids + + +Imidoacid: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[$([OX2H]),$([OX1-])] +# not cyclic + +Imidoacid_cyclic: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[$([OX2H]),$([OX1-])] +# the enamide-form of lactames. may be aromatic like 2-hydroxypyridine + +Imidoester: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[OX2][#6;!$(C=[O,N,S])] +# esters of the above structures. no anhydrides. 
+ +Imidolactone: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[OX2][#6;!$(C=[O,N,S])] +# no oxazoles and similar + +Imidothioacid: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[$([SX2H]),$([SX1-])] +# not cyclic + +Imidothioacid_cyclic: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[$([SX2H]),$([SX1-])] +# the enamide-form of thiolactames. may be aromatic like 2-thiopyridine + +Imidothioester: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[SX2][#6;!$(C=[O,N,S])] +# thioesters of the above structures. no anhydrides. + +Imidothiolactone: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[SX2][#6;!$(C=[O,N,S])] +# no thioxazoles and similar + +Amidine: [#7X3v3;!$(N([#6X3]=[#7X2])C=[O,S])][CX3R0;$([H1]),$([H0][#6])]=[NX2v3;!$(N(=[#6X3][#7X3])C=[O,S])] +# only basic amidines, not substituted by carbonyl or thiocarbonyl, not as part of a ring + +Imidolactam: [#6][#6X3R;$([H0](=[NX2;!$(N(=[#6X3][#7X3])C=[O,S])])[#7X3;!$(N([#6X3]=[#7X2])C=[O,S])]),$([H0](-[NX3;!$(N([#6X3]=[#7X2])C=[O,S])])=,:[#7X2;!$(N(=[#6X3][#7X3])C=[O,S])])] +# one of the two C~N bonds is part of a ring (may be aromatic), but not both - thus no imidazole + +Imidoylhalide: [CX3R0;$([H0][#6]),$([H1])](=[NX2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[FX1,ClX1,BrX1,IX1] +# not cyclic + +Imidoylhalide_cyclic: [#6R][#6X3R](=,:[#7X2;$([H1]),$([H0][#6;!$(C=[O,N,S])])])[FX1,ClX1,BrX1,IX1] +# may also be aromatic + +# may be ring, aromatic, substituted with carbonyls, hetero, ... +# (everything else would get too complicated) + +Amidrazone: [$([$([#6X3][#6]),$([#6X3H])](=[#7X2v3])[#7X3v3][#7X3v3]),$([$([#6X3][#6]),$([#6X3H])]([#7X3v3])=[#7X2v3][#7X3v3])] +# hits both tautomers. as above, it may be ring, aromatic, substituted with carbonyls, hetero, ... 
+ + +Alpha_aminoacid: [NX3,NX4+;!$([N]~[!#6]);!$([N]*~[#7,#8,#15,#16])][C][CX3](=[OX1])[OX2H,OX1-] +# N may be alkylated, but not part of an amide (as in peptides), ionic forms are included +# includes also non-natural aminoacids with double-bonded or two aliph./arom. substituents at alpha-C +# N may not be aromatic as in 1H-pyrrole-2-carboxylic acid + +Alpha_hydroxyacid: [OX2H][C][CX3](=[OX1])[OX2H,OX1-] + +Peptide_middle: [NX3;$([N][CX3](=[OX1])[C][NX3,NX4+])][C][CX3](=[OX1])[NX3;$([N][C][CX3](=[OX1])[NX3,OX2,OX1-])] +# finds peptidic structures which are neither C- nor N-terminal. Both neighbours must be amino-acids/peptides + +Peptide_C_term: [NX3;$([N][CX3](=[OX1])[C][NX3,NX4+])][C][CX3](=[OX1])[OX2H,OX1-] +# finds C-terminal amino acids + +Peptide_N_term: [NX3,NX4+;!$([N]~[!#6]);!$([N]*~[#7,#8,#15,#16])][C][CX3](=[OX1])[NX3;$([N][C][CX3](=[OX1])[NX3,OX2,OX1-])] +# finds N-terminal amino acids. As above, N may be substituted, but not part of an amide-bond. + + +Carboxylic_orthoester: [#6][OX2][CX4;$(C[#6]),$([CH])]([OX2][#6])[OX2][#6] +# hits also anhydride like struktures (e. g. HC(OMe)2-OC=O residues) + +Ketene: [CX3]=[CX2]=[OX1] + +Ketenacetal: [#7X2,#8X3,#16X2;$(*[#6,#14])][#6X3]([#7X2,#8X3,#16X2;$(*[#6,#14])])=[#6X3] +# includes aminals, silylacetals, ketenesters, etc. 
C=C DB is not aromatic, everything else may be + +Nitrile: [NX1]#[CX2] +# includes cyanhydrines + +Isonitrile: [CX1-]#[NX2+] + + +Vinylogous_carbonyl_or_carboxyl_derivative: [#6X3](=[OX1])[#6X3]=,:[#6X3][#7,#8,#16,F,Cl,Br,I] +# may be part of a ring, even aromatic + +Vinylogous_acid: [#6X3](=[OX1])[#6X3]=,:[#6X3][$([OX2H]),$([OX1-])] + +Vinylogous_ester: [#6X3](=[OX1])[#6X3]=,:[#6X3][#6;!$(C=[O,N,S])] + +Vinylogous_amide: [#6X3](=[OX1])[#6X3]=,:[#6X3][#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Vinylogous_halide: [#6X3](=[OX1])[#6X3]=,:[#6X3][FX1,ClX1,BrX1,IX1] + + + +# I.5: Four Carbon-Hetero Bonds (Carbonic Acid and Derivatives) +# ----------------------------- + +Carbonic_acid_dieester: [#6;!$(C=[O,N,S])][#8X2][#6X3](=[OX1])[#8X2][#6;!$(C=[O,N,S])] +# may be part of a ring, even aromatic + +Carbonic_acid_esterhalide: [#6;!$(C=[O,N,S])][OX2;!R][CX3](=[OX1])[OX2][FX1,ClX1,BrX1,IX1] + +Carbonic_acid_monoester: [#6;!$(C=[O,N,S])][OX2;!R][CX3](=[OX1])[$([OX2H]),$([OX1-])] +# unstable + +Carbonic_acid_derivatives: [!#6][#6X3](=[!#6])[!#6] + + +Thiocarbonic_acid_dieester: [#6;!$(C=[O,N,S])][#8X2][#6X3](=[SX1])[#8X2][#6;!$(C=[O,N,S])] +# may be part of a ring, even aromatic + +Thiocarbonic_acid_esterhalide: [#6;!$(C=[O,N,S])][OX2;!R][CX3](=[SX1])[OX2][FX1,ClX1,BrX1,IX1] + +Thiocarbonic_acid_monoester: [#6;!$(C=[O,N,S])][OX2;!R][CX3](=[SX1])[$([OX2H]),$([OX1-])] + + +Urea:[#7X3;!$([#7][!#6])][#6X3](=[OX1])[#7X3;!$([#7][!#6])] +# no check whether part of imide, biuret, etc. Aromatic structures are only hit if +# both N share no double bonds, like in the dioxo-form of uracil + +Thiourea: [#7X3;!$([#7][!#6])][#6X3](=[SX1])[#7X3;!$([#7][!#6])] + +Isourea: [#7X2;!$([#7][!#6])]=,:[#6X3]([#8X2&!$([#8][!#6]),OX1-])[#7X3;!$([#7][!#6])] +# O may be substituted. no check whether further amide-like bonds are present. 
Aromatic +# structures are only hit if single bonded N shares no additional double bond, like in +# the 1-hydroxy-3-oxo form of uracil + +Isothiourea: [#7X2;!$([#7][!#6])]=,:[#6X3]([#16X2&!$([#16][!#6]),SX1-])[#7X3;!$([#7][!#6])] + +Guanidine: [N;v3X3,v4X4+][CX3](=[N;v3X2,v4X3+])[N;v3X3,v4X4+] +# also hits guanidinium salts. v3 and v4 to avoid nitroamidines + +Carbaminic_acid: [NX3]C(=[OX1])[O;X2H,X1-] +# quite unstable, unlikely to be found. Also hits salts + +Urethan: [#7X3][#6](=[OX1])[#8X2][#6] +# also hits when part of a ring, no check whether the last C is part of carbonyl + +Biuret: [#7X3][#6](=[OX1])[#7X3][#6](=[OX1])[#7X3] + +Semicarbazide: [#7X3][#7X3][#6X3]([#7X3;!$([#7][#7])])=[OX1] + +Carbazide: [#7X3][#7X3][#6X3]([#7X3][#7X3])=[OX1] + +Semicarbazone: [#7X2](=[#6])[#7X3][#6X3]([#7X3;!$([#7][#7])])=[OX1] + +Carbazone: [#7X2](=[#6])[#7X3][#6X3]([#7X3][#7X3])=[OX1] + +Thiosemicarbazide: [#7X3][#7X3][#6X3]([#7X3;!$([#7][#7])])=[SX1] + +Thiocarbazide: [#7X3][#7X3][#6X3]([#7X3][#7X3])=[SX1] + +Thiosemicarbazone: [#7X2](=[#6])[#7X3][#6X3]([#7X3;!$([#7][#7])])=[SX1] + +Thiocarbazone: [#7X2](=[#6])[#7X3][#6X3]([#7X3][#7X3])=[SX1] + + +Isocyanate: [NX2]=[CX2]=[OX1] + +Cyanate: [OX2][CX2]#[NX1] + +Isothiocyanate: [NX2]=[CX2]=[SX1] + +Thiocyanate: [SX2][CX2]#[NX1] + +Carbodiimide: [NX2]=[CX2]=[NX2] + +Orthocarbonic_derivatives: [CX4H0]([O,S,#7])([O,S,#7])([O,S,#7])[O,S,#7,F,Cl,Br,I] +# halogen allowed just once, to avoid mapping to -OCF3 and similar groups (much more +# stable as for example C(OCH3)4) + + +# I.6 Aromatics +# ------------- + +# I know that this classification is not very logical, arylamines are found under I.2 ... 
+ +Phenol: [OX2H][c] + +1,2-Diphenol: [OX2H][c][c][OX2H] + +Arylchloride: [Cl][c] + +Arylfluoride: [F][c] + +Arylbromide: [Br][c] + +Aryliodide: [I][c] + +Arylthiol: [SX2H][c] + +Iminoarene: [c]=[NX2;$([H1]),$([H0][#6;!$([C]=[N,S,O])])] +# N may be substituted with H or C, but not carbonyl or similar +# aromatic atom is always C, not S or P (these are not planar when substituted) + +Oxoarene: [c]=[OX1] + +Thioarene: [c]=[SX1] + +Hetero_N_basic_H: [nX3H1+0] +# as in pyrole. uncharged to exclude pyridinium ions + +Hetero_N_basic_no_H: [nX3H0+0] +# as in N-methylpyrole. uncharged to exclude pyridinium ions + +Hetero_N_nonbasic: [nX2,nX3+] +# as in pyridine, pyridinium + +Hetero_O: [o] + +Hetero_S: [sX2] +# X2 because Daylight's depictmatch falsely describes C1=CS(=O)C=C1 as aromatic +# (is not planar because of lonepair at S) + +Heteroaromatic: [a;!c] + + +# Part II: N, S, P, Si, B +# ======================= + + +# II.1 Nitrogen +# ------------- + +Nitrite: [NX2](=[OX1])[O;$([X2]),$([X1-])] +# hits nitrous acid, its anion, esters, and other O-substituted derivatives + +Thionitrite: [SX2][NX2]=[OX1] + +Nitrate: [$([NX3](=[OX1])(=[OX1])[O;$([X2]),$([X1-])]),$([NX3+]([OX1-])(=[OX1])[O;$([X2]),$([X1-])])] +# hits nitric acid, its anion, esters, and other O-substituted derivatives + +Nitro: [$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8] +# hits nitro groups attached to C,N, ... but not nitrates + +Nitroso: [NX2](=[OX1])[!#7;!#8] +# no nitrites, no nitrosamines + +Azide: [NX1]~[NX2]~[NX2,NX1] +# hits both mesomeric forms, also anion + +Acylazide: [CX3](=[OX1])[NX2]~[NX2]~[NX1] + +Diazo: [$([#6]=[NX2+]=[NX1-]),$([#6-]-[NX2+]#[NX1])] + +Diazonium: [#6][NX2+]#[NX1] + +Nitrosamine: [#7;!$(N*=O)][NX2]=[OX1] + +Nitrosamide: [NX2](=[OX1])N-*=O +# includes nitrososulfonamides + +N-Oxide: [$([#7+][OX1-]),$([#7v5]=[OX1]);!$([#7](~[O])~[O]);!$([#7]=[#7])] +# Hits both forms. Won't hit azoxy, nitro, nitroso, or nitrate. 
+ + +Hydrazine: [NX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6]);!$(NC=[O,N,S])][NX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6]);!$(NC=[O,N,S])] +# no hydrazides + +Hydrazone: [NX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6]);!$(NC=[O,N,S])][NX2]=[#6] + +Hydroxylamine: [NX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6]);!$(NC=[O,N,S])][OX2;$([H1]),$(O[#6;!$(C=[N,O,S])])] +# no discrimination between O-, N-, and O,N-substitution + + +# II.2 Sulfur +# ----------- + +Sulfon: [$([SX4](=[OX1])(=[OX1])([#6])[#6]),$([SX4+2]([OX1-])([OX1-])([#6])[#6])] +# can't be aromatic, thus S and not #16 + +Sulfoxide: [$([SX3](=[OX1])([#6])[#6]),$([SX3+]([OX1-])([#6])[#6])] + +Sulfonium: [S+;!$([S]~[!#6]);!$([S]*~[#7,#8,#15,#16])] +# can't be aromatic, thus S and not #16 + +Sulfuric_acid: [SX4](=[OX1])(=[OX1])([$([OX2H]),$([OX1-])])[$([OX2H]),$([OX1-])] +# includes anions + +Sulfuric_monoester: [SX4](=[OX1])(=[OX1])([$([OX2H]),$([OX1-])])[OX2][#6;!$(C=[O,N,S])] + +Sulfuric_diester: [SX4](=[OX1])(=[OX1])([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])] + +Sulfuric_monoamide: [SX4](=[OX1])(=[OX1])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[$([OX2H]),$([OX1-])] + +Sulfuric_diamide: [SX4](=[OX1])(=[OX1])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Sulfuric_esteramide: [SX4](=[OX1])(=[OX1])([#7X3][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])] + +Sulfuric_derivative: [SX4D4](=[!#6])(=[!#6])([!#6])[!#6] +# everything else (would not be a "true" derivative of sulfuric acid, if one of the substituents were less electronegative +# than sulfur, but this should be very very rare, anyway) + + + +#### sulfurous acid and derivatives missing!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+ + + + +Sulfonic_acid: [SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[$([OX2H]),$([OX1-])] + +Sulfonamide: [SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Sulfonic_ester: [SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[OX2][#6;!$(C=[O,N,S])] + +Sulfonic_halide: [SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[FX1,ClX1,BrX1,IX1] + +Sulfonic_derivative: [SX4;$([H1]),$([H0][#6])](=[!#6])(=[!#6])[!#6] +# includes all of the above and many more +# for comparison: this is what "all sulfonic derivatives but not the ones above" would look like: +# [$([SX4;$([H1]),$([H0][#6])](=[!#6])(=[!#6;!O])[!#6]),$([SX4;$([H1]),$([H0][#6])](=[OX1])(=[OX1])[!$([FX1,ClX1,BrX1,IX1]);!$([#6]);!$([OX2H]);!$([OX1-]);!$([OX2][#6;!$(C=[O,N,S])]);!$([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])])] + + +Sulfinic_acid: [SX3;$([H1]),$([H0][#6])](=[OX1])[$([OX2H]),$([OX1-])] + +Sulfinic_amide: [SX3;$([H1]),$([H0][#6])](=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Sulfinic_ester: [SX3;$([H1]),$([H0][#6])](=[OX1])[OX2][#6;!$(C=[O,N,S])] + +Sulfinic_halide: [SX3;$([H1]),$([H0][#6])](=[OX1])[FX1,ClX1,BrX1,IX1] + +Sulfinic_derivative: [SX3;$([H1]),$([H0][#6])](=[!#6])[!#6] + +Sulfenic_acid: [SX2;$([H1]),$([H0][#6])][$([OX2H]),$([OX1-])] + +Sulfenic_amide: [SX2;$([H1]),$([H0][#6])][#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Sulfenic_ester: [SX2;$([H1]),$([H0][#6])][OX2][#6;!$(C=[O,N,S])] + +Sulfenic_halide: [SX2;$([H1]),$([H0][#6])][FX1,ClX1,BrX1,IX1] + +Sulfenic_derivative: [SX2;$([H1]),$([H0][#6])][!#6] + + +# II.3 Phosphorous +# ---------------- + +Phosphine: [PX3;$([H3]),$([H2][#6]),$([H1]([#6])[#6]),$([H0]([#6])([#6])[#6])] +# similar to amine, but less restrictive: includes also amide- and aminal-analogues + +Phosphine_oxide: 
[PX4;$([H3]=[OX1]),$([H2](=[OX1])[#6]),$([H1](=[OX1])([#6])[#6]),$([H0](=[OX1])([#6])([#6])[#6])] + +Phosphonium: [P+;!$([P]~[!#6]);!$([P]*~[#7,#8,#15,#16])] +# similar to Ammonium + +Phosphorylen: [PX4;$([H3]=[CX3]),$([H2](=[CX3])[#6]),$([H1](=[CX3])([#6])[#6]),$([H0](=[CX3])([#6])([#6])[#6])] + + +# conventions for the following acids and derivatives: +# acids find protonated and deprotonated acids +# esters do not find mixed anhydrides ( ...P-O-C(=O)) +# derivatives: subtituents which go in place of the OH and =O are not H or C (may also be O, +# thus including acids and esters) + +Phosphonic_acid: [PX4;$([H1]),$([H0][#6])](=[OX1])([$([OX2H]),$([OX1-])])[$([OX2H]),$([OX1-])] +# includes anions + +Phosphonic_monoester: [PX4;$([H1]),$([H0][#6])](=[OX1])([$([OX2H]),$([OX1-])])[OX2][#6;!$(C=[O,N,S])] + +Phosphonic_diester: [PX4;$([H1]),$([H0][#6])](=[OX1])([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])] + +Phosphonic_monoamide: [PX4;$([H1]),$([H0][#6])](=[OX1])([$([OX2H]),$([OX1-])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphonic_diamide: [PX4;$([H1]),$([H0][#6])](=[OX1])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphonic_esteramide: [PX4;$([H1]),$([H0][#6])](=[OX1])([OX2][#6;!$(C=[O,N,S])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphonic_acid_derivative: [PX4;$([H1]),$([H0][#6])](=[!#6])([!#6])[!#6] +# all of the above and much more + + +Phosphoric_acid: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([$([OX2H]),$([OX1-])])[$([OX2H]),$([OX1-])] +# includes anions + +Phosphoric_monoester: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([$([OX2H]),$([OX1-])])[OX2][#6;!$(C=[O,N,S])] + +Phosphoric_diester: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])] + +Phosphoric_triester: 
[PX4D4](=[OX1])([OX2][#6;!$(C=[O,N,S])])([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])] + +Phosphoric_monoamide: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([$([OX2H]),$([OX1-])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphoric_diamide: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphoric_triamide: [PX4D4](=[OX1])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphoric_monoestermonoamide: [PX4D4](=[OX1])([$([OX2H]),$([OX1-])])([OX2][#6;!$(C=[O,N,S])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphoric_diestermonoamide: [PX4D4](=[OX1])([OX2][#6;!$(C=[O,N,S])])([OX2][#6;!$(C=[O,N,S])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphoric_monoesterdiamide: [PX4D4](=[OX1])([OX2][#6;!$(C=[O,N,S])])([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphoric_acid_derivative: [PX4D4](=[!#6])([!#6])([!#6])[!#6] + + +Phosphinic_acid: [PX4;$([H2]),$([H1][#6]),$([H0]([#6])[#6])](=[OX1])[$([OX2H]),$([OX1-])] + +Phosphinic_ester: [PX4;$([H2]),$([H1][#6]),$([H0]([#6])[#6])](=[OX1])[OX2][#6;!$(C=[O,N,S])] + +Phosphinic_amide: [PX4;$([H2]),$([H1][#6]),$([H0]([#6])[#6])](=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphinic_acid_derivative: [PX4;$([H2]),$([H1][#6]),$([H0]([#6])[#6])](=[!#6])[!#6] + + +Phosphonous_acid: 
[PX3;$([H1]),$([H0][#6])]([$([OX2H]),$([OX1-])])[$([OX2H]),$([OX1-])] + +Phosphonous_monoester: [PX3;$([H1]),$([H0][#6])]([$([OX2H]),$([OX1-])])[OX2][#6;!$(C=[O,N,S])] + +Phosphonous_diester: [PX3;$([H1]),$([H0][#6])]([OX2][#6;!$(C=[O,N,S])])[OX2][#6;!$(C=[O,N,S])] + +Phosphonous_monoamide: [PX3;$([H1]),$([H0][#6])]([$([OX2H]),$([OX1-])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphonous_diamide: [PX3;$([H1]),$([H0][#6])]([#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphonous_esteramide: [PX3;$([H1]),$([H0][#6])]([OX2][#6;!$(C=[O,N,S])])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphonous_derivatives: [PX3;$([D2]),$([D3][#6])]([!#6])[!#6] + + +Phosphinous_acid: [PX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6])][$([OX2H]),$([OX1-])] + +Phosphinous_ester: [PX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6])][OX2][#6;!$(C=[O,N,S])] + +Phosphinous_amide: [PX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6])][#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])] + +Phosphinous_derivatives: [PX3;$([H2]),$([H1][#6]),$([H0]([#6])[#6])][!#6] + + +# II.4 Silicon +# ------------ + +Quart_silane: [SiX4]([#6])([#6])([#6])[#6] +# four C-substituents. non-reactive, non-toxic, in experimental phase for drug development + +Non-quart_silane: [SiX4;$([H1]([#6])([#6])[#6]),$([H2]([#6])[#6]),$([H3][#6]),$([H4])] +# has 1-4 hydride(s), reactive. 
Daylight's depictmatch does not add hydrogens automatically to +# the free positions at Si, thus Hs had to be added implicitly + +Silylmonohalide: [SiX4]([FX1,ClX1,BrX1,IX1])([#6])([#6])[#6] +# reagents for inserting protection groups + +Het_trialkylsilane: [SiX4]([!#6])([#6])([#6])[#6] +# mostly acid-labile protection groups such as trimethylsilyl-ethers + +Dihet_dialkylsilane: [SiX4]([!#6])([!#6])([#6])[#6] + +Trihet_alkylsilane: [SiX4]([!#6])([!#6])([!#6])[#6] + +Silicic_acid_derivative: [SiX4]([!#6])([!#6])([!#6])[!#6] +# four substituent which are neither C nor H + + +# II.5 Boron +# ---------- + +Trialkylborane: [BX3]([#6])([#6])[#6] +# also carbonyls allowed + +Boric_acid_derivatives: [BX3]([!#6])([!#6])[!#6] +# includes acids, esters, amides, ... H-substituent at B is very rare. + +Boronic_acid_derivative: [BX3]([!#6])([!#6])[!#6] +# # includes acids, esters, amides, ... + +Borohydride: [BH1,BH2,BH3,BH4] +# at least one H attached to B + +Quaternary_boron: [BX4] +# mostly borates (negative charge), in complex with Lewis-base + + + +# Part III: Some Special Patterns +# =============================== + + +# III.1 Chains +# ------------ + +# some simple chains + + + +# III.2 Rings +# ----------- + +Aromatic: a + +Heterocyclic: [!#6;!R0] +# may be aromatic or not + +Epoxide: [OX2r3]1[#6r3][#6r3]1 +# toxic/reactive. may be annelated to aromat, but must not be aromatic itself (oxirane-2,3-dione) + +NH_aziridine: [NX3H1r3]1[#6r3][#6r3]1 +# toxic/reactive according to Maybridge's garbage filter + +Spiro: [D4R;$(*(@*)(@*)(@*)@*)] +# at least two different rings can be found which are sharing just one atom. 
+# these two rings can be connected by a third ring, so it matches also some +# bridged systems, like morphine + +Annelated_rings: [R;$(*(@*)(@*)@*);!$([R2;$(*(@*)(@*)(@*)@*)])]@[R;$(*(@*)(@*)@*);!$([R2;$(*(@*)(@*)(@*)@*)])] +# two different rings sharing exactly two atoms + +Bridged_rings: [R;$(*(@*)(@*)@*);!$([D4R;$(*(@*)(@*)(@*)@*)]);!$([R;$(*(@*)(@*)@*);!$([R2;$(*(@*)(@*)(@*)@*)])]@[R;$(*(@*)(@*)@*);!$([R2;$(*(@*)(@*)(@*)@*)])])] +# part of two or more rings, not spiro, not annelated -> finds bridgehead atoms, +# but only if they are not annelated at the same time - otherwise impossible (?) +# to distinguish from non-bridgehead annelated atoms + +# some basic ring-patterns (just size, no other information): + + + + + +# III.3 Sugars and Nucleosides/Nucleotides, Steroids +# -------------------------------------------------- + +# because of the large variety of sugar derivatives, different patterns can be applied. +# The choice of patterns and their combinations will depend on the contents of the database +# e.g. natural products, nucleoside analoges with modified sugars, ... as well as on the +# desired restriction + + +Sugar_pattern_1: [OX2;$([r5]1@C@C@C(O)@C1),$([r6]1@C@C@C(O)@C(O)@C1)] +# 5 or 6-membered ring containing one O and at least one (r5) or two (r6) oxygen-substituents. + +Sugar_pattern_2: [OX2;$([r5]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)] +# 5 or 6-membered ring containing one O and an acetal-like bond at postion 2. 
+ +Sugar_pattern_combi: [OX2;$([r5]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C(O)@C1),$([r6]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C(O)@C(O)@C1)] +# combination of the two above + +Sugar_pattern_2_reducing: [OX2;$([r5]1@C(!@[OX2H1])@C@C@C1),$([r6]1@C(!@[OX2H1])@C@C@C@C1)] +# 5 or 6-membered cyclic hemi-acetal + +Sugar_pattern_2_alpha: [OX2;$([r5]1@[C@@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@[C@@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)] +# 5 or 6-membered cyclic hemi-acetal + +Sugar_pattern_2_beta: [OX2;$([r5]1@[C@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@[C@](!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)] +# 5 or 6-membered cyclic hemi-acetal + +##Poly_sugar_1: ([OX2;$([r5]1@C@C@C(O)@C1),$([r6]1@C@C@C(O)@C(O)@C1)].[OX2;$([r5]1@C@C@C(O)@C1),$([r6]1@C@C@C(O)@C(O)@C1)]) +# pattern1 occours more than once (in same molecule, but moieties don't have to be adjacent!) + +##Poly_sugar_2: ([OX2;$([r5]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)].[OX2;$([r5]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C1),$([r6]1@C(!@[OX2,NX3,SX2,FX1,ClX1,BrX1,IX1])@C@C@C@C1)]) +# pattern2 occours more than once (in same molecule, but moieties don't have to be adjacent!) + + +# III.4 Everything else... +# ------------------------ + +Conjugated_double_bond: *=*[*]=,#,:[*] + +Conjugated_tripple_bond: *#*[*]=,#,:[*] + +Cis_double_bond: */[D2]=[D2]\* +# only one single-bonded substituent on each DB-atom. no aromats. +# only found when character of DB is explicitely stated. + +Trans_double_bond: */[D2]=[D2]/* +# analog + +Mixed_anhydrides: [$(*=O),$([#16,#14,#5]),$([#7]([#6]=[OX1]))][#8X2][$(*=O),$([#16,#14,#5]),$([#7]([#6]=[OX1]))] +# should hits all combinations of two acids + +Halogen_on_hetero: [FX1,ClX1,BrX1,IX1][!#6] + +Halogen_multi_subst: [F,Cl,Br,I;!$([X1]);!$([X0-])] +# Halogen which is not mono-substituted nor an anion, e.g. chlorate. +# Most of these cases should be also filtered by Halogen_on_hetero. 
+ +Trifluoromethyl: [FX1][CX4;!$([H0][Cl,Br,I]);!$([F][C]([F])([F])[F])]([FX1])([FX1]) +# C with three F attached, connected to anything which is not another halogen + +C_ONS_bond: [#6]~[#7,#8,#16] +# probably all drug-like molecules have at least one O, N, or S connected to a C -> nice filter + +## Mixture: (*).(*) +# two or more seperate parts, may also be salt +# component-level grouping is not yet supported in Open Babel Version 2.0 + + +Charged: [!+0] + +Anion: [-1,-2,-3,-4,-5,-6,-7] + +Kation: [+1,+2,+3,+4,+5,+6,+7] + +Salt: ([-1,-2,-3,-4,-5,-6,-7]).([+1,+2,+3,+4,+5,+6,+7]) +# two or more seperate components with opposite charges + +##Zwitterion: ([-1,-2,-3,-4,-5,-6,-7].[+1,+2,+3,+4,+5,+6,+7]) +# both negative and positive charges somewhere within the same molecule. + +1,3-Tautomerizable: [$([#7X2,OX1,SX1]=*[!H0;!$([a;!n])]),$([#7X3,OX2,SX2;!H0]*=*),$([#7X3,OX2,SX2;!H0]*:n)] +# 1,3 migration of H allowed. Includes keto/enol and amide/enamide. +# Aromatic rings must stay aromatic - no keto form of phenol + +1,5-Tautomerizable: [$([#7X2,OX1,SX1]=,:**=,:*[!H0;!$([a;!n])]),$([#7X3,OX2,SX2;!H0]*=**=*),$([#7X3,OX2,SX2;!H0]*=,:**:n)] + +Rotatable_bond: [!$(*#*)&!D1]-!@[!$(*#*)&!D1] +# taken from http://www.daylight.com/support/contrib/smarts/content.html + +Michael_acceptor: [CX3]=[CX3][$([CX3]=[O,N,S]),$(C#[N]),$([S,P]=[OX1]),$([NX3]=O),$([NX3+](=O)[O-])] +# the classical case: C=C near carbonyl, nitrile, nitro, or similar +# Oxo-heteroaromats and similar are not included. + +Dicarbodiazene: [CX3](=[OX1])[NX2]=[NX2][CX3](=[OX1]) +# Michael-like acceptor, see Mitsunobu reaction + +# H-Bond_donor: + +# H-Bond_acceptor: + +# Pos_ionizable: + +# Neg_ionizable: + +# Unlikely_ions: +# O+,N-,C+,C-, ... 
+ +CH-acidic: [$([CX4;!$([H0]);!$(C[!#6;!$([P,S]=O);!$(N(~O)~O)])][$([CX3]=[O,N,S]),$(C#[N]),$([S,P]=[OX1]),$([NX3]=O),$([NX3+](=O)[O-]);!$(*[S,O,N;H1,H2]);!$([*+0][S,O;X1-])]),$([CX4;!$([H0])]1[CX3]=[CX3][CX3]=[CX3]1)] +# C-H alpha to carbony, nitro or similar, C is not double-bonded, only C, H, S,P=O and nitro substituents allowed. +# pentadiene is included. acids, their salts, prim./sec. amides, and imides are excluded. +# hits also CH-acidic_strong + +CH-acidic_strong: [CX4;!$([H0]);!$(C[!#6;!$([P,S]=O);!$(N(~O)~O)])]([$([CX3]=[O,N,S]),$(C#[N]),$([S,P]=[OX1]),$([NX3]=O),$([NX3+](=O)[O-]);!$(*[S,O,N;H1,H2]);!$([*+0][S,O;X1-])])[$([CX3]=[O,N,S]),$(C#[N]),$([S,P]=[OX1]),$([NX3]=O),$([NX3+](=O)[O-]);!$(*[S,O,N;H1,H2]);!$([*+0][S,O;X1-])] +# same as above (without pentadiene), but carbonyl or similar on two or three sides + +Chiral_center_specified: [$([*@](~*)(~*)(*)*),$([*@H](*)(*)*),$([*@](~*)(*)*),$([*@H](~*)~*)] +# Hits atoms with tetrahedral chirality, if chiral center is specified in the SMILES string +# depictmach does not find oxonium, sulfonium, or sulfoxides! + +# Chiral_center_unspecified: [$([*@?](~*)(~*)(*)*),$([*@?H](*)(*)*),$([*@?](~*)(*)*),$([*@?H](~*)~*)] +# Hits atoms with tetrahedral chirality, if chiral center is not specified in the SMILES string +# "@?" (unspecified chirality) is not yet supported in Open Babel Version 2.0 +
\ No newline at end of file diff --git a/lib/algorithm.rb b/lib/algorithm.rb new file mode 100644 index 0000000..113f847 --- /dev/null +++ b/lib/algorithm.rb @@ -0,0 +1,21 @@ +module OpenTox + + module Algorithm + + # Generic method to execute algorithms + # Algorithms should: + # - accept a Compound, an Array of Compounds or a Dataset as first argument + # - optional parameters as second argument + # - return an object corresponding to the input type as result (eg. Compound -> value, Array of Compounds -> Array of values, Dataset -> Dataset with values + # @param [OpenTox::Compound,Array,OpenTox::Dataset] Input object + # @param [Hash] Algorithm parameters + # @return Algorithm result + def self.run algorithm, object, parameters=nil + bad_request_error "Cannot run '#{algorithm}' algorithm. Please provide an OpenTox::Algorithm." unless algorithm =~ /^OpenTox::Algorithm/ + klass,method = algorithm.split('.') + parameters.nil? ? Object.const_get(klass).send(method,object) : Object.const_get(klass).send(method,object, parameters) + end + + end +end + diff --git a/lib/bbrc.rb b/lib/bbrc.rb new file mode 100644 index 0000000..c83b9b3 --- /dev/null +++ b/lib/bbrc.rb @@ -0,0 +1,165 @@ +module OpenTox + module Algorithm + class Fminer + TABLE_OF_ELEMENTS = [ +"H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Uut", "Fl", "Uup", "Lv", "Uus", "Uuo"] + + # + # Run bbrc algorithm 
on dataset + # + # @param [OpenTox::Dataset] training dataset + # @param [optional] parameters BBRC parameters, accepted parameters are + # - min_frequency Minimum frequency (default 5) + # - feature_type Feature type, can be 'paths' or 'trees' (default "trees") + # - backbone BBRC classes, pass 'false' to switch off mining for BBRC representatives. (default "true") + # - min_chisq_significance Significance threshold (between 0 and 1) + # - nr_hits Set to "true" to get hit count instead of presence + # - get_target Set to "true" to obtain target variable as feature + # @return [OpenTox::Dataset] Fminer Dataset + def self.bbrc training_dataset, params={} + + time = Time.now + bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 + + prediction_feature = training_dataset.features.first + if params[:min_frequency] + minfreq = params[:min_frequency] + else + per_mil = 5 # value from latest version + per_mil = 8 # as suggested below + i = training_dataset.feature_ids.index prediction_feature.id + nr_labeled_cmpds = training_dataset.data_entries.select{|de| !de[i].nil?}.size + minfreq = per_mil * nr_labeled_cmpds.to_f / 1000.0 # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST + minfreq = 2 unless minfreq > 2 + minfreq = minfreq.round + end + + @bbrc ||= Bbrc::Bbrc.new + @bbrc.Reset + if prediction_feature.numeric + @bbrc.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations! 
+ else + bad_request_error "No accept values for "\ + "dataset '#{training_dataset.id}' and "\ + "feature '#{prediction_feature.id}'" unless prediction_feature.accept_values + value2act = Hash[[*prediction_feature.accept_values.map.with_index]] + end + @bbrc.SetMinfreq(minfreq) + @bbrc.SetType(1) if params[:feature_type] == "paths" + @bbrc.SetBackbone(false) if params[:backbone] == "false" + @bbrc.SetChisqSig(params[:min_chisq_significance].to_f) if params[:min_chisq_significance] + @bbrc.SetConsoleOut(false) + + params[:nr_hits] ? nr_hits = params[:nr_hits] : nr_hits = false + feature_dataset = FminerDataset.new( + :training_dataset_id => training_dataset.id, + :training_algorithm => "#{self.to_s}.bbrc", + :training_feature_id => prediction_feature.id , + :training_parameters => { + :min_frequency => minfreq, + :nr_hits => nr_hits, + :backbone => (params[:backbone] == false ? false : true) + } + + ) + feature_dataset.compounds = training_dataset.compounds + + # add data + training_dataset.compounds.each_with_index do |compound,i| + act = value2act[training_dataset.data_entries[i].first] + if act # TODO check if this works + @bbrc.AddCompound(compound.smiles,i+1) + @bbrc.AddActivity(act,i+1) + end + end + #g_median=@fminer.all_activities.values.to_scale.median + + #task.progress 10 + #step_width = 80 / @bbrc.GetNoRootNodes().to_f + + $logger.debug "BBRC setup: #{Time.now-time}" + time = Time.now + ftime = 0 + itime = 0 + rtime = 0 + + # run @bbrc + (0 .. @bbrc.GetNoRootNodes()-1).each do |j| + results = @bbrc.MineRoot(j) + results.each do |result| + rt = Time.now + f = YAML.load(result)[0] + smarts = f.shift + # convert fminer SMARTS representation into a more human readable format + smarts.gsub!(%r{\[#(\d+)&(\w)\]}) do + element = TABLE_OF_ELEMENTS[$1.to_i-1] + $2 == "a" ? element.downcase : element + end + p_value = f.shift + f.flatten! 
+ compound_idxs = f.collect{|e| e.first.first-1} + # majority class + effect = compound_idxs.collect{|i| training_dataset.data_entries[i].first}.mode + +=begin + if (!@bbrc.GetRegression) + id_arrs = f[2..-1].flatten + max = OpenTox::Algorithm::Fminer.effect(f[2..-1].reverse, @fminer.db_class_sizes) # f needs reversal for bbrc + effect = max+1 + else #regression part + id_arrs = f[2] + # DV: effect calculation + f_arr=Array.new + f[2].each do |id| + id=id.keys[0] # extract id from hit count hash + f_arr.push(@fminer.all_activities[id]) + end + f_median=f_arr.to_scale.median + if g_median >= f_median + effect = 'activating' + else + effect = 'deactivating' + end + end +=end + rtime += Time.now - rt + + ft = Time.now + feature = OpenTox::FminerSmarts.find_or_create_by({ + "smarts" => smarts, + "p_value" => p_value.to_f.abs.round(5), + "effect" => effect, + "dataset_id" => feature_dataset.id + }) + feature_dataset.feature_ids << feature.id + ftime += Time.now - ft + + it = Time.now + f.each do |id_count_hash| + id_count_hash.each do |id,count| + nr_hits ? 
count = count.to_i : count = 1 + feature_dataset.data_entries[id-1] ||= [] + feature_dataset.data_entries[id-1][feature_dataset.feature_ids.size-1] = count + end + end + itime += Time.now - it + + end + end + + $logger.debug "Fminer: #{Time.now-time} (read: #{rtime}, iterate: #{itime}, find/create Features: #{ftime})" + time = Time.now + + feature_dataset.fill_nil_with 0 + + $logger.debug "Prepare save: #{Time.now-time}" + time = Time.now + feature_dataset.save_all + + $logger.debug "Save: #{Time.now-time}" + feature_dataset + + end + end + end +end diff --git a/lib/classification.rb b/lib/classification.rb new file mode 100644 index 0000000..b4b2e59 --- /dev/null +++ b/lib/classification.rb @@ -0,0 +1,110 @@ +module OpenTox + module Algorithm + + class Classification + + def self.weighted_majority_vote compound, params + neighbors = params[:neighbors] + return {:value => nil,:confidence => nil,:warning => "Cound not find similar compounds."} if neighbors.empty? + weighted_sum = {} + sim_sum = 0.0 + confidence = 0.0 + neighbors.each do |row| + n,sim,acts = row + #confidence = sim if sim > confidence # distance to nearest neighbor + acts.each do |act| + weighted_sum[act] ||= 0 + weighted_sum[act] += sim + end + end + case weighted_sum.size + when 1 + return {:value => weighted_sum.keys.first, :confidence => weighted_sum.values.first/neighbors.size.abs} + when 2 + sim_sum = weighted_sum[weighted_sum.keys[0]] + sim_sum -= weighted_sum[weighted_sum.keys[1]] + sim_sum > 0 ? prediction = weighted_sum.keys[0] : prediction = weighted_sum.keys[1] + confidence = (sim_sum/neighbors.size).abs + return {:value => prediction,:confidence => confidence} + else + bad_request_error "Cannot predict more than 2 classes, multinomial classifications is not yet implemented. 
Received classes were: '#{weighted.sum.keys}'" + end + end + + # Classification with majority vote from neighbors weighted by similarity + # @param [Hash] params Keys `:activities, :sims, :value_map` are required + # @return [Numeric] A prediction value. + def self.fminer_weighted_majority_vote neighbors, training_dataset + + neighbor_contribution = 0.0 + confidence_sum = 0.0 + + $logger.debug "Weighted Majority Vote Classification." + + values = neighbors.collect{|n| n[2]}.uniq + neighbors.each do |neighbor| + i = training_dataset.compound_ids.index n.id + neighbor_weight = neighbor[1] + activity = values.index(neighbor[2]) + 1 # map values to integers > 1 + neighbor_contribution += activity * neighbor_weight + if values.size == 2 # AM: provide compat to binary classification: 1=>false 2=>true + case activity + when 1 + confidence_sum -= neighbor_weight + when 2 + confidence_sum += neighbor_weight + end + else + confidence_sum += neighbor_weight + end + end + if values.size == 2 + if confidence_sum >= 0.0 + prediction = values[1] + elsif confidence_sum < 0.0 + prediction = values[0] + end + elsif values.size == 1 # all neighbors have the same value + prediction = values[0] + else + prediction = (neighbor_contribution/confidence_sum).round # AM: new multinomial prediction + end + + confidence = (confidence_sum/neighbors.size).abs + {:value => prediction, :confidence => confidence.abs} + end + + # Local support vector regression from neighbors + # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required + # @return [Numeric] A prediction value. + def self.local_svm_classification(params) + + confidence = 0.0 + prediction = nil + + $logger.debug "Local SVM." 
+ if params[:activities].size>0 + if params[:props] + n_prop = params[:props][0].collect.to_a + q_prop = params[:props][1].collect.to_a + props = [ n_prop, q_prop ] + end + activities = params[:activities].collect.to_a + activities = activities.collect{|v| "Val" + v.to_s} # Convert to string for R to recognize classification + prediction = local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting + prediction = prediction.sub(/Val/,"") if prediction # Convert back + confidence = 0.0 if prediction.nil? + #$logger.debug "Prediction: '" + prediction.to_s + "' ('#{prediction.class}')." + confidence = get_confidence({:sims => params[:sims][1], :activities => params[:activities]}) + end + {:value => prediction, :confidence => confidence} + + end + + + + end + + end +end + diff --git a/lib/compound.rb b/lib/compound.rb new file mode 100644 index 0000000..7a3dc5c --- /dev/null +++ b/lib/compound.rb @@ -0,0 +1,370 @@ +# TODO: check +# *** Open Babel Error in ParseFile +# Could not find contribution data file. 
+ +CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/" + +module OpenTox + + class Compound + include OpenTox + + DEFAULT_FINGERPRINT = "MP2D" + + field :inchi, type: String + field :smiles, type: String + field :inchikey, type: String + field :names, type: Array + field :warning, type: String + field :cid, type: String + field :chemblid, type: String + field :png_id, type: BSON::ObjectId + field :svg_id, type: BSON::ObjectId + field :sdf_id, type: BSON::ObjectId + field :fingerprints, type: Hash, default: {} + field :default_fingerprint_size, type: Integer + + index({smiles: 1}, {unique: true}) + + # Overwrites standard Mongoid method to create fingerprints before database insertion + def self.find_or_create_by params + compound = self.find_or_initialize_by params + compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT) + compound.save + compound + end + + def fingerprint type="MP2D" + unless fingerprints[type] + return [] unless self.smiles + #http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format + if type == "MP2D" + fp = obconversion(smiles,"smi","mpd").strip.split("\t") + name = fp.shift # remove Title + fingerprints[type] = fp + #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html + elsif type== "MNA" + level = 2 # TODO: level as parameter, evaluate level 1, see paper + fp = obconversion(smiles,"smi","mna","xL\"#{level}\"").split("\n") + fp.shift # remove Title + fingerprints[type] = fp + else # standard fingerprints + fp = OpenBabel::OBFingerprint.find_fingerprint(type) + obmol = OpenBabel::OBMol.new + obconversion = OpenBabel::OBConversion.new + obconversion.set_in_format "smi" + obconversion.read_string obmol, self.smiles + result = OpenBabel::VectorUnsignedInt.new + fp.get_fingerprint(obmol,result) + # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i + #p OpenBabel::OBFingerprint.describe_bits(result) + # convert result to a list of the 
bits that are set + # from openbabel/scripts/python/pybel.py line 830 + # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints + result = result.to_a + bitsperint = OpenBabel::OBFingerprint.getbitsperint() + bits_set = [] + start = 1 + result.each do |x| + i = start + while x > 0 do + bits_set << i if (x % 2) == 1 + x >>= 1 + i += 1 + end + start += bitsperint + end + fingerprints[type] = bits_set + end + save + end + fingerprints[type] + end + + # Create a compound from smiles string + # @example + # compound = OpenTox::Compound.from_smiles("c1ccccc1") + # @param [String] smiles Smiles string + # @return [OpenTox::Compound] Compound + def self.from_smiles smiles + smiles = obconversion(smiles,"smi","can") + if smiles.empty? + return nil + #Compound.find_or_create_by(:warning => "SMILES parsing failed for '#{smiles}', this may be caused by an incorrect SMILES string.") + else + Compound.find_or_create_by :smiles => obconversion(smiles,"smi","can") + end + end + + # Create a compound from inchi string + # @param inchi [String] smiles InChI string + # @return [OpenTox::Compound] Compound + def self.from_inchi inchi + # Temporary workaround for OpenBabels Inchi bug + # http://sourceforge.net/p/openbabel/bugs/957/ + # bug has not been fixed in latest git/development version + #smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip + smiles = obconversion(inchi,"inchi","can") + if smiles.empty? 
+ Compound.find_or_create_by(:warning => "InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries.") + else + Compound.find_or_create_by(:smiles => smiles, :inchi => inchi) + end + end + + # Create a compound from sdf string + # @param sdf [String] smiles SDF string + # @return [OpenTox::Compound] Compound + def self.from_sdf sdf + # do not store sdf because it might be 2D + Compound.from_smiles obconversion(sdf,"sdf","can") + end + + # Create a compound from name. Relies on an external service for name lookups. + # @example + # compound = OpenTox::Compound.from_name("Benzene") + # @param name [String] can be also an InChI/InChiKey, CAS number, etc + # @return [OpenTox::Compound] Compound + def self.from_name name + Compound.from_smiles RestClientWrapper.get(File.join(CACTUS_URI,URI.escape(name),"smiles")) + end + + # Get InChI + # @return [String] InChI string + def inchi + unless self["inchi"] + + result = obconversion(smiles,"smi","inchi") + #result = `echo "#{self.smiles}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -ismi - -oinchi`.chomp + update(:inchi => result.chomp) if result and !result.empty? + end + self["inchi"] + end + + # Get InChIKey + # @return [String] InChIKey string + def inchikey + update(:inchikey => obconversion(smiles,"smi","inchikey")) unless self["inchikey"] + self["inchikey"] + end + + # Get (canonical) smiles + # @return [String] Smiles string + def smiles + update(:smiles => obconversion(self["smiles"],"smi","can")) unless self["smiles"] + self["smiles"] + end + + # Get sdf + # @return [String] SDF string + def sdf + if self.sdf_id.nil? 
+ sdf = obconversion(smiles,"smi","sdf") + file = Mongo::Grid::File.new(sdf, :filename => "#{id}.sdf",:content_type => "chemical/x-mdl-sdfile") + sdf_id = $gridfs.insert_one file + update :sdf_id => sdf_id + end + $gridfs.find_one(_id: self.sdf_id).data + end + + # Get SVG image + # @return [image/svg] Image data + def svg + if self.svg_id.nil? + svg = obconversion(smiles,"smi","svg") + file = Mongo::Grid::File.new(svg, :filename => "#{id}.svg", :content_type => "image/svg") + update(:svg_id => $gridfs.insert_one(file)) + end + $gridfs.find_one(_id: self.svg_id).data + + end + + # Get png image + # @example + # image = compound.png + # @return [image/png] Image data + def png + if self.png_id.nil? + png = obconversion(smiles,"smi","_png2") + file = Mongo::Grid::File.new(Base64.encode64(png), :filename => "#{id}.png", :content_type => "image/png") + update(:png_id => $gridfs.insert_one(file)) + end + Base64.decode64($gridfs.find_one(_id: self.png_id).data) + + end + + # Get all known compound names. Relies on an external service for name lookups. 
+ # @example + # names = compound.names + # @return [String] Compound names + def names + update(:names => RestClientWrapper.get("#{CACTUS_URI}#{inchi}/names").split("\n")) unless self["names"] + self["names"] + end + + # @return [String] PubChem Compound Identifier (CID), derieved via restcall to pubchem + def cid + pug_uri = "http://pubchem.ncbi.nlm.nih.gov/rest/pug/" + update(:cid => RestClientWrapper.post(File.join(pug_uri, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"] + self["cid"] + end + + # @return [String] ChEMBL database compound id, derieved via restcall to chembl + def chemblid + # https://www.ebi.ac.uk/chembldb/ws#individualCompoundByInChiKey + uri = "http://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json" + update(:chemblid => JSON.parse(RestClientWrapper.get(uri))["compounds"].first["chemblId"]) unless self["chemblid"] + self["chemblid"] + end + + def fingerprint_count_neighbors params + # TODO fix + neighbors = [] + query_fingerprint = self.fingerprint params[:type] + training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound| + unless self == compound + candidate_fingerprint = compound.fingerprint params[:type] + features = (query_fingerprint + candidate_fingerprint).uniq + min_sum = 0 + max_sum = 0 + features.each do |f| + min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax + min_sum += min + max_sum += max + end + max_sum == 0 ? sim = 0 : sim = min_sum/max_sum.to_f + neighbors << [compound.id, sim] if sim and sim >= params[:min_sim] + end + end + neighbors.sort{|a,b| b.last <=> a.last} + end + + def fingerprint_neighbors params + bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." 
unless params[:type] and params[:training_dataset_id] and params[:min_sim] + neighbors = [] + #if params[:type] == DEFAULT_FINGERPRINT + #neighbors = db_neighbors params + #p neighbors + #else + query_fingerprint = self.fingerprint params[:type] + training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound| + unless self == compound + candidate_fingerprint = compound.fingerprint params[:type] + sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f + neighbors << [compound.id, sim] if sim >= params[:min_sim] + end + end + #end + neighbors.sort{|a,b| b.last <=> a.last} + end + + def fminer_neighbors params + bad_request_error "Incorrect parameters for Compound#fminer_neighbors. Please provide :feature_dataset_id, :min_sim." unless params[:feature_dataset_id] and params[:min_sim] + feature_dataset = Dataset.find params[:feature_dataset_id] + query_fingerprint = Algorithm::Descriptor.smarts_match(self, feature_dataset.features) + neighbors = [] + + # find neighbors + feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i| + sim = Algorithm::Similarity.tanimoto candidate_fingerprint, query_fingerprint + if sim >= params[:min_sim] + neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming + end + end + neighbors + end + + def physchem_neighbors params + feature_dataset = Dataset.find params[:feature_dataset_id] + query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors] + neighbors = [] + feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i| + # TODO implement pearson and cosine similarity separatly + R.assign "x", query_fingerprint + R.assign "y", candidate_fingerprint + # pearson r + #sim = R.eval("cor(x,y,use='complete.obs',method='pearson')").to_ruby + #p "pearson" + #p sim + #p "cosine" + sim = R.eval("x %*% y / sqrt(x%*%x * 
y%*%y)").to_ruby.first + #p sim + if sim >= params[:min_sim] + neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming + end + end + neighbors + end + + def db_neighbors params + p "DB NEIGHBORS" + p params + # TODO restrict to dataset + # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb + qn = fingerprint(params[:type]).size + #qmin = qn * threshold + #qmax = qn / threshold + #not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...) + #reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)] + aggregate = [ + #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}}, + {'$match' => {'_id' => {'$ne' => self.id}}}, # remove self + {'$project' => { + 'tanimoto' => {'$let' => { + 'vars' => {'common' => {'$size' => {'$setIntersection' => ["'$#{DEFAULT_FINGERPRINT}'", DEFAULT_FINGERPRINT]}}}, + 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [qn, '$default_fingerprint_size']}, '$$common']}]} + }}, + '_id' => 1 + }}, + {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}}, + {'$sort' => {'tanimoto' => -1}} + ] + + $mongo["compounds"].aggregate(aggregate).collect{ |r| [r["_id"], r["tanimoto"]] } + + end + + private + + def self.obconversion(identifier,input_format,output_format,option=nil) + obconversion = OpenBabel::OBConversion.new + obconversion.set_options(option, OpenBabel::OBConversion::OUTOPTIONS) if option + obmol = OpenBabel::OBMol.new + obconversion.set_in_and_out_formats input_format, output_format + return nil if identifier.nil? 
+ obconversion.read_string obmol, identifier + case output_format + when /smi|can|inchi/ + obconversion.write_string(obmol).gsub(/\s/,'').chomp + when /sdf/ + # TODO: find disconnected structures + # strip_salts + # separate + obmol.add_hydrogens + builder = OpenBabel::OBBuilder.new + builder.build(obmol) + + sdf = obconversion.write_string(obmol) +print sdf + if sdf.match(/.nan/) + + $logger.warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure" + obconversion.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS) + sdf = obconversion.write_string(obmol) + if sdf.match(/.nan/) + $logger.warn "2D generation failed for compound #{identifier}, rendering without coordinates." + obconversion.remove_option("gen2D", OpenBabel::OBConversion::GENOPTIONS) + sdf = obconversion.write_string(obmol) + end + end + sdf + else + obconversion.write_string(obmol) + end + end + + def obconversion(identifier,input_format,output_format,option=nil) + self.class.obconversion(identifier,input_format,output_format,option) + end + end +end diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb new file mode 100644 index 0000000..cbffb7c --- /dev/null +++ b/lib/crossvalidation.rb @@ -0,0 +1,302 @@ +module OpenTox + + class CrossValidation + field :validation_ids, type: Array, default: [] + field :model_id, type: BSON::ObjectId + field :folds, type: Integer + field :nr_instances, type: Integer + field :nr_unpredicted, type: Integer + field :predictions, type: Array, default: [] + field :finished_at, type: Time + + def time + finished_at - created_at + end + + def validations + validation_ids.collect{|vid| Validation.find vid} + end + + def model + Model::Lazar.find model_id + end + + def self.create model, n=10 + model.training_dataset.features.first.nominal? ? klass = ClassificationCrossValidation : klass = RegressionCrossValidation + bad_request_error "#{dataset.features.first} is neither nominal nor numeric." 
unless klass
      # NOTE(review): `klass` is assigned by the ternary above and is always
      # truthy, so this bad_request_error can never fire; `dataset` is also
      # undefined in this scope (presumably model.training_dataset) - verify.
      cv = klass.new(
        name: model.name,
        model_id: model.id,
        folds: n
      )
      cv.save # set created_at
      nr_instances = 0
      nr_unpredicted = 0
      predictions = []
      training_dataset = Dataset.find model.training_dataset_id
      # one child process per fold; each Validation persists its results, the
      # parent collects them afterwards via the crossvalidation_id query below
      training_dataset.folds(n).each_with_index do |fold,fold_nr|
        fork do # parallel execution of validations
          $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
          t = Time.now
          validation = Validation.create(model, fold[0], fold[1],cv)
          $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
        end
      end
      Process.waitall
      cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id)
      # aggregate per-fold counts and predictions
      cv.validations.each do |validation|
        nr_instances += validation.nr_instances
        nr_unpredicted += validation.nr_unpredicted
        predictions += validation.predictions
      end
      cv.update_attributes(
        nr_instances: nr_instances,
        nr_unpredicted: nr_unpredicted,
        predictions: predictions.sort{|a,b| b[3] <=> a[3]} # sort according to confidence
      )
      $logger.debug "Nr unpredicted: #{nr_unpredicted}"
      cv.statistics
      cv
    end
  end

  # Crossvalidation of classification models: accuracy, (confidence-weighted)
  # confusion matrices, per-class true rates and predictivities.
  class ClassificationCrossValidation < CrossValidation

    field :accept_values, type: Array
    field :confusion_matrix, type: Array
    field :weighted_confusion_matrix, type: Array
    field :accuracy, type: Float
    field :weighted_accuracy, type: Float
    field :true_rate, type: Hash
    field :predictivity, type: Hash
    field :confidence_plot_id, type: BSON::ObjectId
    # TODO auc, f-measure (usability??)

    # Compute and persist classification statistics from self.predictions;
    # each entry is [compound_id, activity, prediction, confidence].
    # NOTE(review): the explicit [0]/[1] indexing assumes exactly two
    # accept_values (binary classification) - confirm for multi-class features.
    def statistics
      accept_values = Feature.find(model.prediction_feature_id).accept_values
      # NOTE(review): Array.new(size, 0) { ... } - the block supersedes the
      # default-value argument (Ruby emits a warning); the `,0` should be dropped.
      confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
      weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
      true_rate = {}
      predictivity = {}
      predictions.each do |pred|
        compound_id,activity,prediction,confidence = pred
        if activity and prediction and confidence.numeric?
          if prediction == activity
            if prediction == accept_values[0]
              confusion_matrix[0][0] += 1
              weighted_confusion_matrix[0][0] += confidence
            elsif prediction == accept_values[1]
              confusion_matrix[1][1] += 1
              weighted_confusion_matrix[1][1] += confidence
            end
          elsif prediction != activity
            if prediction == accept_values[0]
              confusion_matrix[0][1] += 1
              weighted_confusion_matrix[0][1] += confidence
            elsif prediction == accept_values[1]
              confusion_matrix[1][0] += 1
              weighted_confusion_matrix[1][0] += confidence
            end
          end
        else
          # NOTE(review): `nr_unpredicted += 1` creates a new block-local that
          # shadows the Mongoid field reader, so the right-hand side is nil and
          # this line raises NoMethodError when reached - should probably be
          # `self.nr_unpredicted += 1` or a local counter; verify.
          nr_unpredicted += 1 if prediction.nil?
        end
      end
      true_rate = {}
      predictivity = {}
      # per-class recall (true_rate) and precision (predictivity)
      accept_values.each_with_index do |v,i|
        true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
        predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
      end
      # total confidence mass, used to normalise the weighted accuracy
      confidence_sum = 0
      weighted_confusion_matrix.each do |r|
        r.each do |c|
          confidence_sum += c
        end
      end
      update_attributes(
        accept_values: accept_values,
        confusion_matrix: confusion_matrix,
        weighted_confusion_matrix: weighted_confusion_matrix,
        accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
        weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
        true_rate: true_rate,
        predictivity: predictivity,
        finished_at: Time.now
      )
      $logger.debug "Accuracy #{accuracy}"
    end

    # Plot accumulated accuracy over predictions sorted by confidence and
    # store the SVG in GridFS. (Definition continues past this chunk.)
    def confidence_plot
      tmpfile = "/tmp/#{id.to_s}_confidence.svg"
      accuracies = []
      confidences = []
      correct_predictions = 0
      incorrect_predictions = 0
      predictions.each do |p|
        if p[1] and p[2]
          p[1] == p [2] ?
correct_predictions += 1 : incorrect_predictions += 1
          accuracies << correct_predictions/(correct_predictions+incorrect_predictions).to_f
          confidences << p[3]

        end
      end
      R.assign "accuracy", accuracies
      R.assign "confidence", confidences
      R.eval "image = qplot(confidence,accuracy)+ylab('accumulated accuracy')+scale_x_reverse()"
      R.eval "ggsave(file='#{tmpfile}', plot=image)"
      file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg")
      plot_id = $gridfs.insert_one(file)
      update(:confidence_plot_id => plot_id)
      $gridfs.find_one(_id: confidence_plot_id).data
    end

    #Average area under roc 0.646
    #Area under roc 0.646
    #F measure carcinogen: 0.769, noncarcinogen: 0.348
  end

  # Crossvalidation of regression models: (weighted) RMSE/MAE on log10-scale
  # errors and squared Pearson correlation of -log-transformed values.
  class RegressionCrossValidation < CrossValidation

    field :rmse, type: Float
    field :mae, type: Float
    field :weighted_rmse, type: Float
    field :weighted_mae, type: Float
    field :r_squared, type: Float
    field :correlation_plot_id, type: BSON::ObjectId
    field :confidence_plot_id, type: BSON::ObjectId

    # Compute and persist regression statistics from self.predictions;
    # each entry is [compound_id, activity, prediction, confidence].
    def statistics
      rmse = 0
      weighted_rmse = 0
      rse = 0          # NOTE(review): rse/weighted_rse/rae/weighted_rae are
      weighted_rse = 0 # initialised but never used below - dead accumulators.
      mae = 0
      weighted_mae = 0
      rae = 0
      weighted_rae = 0
      confidence_sum = 0
      predictions.each do |pred|
        compound_id,activity,prediction,confidence = pred
        if activity and prediction
          # error on the log10 scale; assumes strictly positive activities and
          # predictions (log10 of <= 0 yields NaN/-Infinity) - TODO confirm
          error = Math.log10(prediction)-Math.log10(activity)
          rmse += error**2
          weighted_rmse += confidence*error**2
          mae += error.abs
          weighted_mae += confidence*error.abs
          confidence_sum += confidence
        else
          # NOTE(review): no `warnings` field/accessor is visible on
          # CrossValidation in this file - confirm it exists upstream.
          warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
          $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
        end
      end
      x = predictions.collect{|p| p[1]}
      y = predictions.collect{|p| p[2]}
      R.assign "measurement", x
      R.assign "prediction", y
      R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
      r = R.eval("r").to_ruby

      # NOTE(review): the divisors use the full predictions.size although the
      # error sums skip unpredicted entries, biasing mae/rmse low when
      # predictions are missing - verify this is intended.
      mae = mae/predictions.size
      weighted_mae = weighted_mae/confidence_sum
      rmse = Math.sqrt(rmse/predictions.size)
      weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
      update_attributes(
        mae: mae,
        rmse: rmse,
        weighted_mae: weighted_mae,
        weighted_rmse: weighted_rmse,
        r_squared: r**2,
        finished_at: Time.now
      )
      $logger.debug "R^2 #{r**2}"
      $logger.debug "RMSE #{rmse}"
      $logger.debug "MAE #{mae}"
    end

    # Return the n predictions with the largest relative error, together with
    # their neighbors. (Definition continues past this chunk.)
    # @param n [Integer,nil] number of entries to return (default 10)
    def misclassifications n=nil
      #n = predictions.size unless n
      n ||= 10
      model = Model::Lazar.find(self.model_id)
      training_dataset = Dataset.find(model.training_dataset_id)
      prediction_feature = training_dataset.features.first
      predictions.collect do |p|
        unless p.include? nil
          compound = Compound.find(p[0])
          neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters)
          neighbors.collect!
do |n| + neighbor = Compound.find(n[0]) + values = training_dataset.values(neighbor,prediction_feature) + { :smiles => neighbor.smiles, :similarity => n[1], :measurements => values} + end + { + :smiles => compound.smiles, + #:fingerprint => compound.fp4.collect{|id| Smarts.find(id).name}, + :measured => p[1], + :predicted => p[2], + #:relative_error => (Math.log10(p[1])-Math.log10(p[2])).abs/Math.log10(p[1]).to_f.abs, + :log_error => (Math.log10(p[1])-Math.log10(p[2])).abs, + :relative_error => (p[1]-p[2]).abs/p[1], + :confidence => p[3], + :neighbors => neighbors + } + end + end.compact.sort{|a,b| p a; b[:relative_error] <=> a[:relative_error]}[0..n-1] + end + + def confidence_plot + tmpfile = "/tmp/#{id.to_s}_confidence.svg" + sorted_predictions = predictions.collect{|p| [(Math.log10(p[1])-Math.log10(p[2])).abs,p[3]] if p[1] and p[2]}.compact + R.assign "error", sorted_predictions.collect{|p| p[0]} + R.assign "confidence", sorted_predictions.collect{|p| p[1]} + # TODO fix axis names + R.eval "image = qplot(confidence,error)" + R.eval "image = image + stat_smooth(method='lm', se=FALSE)" + R.eval "ggsave(file='#{tmpfile}', plot=image)" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_confidence_plot.svg") + plot_id = $gridfs.insert_one(file) + update(:confidence_plot_id => plot_id) + $gridfs.find_one(_id: confidence_plot_id).data + end + + def correlation_plot + unless correlation_plot_id + tmpfile = "/tmp/#{id.to_s}_correlation.svg" + x = predictions.collect{|p| p[1]} + y = predictions.collect{|p| p[2]} + attributes = Model::Lazar.find(self.model_id).attributes + attributes.delete_if{|key,_| key.match(/_id|_at/) or ["_id","creator","name"].include? key} + attributes = attributes.values.collect{|v| v.is_a?(String) ? 
v.sub(/OpenTox::/,'') : v}.join("\n") + R.assign "measurement", x + R.assign "prediction", y + R.eval "all = c(-log(measurement),-log(prediction))" + R.eval "range = c(min(all), max(all))" + R.eval "image = qplot(-log(prediction),-log(measurement),main='#{self.name}',asp=1,xlim=range, ylim=range)" + R.eval "image = image + geom_abline(intercept=0, slope=1)" + R.eval "ggsave(file='#{tmpfile}', plot=image)" + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_correlation_plot.svg") + plot_id = $gridfs.insert_one(file) + update(:correlation_plot_id => plot_id) + end + $gridfs.find_one(_id: correlation_plot_id).data + end + end + + class RepeatedCrossValidation + field :crossvalidation_ids, type: Array, default: [] + def self.create model, folds=10, repeats=3 + repeated_cross_validation = self.new + repeats.times do |n| + $logger.debug "Crossvalidation #{n+1} for #{model.name}" + repeated_cross_validation.crossvalidation_ids << CrossValidation.create(model, folds).id + end + repeated_cross_validation.save + repeated_cross_validation + end + def crossvalidations + crossvalidation_ids.collect{|id| CrossValidation.find(id)} + end + end + + +end diff --git a/lib/dataset.rb b/lib/dataset.rb new file mode 100644 index 0000000..60f3bb5 --- /dev/null +++ b/lib/dataset.rb @@ -0,0 +1,350 @@ +require 'csv' +require 'tempfile' + +module OpenTox + + class Dataset + + attr_writer :data_entries + + # associations like has_many, belongs_to deteriorate performance + field :feature_ids, type: Array, default: [] + field :compound_ids, type: Array, default: [] + field :data_entries_id, type: BSON::ObjectId#, default: [] + field :source, type: String + + # Save all data including data_entries + # Should be used instead of save + def save_all + dump = Marshal.dump(@data_entries) + file = Mongo::Grid::File.new(dump, :filename => "#{self.id.to_s}.data_entries") + entries_id = $gridfs.insert_one(file) + update(:data_entries_id => entries_id) + end + + # Readers + + 
    # Get all compounds
    # Lazily instantiates and caches the Compound objects for compound_ids.
    # @return [Array<OpenTox::Compound>]
    def compounds
      @compounds ||= self.compound_ids.collect{|id| OpenTox::Compound.find id}
      @compounds
    end

    # Get all features
    # Lazily instantiates and caches the Feature objects for feature_ids.
    # @return [Array<OpenTox::Feature>]
    def features
      @features ||= self.feature_ids.collect{|id| OpenTox::Feature.find(id)}
      @features
    end

    # Get all data_entries
    # Loads the Marshal-dumped 2D value array from GridFS (see save_all) and
    # caches it; validates that its dimensions match compound_ids/feature_ids.
    # @return [Array<Array>] compounds x features value matrix
    def data_entries
      unless @data_entries
        t = Time.now
        data_entry_file = $gridfs.find_one(_id: data_entries_id)
        if data_entry_file.nil?
          @data_entries = []
        else
          @data_entries = Marshal.load(data_entry_file.data)
          bad_request_error "Data entries (#{data_entries_id}) are not a 2D-Array" unless @data_entries.is_a? Array and @data_entries.first.is_a? Array
          unless @data_entries.first.size == feature_ids.size
            # TODO: fix (unknown) source of empty data_entries
            # NOTE(review): sleep-and-reload is a workaround for a suspected
            # GridFS race; the retried load is not re-validated afterwards.
            sleep 1
            data_entry_file = $gridfs.find_one(_id: data_entries_id)
            @data_entries = Marshal.load(data_entry_file.data)
          end
          bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.size} rows, but dataset (#{id}) has #{compound_ids.size} compounds" unless @data_entries.size == compound_ids.size
          # TODO: data_entries can be empty, poorly reproducible, mongo problem?
          bad_request_error "Data entries (#{data_entries_id}) have #{@data_entries.first.size} columns, but dataset (#{id}) has #{feature_ids.size} features" unless @data_entries.first.size == feature_ids.size
          #$logger.debug "Retrieving data: #{Time.now-t}"
        end
      end
      @data_entries
    end

    # Find data entry values for a given compound and feature
    # A compound may occur on several rows (duplicate measurements), hence an
    # Array is returned.
    # @param compound [OpenTox::Compound] OpenTox Compound object
    # @param feature [OpenTox::Feature] OpenTox Feature object
    # @return [Array] Data entry values
    def values(compound, feature)
      rows = compound_ids.each_index.select{|r| compound_ids[r] == compound.id }
      col = feature_ids.index feature.id
      rows.collect{|row| data_entries[row][col]}
    end

    # Writers

    # Set compounds
    # @param compounds [Array<OpenTox::Compound>]
    def compounds=(compounds)
      self.compound_ids = compounds.collect{|c| c.id}
    end

    # Set features
    # @param features [Array<OpenTox::Feature>]
    def features=(features)
      self.feature_ids = features.collect{|f| f.id}
    end

    # Dataset operations

    # Split a dataset into n folds
    # Rows are shuffled, then partitioned into n disjoint test sets; each fold
    # persists both datasets via save_all. (Definition continues past this chunk.)
    # @param [Integer] number of folds
    # @return [Array] Array with folds [training_dataset,test_dataset]
    def folds n
      len = self.compound_ids.size
      indices = (0..len-1).to_a.shuffle
      mid = (len/n)
      chunks = []
      start = 0
      1.upto(n) do |i|
        last = start+mid
        last = last-1 unless len%n >= i # distribute the len % n remainder rows
        test_idxs = indices[start..last] || []
        test_cids = test_idxs.collect{|i| self.compound_ids[i]}
        test_data_entries = test_idxs.collect{|i| self.data_entries[i]}
        test_dataset = self.class.new(:compound_ids => test_cids, :feature_ids => self.feature_ids, :data_entries => test_data_entries)
        training_idxs = indices-test_idxs
        training_cids = training_idxs.collect{|i| self.compound_ids[i]}
        training_data_entries = training_idxs.collect{|i| self.data_entries[i]}
        training_dataset = self.class.new(:compound_ids => training_cids, :feature_ids => self.feature_ids, :data_entries => training_data_entries)
        test_dataset.save_all
        training_dataset.save_all
        chunks << [training_dataset,test_dataset]
start = last+1 + end + chunks + end + + # Diagnostics + + def correlation_plot training_dataset + # TODO: create/store svg + R.assign "features", data_entries + R.assign "activities", training_dataset.data_entries.collect{|de| de.first} + R.eval "featurePlot(features,activities)" + end + + def density_plot + # TODO: create/store svg + R.assign "acts", data_entries.collect{|r| r.first }#.compact + R.eval "plot(density(-log(acts),na.rm= TRUE), main='-log(#{features.first.name})')" + end + + # Serialisation + + # converts dataset to csv format including compound smiles as first column, other column headers are feature names + # @return [String] + def to_csv(inchi=false) + CSV.generate() do |csv| #{:force_quotes=>true} + csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} + compounds.each_with_index do |c,i| + csv << [inchi ? c.inchi : c.smiles] + data_entries[i] + end + end + end + + + # Parsers + + # Create a dataset from file (csv,sdf,...) + # @param filename [String] + # @return [String] dataset uri + # TODO + #def self.from_sdf_file + #end + + # Create a dataset from CSV file + # TODO: document structure + def self.from_csv_file file, source=nil, bioassay=true + source ||= file + name = File.basename(file,".*") + dataset = self.find_by(:source => source, :name => name) + if dataset + $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})." + else + $logger.debug "Parsing #{file}." + table = CSV.read file, :skip_blanks => true + dataset = self.new(:source => source, :name => name) + dataset.parse_table table, bioassay + end + dataset + end + + # parse data in tabular format (e.g. from csv) + # does a lot of guesswork in order to determine feature types + def parse_table table, bioassay=true + + time = Time.now + + # features + feature_names = table.shift.collect{|f| f.strip} + warnings << "Duplicate features in table header." 
unless feature_names.size == feature_names.uniq.size + compound_format = feature_names.shift.strip + bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i + + numeric = [] + # guess feature types + feature_names.each_with_index do |f,i| + metadata = {:name => f} + values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact + types = values.collect{|v| v.numeric? ? true : false}.uniq + if values.size == 0 # empty feature + elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes + metadata["numeric"] = true + numeric[i] = true + else + metadata["nominal"] = true + metadata["accept_values"] = values + numeric[i] = false + end + if bioassay + if metadata["numeric"] + feature = NumericBioAssay.find_or_create_by(metadata) + elsif metadata["nominal"] + feature = NominalBioAssay.find_or_create_by(metadata) + end + else + metadata.merge({:measured => false, :calculated => true}) + if metadata["numeric"] + feature = NumericFeature.find_or_create_by(metadata) + elsif metadata["nominal"] + feature = NominalFeature.find_or_create_by(metadata) + end + end + feature_ids << feature.id if feature + end + + $logger.debug "Feature values: #{Time.now-time}" + time = Time.now + + r = -1 + compound_time = 0 + value_time = 0 + + # compounds and values + @data_entries = [] #Array.new(table.size){Array.new(table.first.size-1)} + + table.each_with_index do |vals,i| + ct = Time.now + identifier = vals.shift + warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty? + begin + case compound_format + when /SMILES/i + compound = OpenTox::Compound.from_smiles(identifier) + when /InChI/i + compound = OpenTox::Compound.from_inchi(identifier) + end + rescue + compound = nil + end + if compound.nil? 
+ # compound parsers may return nil + warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored." + next + end + # TODO insert empty compounds to keep positions? + compound_time += Time.now-ct + + r += 1 + unless vals.size == feature_ids.size # way cheaper than accessing features + warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored." + next + end + + compound_ids << compound.id + table.first.size == 0 ? @data_entries << Array.new(0) : @data_entries << Array.new(table.first.size-1) + + vals.each_with_index do |v,j| + if v.blank? + warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})." + next + elsif numeric[j] + @data_entries.last[j] = v.to_f + else + @data_entries.last[j] = v.strip + end + end + end + compounds.duplicates.each do |compound| + positions = [] + compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi} + warnings << "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." + end + + $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})" + time = Time.now + save_all + $logger.debug "Saving: #{Time.now-time}" + + end + + # Fill unset data entries + # @param any value + def fill_nil_with n + (0 .. compound_ids.size-1).each do |i| + @data_entries[i] ||= [] + (0 .. 
feature_ids.size-1).each do |j| + @data_entries[i][j] ||= n + end + end + end + + def scale + scaled_data_entries = Array.new(data_entries.size){Array.new(data_entries.first.size)} + centers = [] + scales = [] + feature_ids.each_with_index do |feature_id,col| + R.assign "x", data_entries.collect{|de| de[col]} + R.eval "scaled = scale(x,center=T,scale=T)" + centers[col] = R.eval("attr(scaled, 'scaled:center')").to_ruby + scales[col] = R.eval("attr(scaled, 'scaled:scale')").to_ruby + R.eval("scaled").to_ruby.each_with_index do |value,row| + scaled_data_entries[row][col] = value + end + end + scaled_dataset = ScaledDataset.new(attributes) + scaled_dataset["_id"] = BSON::ObjectId.new + scaled_dataset["_type"] = "OpenTox::ScaledDataset" + scaled_dataset.centers = centers + scaled_dataset.scales = scales + scaled_dataset.data_entries = scaled_data_entries + scaled_dataset.save_all + scaled_dataset + end + end + + # Dataset for lazar predictions + class LazarPrediction < Dataset + field :creator, type: String + field :prediction_feature_id, type: String + + def prediction_feature + Feature.find prediction_feature_id + end + + end + + # Dataset for descriptors (physchem) + class DescriptorDataset < Dataset + field :feature_calculation_algorithm, type: String + + end + + class ScaledDataset < DescriptorDataset + + field :centers, type: Array, default: [] + field :scales, type: Array, default: [] + + def original_value value, i + value * scales[i] + centers[i] + end + end + + # Dataset for fminer descriptors + class FminerDataset < DescriptorDataset + field :training_algorithm, type: String + field :training_dataset_id, type: BSON::ObjectId + field :training_feature_id, type: BSON::ObjectId + field :training_parameters, type: Hash + end + +end diff --git a/lib/descriptor.rb b/lib/descriptor.rb new file mode 100644 index 0000000..9733bde --- /dev/null +++ b/lib/descriptor.rb @@ -0,0 +1,248 @@ +require 'digest/md5' +ENV["JAVA_HOME"] ||= "/usr/lib/jvm/java-7-openjdk" +# TODO 
store descriptors in mongodb + +module OpenTox + + module Algorithm + + # Class for descriptor calculations + class Descriptor + include OpenTox + + JAVA_DIR = File.join(File.dirname(__FILE__),"..","java") + CDK_JAR = Dir[File.join(JAVA_DIR,"cdk-*jar")].last + JOELIB_JAR = File.join(JAVA_DIR,"joelib2.jar") + LOG4J_JAR = File.join(JAVA_DIR,"log4j.jar") + JMOL_JAR = File.join(JAVA_DIR,"Jmol.jar") + + obexclude = ["cansmi","cansmiNS","formula","InChI","InChIKey","s","smarts","title","L5"] + OBDESCRIPTORS = Hash[OpenBabel::OBDescriptor.list_as_string("descriptors").split("\n").collect do |d| + name,description = d.split(/\s+/,2) + ["Openbabel."+name,description] unless obexclude.include? name + end.compact.sort{|a,b| a[0] <=> b[0]}] + + cdk_desc = YAML.load(`java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptorInfo`) + CDKDESCRIPTORS = Hash[cdk_desc.collect { |d| ["Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''), d[:description]] }.sort{|a,b| a[0] <=> b[0]}] + CDKDESCRIPTOR_VALUES = cdk_desc.collect { |d| prefix="Cdk."+d[:java_class].split('.').last.sub(/Descriptor/,''); d[:names].collect{ |name| prefix+"."+name } }.flatten + + # exclude Hashcode (not a physchem property) and GlobalTopologicalChargeIndex (Joelib bug) + joelibexclude = ["MoleculeHashcode","GlobalTopologicalChargeIndex"] + # strip Joelib messages from stdout + JOELIBDESCRIPTORS = Hash[YAML.load(`java -classpath #{JOELIB_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptorInfo | sed '0,/---/d'`).collect do |d| + name = d[:java_class].sub(/^joelib2.feature.types./,'') + # impossible to obtain meaningful descriptions from JOELIb, see java/JoelibDescriptors.java + ["Joelib."+name, "no description available"] unless joelibexclude.include? 
name + end.compact.sort{|a,b| a[0] <=> b[0]}] + + DESCRIPTORS = OBDESCRIPTORS.merge(CDKDESCRIPTORS.merge(JOELIBDESCRIPTORS)) + DESCRIPTOR_VALUES = OBDESCRIPTORS.keys + CDKDESCRIPTOR_VALUES + JOELIBDESCRIPTORS.keys + + require_relative "unique_descriptors.rb" + + # Description of available descriptors + def self.description descriptor + lib = descriptor.split('.').first + case lib + when "Openbabel" + OBDESCRIPTORS[descriptor] + when "Cdk" + name = descriptor.split('.')[0..-2].join('.') + CDKDESCRIPTORS[name] + when "Joelib" + JOELIBDESCRIPTORS[descriptor] + when "lookup" + "Read feature values from a dataset" + end + end + + # Match an array of smarts features + def self.smarts_match compounds, smarts_features, count=false + bad_request_error "Compounds for smarts_match are empty" unless compounds + bad_request_error "Smarts features for smarts_match are empty" unless smarts_features + parse compounds + @count = count + obconversion = OpenBabel::OBConversion.new + obmol = OpenBabel::OBMol.new + obconversion.set_in_format('smi') + smarts_pattern = OpenBabel::OBSmartsPattern.new + smarts_features = [smarts_features] if smarts_features.is_a?(Feature) + @smarts = smarts_features.collect{|f| f.smarts} + @physchem_descriptors = nil + @data_entries = Array.new(@compounds.size){Array.new(@smarts.size,false)} + @compounds.each_with_index do |compound,c| + obconversion.read_string(obmol,compound.smiles) + @smarts.each_with_index do |smart,s| + smarts_pattern.init(smart) + if smarts_pattern.match(obmol) + count ? value = smarts_pattern.get_map_list.to_a.size : value = 1 + else + value = 0 + end + @data_entries[c][s] = value + end + end + serialize + end + + # Count matches of an array with smarts features + def self.smarts_count compounds, smarts + # TODO: non-overlapping matches? 
+ smarts_match compounds,smarts,true + end + + # Calculate physchem descriptors + # @param [OpenTox::Compound,Array,OpenTox::Dataset] input object, either a compound, an array of compounds or a dataset + def self.physchem compounds, descriptors=UNIQUEDESCRIPTORS + parse compounds + @data_entries = Array.new(@compounds.size){[]} + @descriptors = descriptors + @smarts = nil + @physchem_descriptors = [] # CDK may return more than one result per descriptor, they are stored as separate features + des = {} + @descriptors.each do |d| + lib, descriptor = d.split(".",2) + lib = lib.downcase.to_sym + des[lib] ||= [] + des[lib] << descriptor + end + des.each do |lib,descriptors| + p lib, descriptors + send(lib, descriptors) + end + serialize + end + + def self.openbabel descriptors + $logger.debug "compute #{descriptors.size} openbabel descriptors for #{@compounds.size} compounds" + obdescriptors = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d} + obmol = OpenBabel::OBMol.new + obconversion = OpenBabel::OBConversion.new + obconversion.set_in_format 'smi' + last_feature_idx = @physchem_descriptors.size + @compounds.each_with_index do |compound,c| + obconversion.read_string obmol, compound.smiles + obdescriptors.each_with_index do |descriptor,d| + @data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol)) + end + end + @physchem_descriptors += descriptors.collect{|d| "Openbabel.#{d}"} + end + + def self.java_descriptors descriptors, lib + $logger.debug "compute #{descriptors.size} cdk descriptors for #{@compounds.size} compounds" + sdf = sdf_3d + # use java system call (rjb blocks within tasks) + # use Tempfiles to avoid "Argument list too long" error + case lib + when "cdk" + run_cmd "java -classpath #{CDK_JAR}:#{JAVA_DIR} CdkDescriptors #{sdf} #{descriptors.join(" ")}" + when "joelib" + run_cmd "java -classpath #{JOELIB_JAR}:#{JMOL_JAR}:#{LOG4J_JAR}:#{JAVA_DIR} JoelibDescriptors #{sdf} #{descriptors.join(' ')}" + end + last_feature_idx = 
@physchem_descriptors.size + YAML.load_file("#{sdf}#{lib}.yaml").each_with_index do |calculation,i| + # TODO create warnings + #$logger.error "Descriptor calculation failed for compound #{@compounds[i].inchi}." if calculation.empty? + # CDK Descriptors may calculate multiple values, they are stored in separate features + @physchem_descriptors += calculation.keys if i == 0 + calculation.keys.each_with_index do |name,j| + @data_entries[i][j+last_feature_idx] = fix_value(calculation[name]) + end + end + FileUtils.rm "#{sdf}#{lib}.yaml" + end + + def self.cdk descriptors + java_descriptors descriptors, "cdk" + end + + def self.joelib descriptors + java_descriptors descriptors, "joelib" + end + + def self.lookup compounds, features, dataset + parse compounds + fingerprint = [] + compounds.each do |compound| + fingerprint << [] + features.each do |feature| + end + end + end + + def self.run_cmd cmd + cmd = "#{cmd} 2>&1" + $logger.debug "running external cmd: '#{cmd}'" + p = IO.popen(cmd) do |io| + while line = io.gets + $logger.debug "> #{line.chomp}" + end + io.close + raise "external cmd failed '#{cmd}' (see log file for error msg)" unless $?.to_i == 0 + end + end + + def self.sdf_3d + # TODO check if 3d sdfs are stored in GridFS + sdf = "" + @compounds.each do |compound| + sdf << compound.sdf + end + sdf_file = "/tmp/#{SecureRandom.uuid}.sdf" + File.open(sdf_file,"w+"){|f| f.print sdf} + sdf_file + end + + def self.parse compounds + @input_class = compounds.class.to_s + case @input_class + when "OpenTox::Compound" + @compounds = [compounds] + when "Array" + @compounds = compounds + when "OpenTox::Dataset" + @compounds = compounds.compounds + else + bad_request_error "Cannot calculate descriptors for #{compounds.class} objects." 
+ end + end + + def self.serialize + @data_entries.collect!{|de| de.collect{|v| v.round(5) unless v.nil?}} + case @input_class + when "OpenTox::Compound" + @data_entries.first + when "Array" + @data_entries + when "OpenTox::Dataset" + dataset = OpenTox::DescriptorDataset.new(:compound_ids => @compounds.collect{|c| c.id}) + if @smarts + dataset.feature_ids = @smarts.collect{|smart| Smarts.find_or_create_by(:smarts => smart).id} + @count ? algo = "count" : algo = "match" + dataset.feature_calculation_algorithm = "#{self}.smarts_#{algo}" + + elsif @physchem_descriptors + dataset.feature_ids = @physchem_descriptors.collect{|d| PhysChemDescriptor.find_or_create_by(:name => d, :creator => __FILE__).id} + dataset.data_entries = @data_entries + dataset.feature_calculation_algorithm = "#{self}.physchem" + #TODO params? + end + dataset.save_all + dataset + end + end + + def self.fix_value val + val = val.first if val.is_a? Array and val.size == 1 + val = nil if val == "NaN" + if val.numeric? + val = Float(val) + val = nil if val.nan? or val.infinite? 
+ end + val + end + private_class_method :sdf_3d, :fix_value, :parse, :run_cmd, :serialize + end + end +end diff --git a/lib/error.rb b/lib/error.rb new file mode 100644 index 0000000..39b3c76 --- /dev/null +++ b/lib/error.rb @@ -0,0 +1,66 @@ +module OpenToxError + attr_accessor :http_code, :message, :cause + def initialize message=nil + message = message.to_s.gsub(/\A"|"\Z/, '') if message # remove quotes + super message + @http_code ||= 500 + @message = message.to_s + @cause = cut_backtrace(caller) + $logger.error("\n"+JSON.pretty_generate({ + :http_code => @http_code, + :message => @message, + :cause => @cause + })) + end + + def cut_backtrace(trace) + if trace.is_a?(Array) + cut_index = trace.find_index{|line| line.match(/sinatra|minitest/)} + cut_index ||= trace.size + cut_index -= 1 + cut_index = trace.size-1 if cut_index < 0 + trace[0..cut_index] + else + trace + end + end + +end + +class RuntimeError + include OpenToxError +end + +# clutters log file with library errors +#class NoMethodError + #include OpenToxError +#end + +module OpenTox + + class Error < RuntimeError + include OpenToxError + + def initialize(code, message=nil) + @http_code = code + super message + end + end + + # OpenTox errors + RestClientWrapper.known_errors.each do |error| + # create error classes + c = Class.new Error do + define_method :initialize do |message=nil| + super error[:code], message + end + end + OpenTox.const_set error[:class],c + + # define global methods for raising errors, eg. 
bad_request_error + Object.send(:define_method, error[:method]) do |message| + raise c.new(message) + end + end + +end diff --git a/lib/experiment.rb b/lib/experiment.rb new file mode 100644 index 0000000..0dfdf86 --- /dev/null +++ b/lib/experiment.rb @@ -0,0 +1,99 @@ +module OpenTox + + class Experiment + field :dataset_ids, type: Array + field :model_settings, type: Array, default: [] + field :results, type: Hash, default: {} + + def run + dataset_ids.each do |dataset_id| + dataset = Dataset.find(dataset_id) + results[dataset_id.to_s] = [] + model_settings.each do |setting| + setting = setting.dup + model_algorithm = setting.delete :model_algorithm #if setting[:model_algorithm] + model = Object.const_get(model_algorithm).create dataset, setting + $logger.debug model + model.save + repeated_crossvalidation = RepeatedCrossValidation.create model + results[dataset_id.to_s] << {:model_id => model.id, :repeated_crossvalidation_id => repeated_crossvalidation.id} + end + end + save + end + + def report + # statistical significances http://www.r-bloggers.com/anova-and-tukeys-test-on-r/ + report = {} + report[:name] = name + report[:experiment_id] = self.id.to_s + report[:results] = {} + parameters = [] + dataset_ids.each do |dataset_id| + dataset_name = Dataset.find(dataset_id).name + report[:results][dataset_name] = {} + report[:results][dataset_name][:anova] = {} + report[:results][dataset_name][:data] = [] + # TODO results[dataset_id.to_s] does not exist + results[dataset_id.to_s].each do |result| + model = Model::Lazar.find(result[:model_id]) + repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id]) + crossvalidations = repeated_cv.crossvalidations + if crossvalidations.first.is_a? ClassificationCrossValidation + parameters = [:accuracy,:true_rate,:predictivity] + elsif crossvalidations.first.is_a? 
RegressionCrossValidation + parameters = [:rmse,:mae,:r_squared] + end + summary = {} + [:neighbor_algorithm, :neighbor_algorithm_parameters, :prediction_algorithm].each do |key| + summary[key] = model[key] + end + summary[:nr_instances] = crossvalidations.first.nr_instances + summary[:nr_unpredicted] = crossvalidations.collect{|cv| cv.nr_unpredicted} + summary[:time] = crossvalidations.collect{|cv| cv.time} + parameters.each do |param| + summary[param] = crossvalidations.collect{|cv| cv.send(param)} + end + report[:results][dataset_name][:data] << summary + end + end + report[:results].each do |dataset,results| + ([:time,:nr_unpredicted]+parameters).each do |param| + experiments = [] + outcome = [] + results[:data].each_with_index do |result,i| + result[param].each do |p| + experiments << i + p = nil if p.kind_of? Float and p.infinite? # TODO fix @ division by 0 + outcome << p + end + end + begin + R.assign "experiment_nr",experiments.collect{|i| "Experiment #{i}"} + R.eval "experiment_nr = factor(experiment_nr)" + R.assign "outcome", outcome + R.eval "data = data.frame(experiment_nr,outcome)" + # one-way ANOVA + R.eval "fit = aov(outcome ~ experiment_nr, data=data,na.action='na.omit')" + # http://stackoverflow.com/questions/3366506/extract-p-value-from-aov + p_value = R.eval("summary(fit)[[1]][['Pr(>F)']][[1]]").to_ruby + # aequivalent + # sum = R.eval("summary(fit)") + #p_value = sum.to_ruby.first.last.first + rescue + p_value = nil + end + report[:results][dataset][:anova][param] = p_value +=begin +=end + end + end + report + end + + def summary + report[:results].collect{|dataset,data| {dataset => data[:anova].select{|param,p_val| p_val < 0.1}}} + end + end + +end diff --git a/lib/feature.rb b/lib/feature.rb new file mode 100644 index 0000000..13fa6d1 --- /dev/null +++ b/lib/feature.rb @@ -0,0 +1,95 @@ +module OpenTox + + # Basic feature class + class Feature + field :nominal, type: Boolean + field :numeric, type: Boolean + field :measured, type: Boolean + end 
+ + # Feature for categorical variables + class NominalFeature < Feature + # TODO check if accept_values are still needed + field :accept_values, type: Array + def initialize params + super params + nominal = true + end + end + + # Feature for quantitative variables + class NumericFeature < Feature + def initialize params + super params + numeric = true + end + end + + # Feature for SMARTS fragments + class Smarts < NominalFeature + field :smarts, type: String + index "smarts" => 1 + def self.from_smarts smarts + self.find_or_create_by :smarts => smarts + end + end + + # Feature for supervised fragments from Fminer algorithm + class FminerSmarts < Smarts + field :p_value, type: Float + # TODO check if effect is used + field :effect, type: String + field :dataset_id + end + + # Feature for database fingerprints + # needs count for efficient retrieval (see compound.rb) + class FingerprintSmarts < Smarts + field :count, type: Integer + def self.fingerprint +=begin + @@fp4 ||= OpenTox::FingerprintSmarts.all + unless @@fp4.size == 306 + @@fp4 = [] + # OpenBabel FP4 fingerprints + # OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html + # TODO investigate other types of fingerprints (MACCS) + # OpenBabel http://open-babel.readthedocs.org/en/latest/Fingerprints/intro.html + # http://www.dalkescientific.com/writings/diary/archive/2008/06/26/fingerprint_background.html + # OpenBabel MNA http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html#multilevel-neighborhoods-of-atoms-mna + # Morgan ECFP, FCFP + # http://cdk.github.io/cdk/1.5/docs/api/org/openscience/cdk/fingerprint/CircularFingerprinter.html + # http://www.rdkit.org/docs/GettingStartedInPython.html + # Chemfp + # https://chemfp.readthedocs.org/en/latest/using-tools.html + # CACTVS/PubChem + + File.open(File.join(File.dirname(__FILE__),"SMARTS_InteLigand.txt")).each do |l| + l.strip! + unless l.empty? 
or l.match /^#/ + name,smarts = l.split(': ') + @@fp4 << OpenTox::FingerprintSmarts.find_or_create_by(:name => name, :smarts => smarts) unless smarts.nil? + end + end + end + @@fp4 +=end + end + end + + # Feature for physico-chemical descriptors + class PhysChemDescriptor < NumericFeature + field :algorithm, type: String, default: "OpenTox::Algorithm::Descriptor.physchem" + field :parameters, type: Hash + field :creator, type: String + end + + # Feature for categorical bioassay results + class NominalBioAssay < NominalFeature + end + + # Feature for quantitative bioassay results + class NumericBioAssay < NumericFeature + end + +end diff --git a/lib/lazar.rb b/lib/lazar.rb new file mode 100644 index 0000000..f801062 --- /dev/null +++ b/lib/lazar.rb @@ -0,0 +1,72 @@ +require 'rubygems' +require "bundler/setup" +require "rest-client" +require 'yaml' +require 'json' +require 'logger' +require 'mongoid' +require 'rserve' +require "nokogiri" +require "base64" + +# Mongo setup +# TODO retrieve correct environment from Rack/Sinatra +ENV["MONGOID_ENV"] ||= "development" +# TODO remove config files, change default via ENV or directly in Mongoid class +Mongoid.load!("#{File.expand_path(File.join(File.dirname(__FILE__),'..','mongoid.yml'))}") +Mongoid.raise_not_found_error = false # return nil if no document is found +$mongo = Mongo::Client.new('mongodb://127.0.0.1:27017/opentox') +#$mongo = Mongoid.default_client +$gridfs = $mongo.database.fs + +# R setup +R = Rserve::Connection.new +R.eval "library(ggplot2)" +R.eval "library(grid)" +R.eval "library(gridExtra)" + +# Logger setup +STDOUT.sync = true # for redirection, etc see http://stackoverflow.com/questions/8549443/why-doesnt-logger-output-to-stdout-get-redirected-to-files +$logger = Logger.new STDOUT # STDERR did not work on my development machine (CH) +$logger.level = Logger::DEBUG +Mongo::Logger.level = Logger::WARN +#Mongo::Logger.logger = $logger + +# Require sub-Repositories +require_relative 
'../libfminer/libbbrc/bbrc' # include before openbabel +require_relative '../libfminer/liblast/last' # +require_relative '../last-utils/lu.rb' +require_relative '../openbabel/lib/openbabel' + +# Fminer environment variables +ENV['FMINER_SMARTS'] = 'true' +ENV['FMINER_NO_AROMATIC'] = 'true' +ENV['FMINER_PVALUES'] = 'true' +ENV['FMINER_SILENT'] = 'true' +ENV['FMINER_NR_HITS'] = 'true' + +# OpenTox classes and includes +CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules + +[ # be aware of the require sequence as it affects class/method overwrites + "overwrite.rb", + "rest-client-wrapper.rb", + "error.rb", + "opentox.rb", + "feature.rb", + "compound.rb", + "dataset.rb", + "descriptor.rb", + "algorithm.rb", + "descriptor.rb", + "bbrc.rb", + "model.rb", + "similarity.rb", + #"neighbor.rb", + "classification.rb", + "regression.rb", + "validation.rb", + "crossvalidation.rb", + "experiment.rb", +].each{ |f| require_relative f } + diff --git a/lib/model.rb b/lib/model.rb new file mode 100644 index 0000000..98433d0 --- /dev/null +++ b/lib/model.rb @@ -0,0 +1,265 @@ +module OpenTox + + module Model + + class Model + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + store_in collection: "models" + + field :name, type: String + field :creator, type: String, default: __FILE__ + # datasets + field :training_dataset_id, type: BSON::ObjectId + # algorithms + field :prediction_algorithm, type: String + # prediction feature + field :prediction_feature_id, type: BSON::ObjectId + + def training_dataset + Dataset.find(training_dataset_id) + end + end + + class Lazar < Model + + # algorithms + field :neighbor_algorithm, type: String + field :neighbor_algorithm_parameters, type: Hash, default: {} + + # Create a lazar model from a training_dataset and a feature_dataset + # @param [OpenTox::Dataset] training_dataset + # @return [OpenTox::Model::Lazar] Regression or 
classification model + def initialize training_dataset, params={} + + super params + bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 + + # TODO document convention + prediction_feature = training_dataset.features.first + # set defaults for empty parameters + self.prediction_feature_id ||= prediction_feature.id + self.training_dataset_id ||= training_dataset.id + self.name ||= "#{training_dataset.name} #{prediction_feature.name}" + self.neighbor_algorithm_parameters ||= {} + self.neighbor_algorithm_parameters[:training_dataset_id] = training_dataset.id + save + self + end + + def predict object, use_database_values=true + + t = Time.now + at = Time.now + + training_dataset = Dataset.find training_dataset_id + prediction_feature = Feature.find prediction_feature_id + + # parse data + compounds = [] + case object.class.to_s + when "OpenTox::Compound" + compounds = [object] + when "Array" + compounds = object + when "OpenTox::Dataset" + compounds = object.compounds + else + bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter." + end + + # make predictions + predictions = [] + neighbors = [] + compounds.each_with_index do |compound,c| + t = Time.new + database_activities = training_dataset.values(compound,prediction_feature) + if use_database_values and database_activities and !database_activities.empty? + database_activities = database_activities.first if database_activities.size == 1 + predictions << {:compound => compound, :value => database_activities, :confidence => "measured", :warning => "Compound #{compound.smiles} occurs in training dataset with activity '#{database_activities}'."} + next + end + neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters) + + # add activities + # TODO: improve efficiency, takes 3 times longer than previous version + neighbors.collect! 
do |n| + rows = training_dataset.compound_ids.each_index.select{|i| training_dataset.compound_ids[i] == n.first} + acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact + acts.empty? ? nil : n << acts + end + neighbors.compact! # remove neighbors without training activities + predictions << Algorithm.run(prediction_algorithm, compound, {:neighbors => neighbors,:training_dataset_size => training_dataset.data_entries.size}) +=begin +# TODO scaled dataset for physchem + p neighbor_algorithm_parameters + p (neighbor_algorithm_parameters["feature_dataset_id"]) + d = Dataset.find(neighbor_algorithm_parameters["feature_dataset_id"]) + p d + p d.class + if neighbor_algorithm_parameters["feature_dataset_id"] and Dataset.find(neighbor_algorithm_parameters["feature_dataset_id"]).kind_of? ScaledDataset + p "SCALED" + end +=end + end + + # serialize result + case object.class.to_s + when "OpenTox::Compound" + prediction = predictions.first + prediction[:neighbors] = neighbors.sort{|a,b| b[1] <=> a[1]} # sort according to similarity + return prediction + when "Array" + return predictions + when "OpenTox::Dataset" + # prepare prediction dataset + prediction_dataset = LazarPrediction.new( + :name => "Lazar prediction for #{prediction_feature.name}", + :creator => __FILE__, + :prediction_feature_id => prediction_feature.id + + ) + confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Prediction confidence" ) + # TODO move into warnings field + warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings") + prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ] + prediction_dataset.compounds = compounds + prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence], p[:warning]]} + prediction_dataset.save_all + return prediction_dataset + end + + end + + def training_activities + i = training_dataset.feature_ids.index prediction_feature_id + 
training_dataset.data_entries.collect{|de| de[i]} + end + + end + + class LazarClassification < Lazar + + def self.create training_dataset, params={} + model = self.new training_dataset, params + model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm + model.neighbor_algorithm ||= "fingerprint_neighbors" + model.neighbor_algorithm_parameters ||= {} + { + :type => "MP2D", + :training_dataset_id => training_dataset.id, + :min_sim => 0.1 + }.each do |key,value| + model.neighbor_algorithm_parameters[key] ||= value + end + model.save + model + end + end + + class LazarRegression < Lazar + + def self.create training_dataset, params={} + model = self.new training_dataset, params + model.neighbor_algorithm ||= "fingerprint_neighbors" + model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.weighted_average" + model.neighbor_algorithm_parameters ||= {} + { + :type => "MP2D", + :training_dataset_id => training_dataset.id, + :min_sim => 0.1 + #:type => "FP4", + #:training_dataset_id => training_dataset.id, + #:min_sim => 0.7 + }.each do |key,value| + model.neighbor_algorithm_parameters[key] ||= value + end + model.save + model + end + end + + class LazarFminerClassification < LazarClassification + field :feature_calculation_parameters, type: Hash + + def self.create training_dataset, fminer_params={} + model = super(training_dataset) + model.update "_type" => self.to_s # adjust class + model = self.find model.id # adjust class + model.neighbor_algorithm = "fminer_neighbors" + model.neighbor_algorithm_parameters = { + :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match", + :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset,fminer_params).id, + :min_sim => 0.3 + } + model.feature_calculation_parameters = fminer_params + model.save + model + end + end + + class Prediction + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + + # TODO cv -> 
repeated cv + # TODO field Validations + field :endpoint, type: String + field :species, type: String + field :source, type: String + field :unit, type: String + field :model_id, type: BSON::ObjectId + field :repeated_crossvalidation_id, type: BSON::ObjectId + + def predict object + Lazar.find(model_id).predict object + end + + def training_dataset + model.training_dataset + end + + def model + Lazar.find model_id + end + + def repeated_crossvalidation + RepeatedCrossValidation.find repeated_crossvalidation_id + end + + def crossvalidations + repeated_crossvalidation.crossvalidations + end + + def regression? + training_dataset.features.first.numeric? + end + + def classification? + training_dataset.features.first.nominal? + end + + def self.from_csv_file file + metadata_file = file.sub(/csv$/,"json") + bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file + prediction_model = self.new JSON.parse(File.read(metadata_file)) + training_dataset = Dataset.from_csv_file file + model = nil + if training_dataset.features.first.nominal? + #model = LazarFminerClassification.create training_dataset + model = LazarClassification.create training_dataset + elsif training_dataset.features.first.numeric? 
+ model = LazarRegression.create training_dataset + end + prediction_model[:model_id] = model.id + prediction_model[:repeated_crossvalidation_id] = RepeatedCrossValidation.create(model).id + prediction_model.save + prediction_model + end + end + + end + +end diff --git a/lib/opentox.rb b/lib/opentox.rb new file mode 100644 index 0000000..186c87a --- /dev/null +++ b/lib/opentox.rb @@ -0,0 +1,22 @@ +module OpenTox + + # Ruby interface + + # create default OpenTox classes (defined in opentox-client.rb) + # provides Mongoid's query and persistence methods + # http://mongoid.org/en/mongoid/docs/persistence.html + # http://mongoid.org/en/mongoid/docs/querying.html + CLASSES.each do |klass| + c = Class.new do + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + store_in collection: klass.downcase.pluralize + field :name, type: String + field :warnings, type: Array, default: [] + end + OpenTox.const_set klass,c + end + +end + diff --git a/lib/overwrite.rb b/lib/overwrite.rb new file mode 100644 index 0000000..c92ad2b --- /dev/null +++ b/lib/overwrite.rb @@ -0,0 +1,148 @@ +require "base64" +class Object + # An object is blank if it's false, empty, or a whitespace string. + # For example, "", " ", +nil+, [], and {} are all blank. + def blank? + respond_to?(:empty?) ? empty? : !self + end + + def numeric? + true if Float(self) rescue false + end + + # Returns dimension of nested arrays + def dimension + self.class == Array ? 
1 + self[0].dimension : 0 + end +end + +class Numeric + def percent_of(n) + self.to_f / n.to_f * 100.0 + end +end + +module Enumerable + # @return [Array] only the duplicates of an enumerable + def duplicates + inject({}) {|h,v| h[v]=h[v].to_i+1; h}.reject{|k,v| v==1}.keys + end + # http://stackoverflow.com/questions/2562256/find-most-common-string-in-an-array + Enumerable.class_eval do + def mode + group_by do |e| + e + end.values.max_by(&:size).first + end + end +end + +class String + # @return [String] converts camel-case to underscore-case (OpenTox::SuperModel -> open_tox/super_model) + def underscore + self.gsub(/::/, '/'). + gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2'). + gsub(/([a-z\d])([A-Z])/,'\1_\2'). + tr("-", "_"). + downcase + end + + # convert strings to boolean values + # @return [TrueClass,FalseClass] true or false + def to_boolean + return true if self == true || self =~ (/(true|t|yes|y|1)$/i) + return false if self == false || self.nil? || self =~ (/(false|f|no|n|0)$/i) + bad_request_error "invalid value for Boolean: \"#{self}\"" + end + +end + +class File + # @return [String] mime_type including charset using linux cmd command + def mime_type + `file -ib '#{self.path}'`.chomp + end +end + +class Array + + # Sum up the size of single arrays in an array of arrays + # @param [Array] Array of arrays + # @return [Integer] Sum of size of array elements + def sum_size + self.inject(0) { |s,a| + if a.respond_to?('size') + s+=a.size + else + internal_server_error "No size available: #{a.inspect}" + end + } + end + + # For symbolic features + # @param [Array] Array to test. + # @return [Boolean] Whether the array has just one unique value. + def zero_variance? 
+ return self.uniq.size == 1 + end + + def median + sorted = self.sort + len = sorted.length + (sorted[(len - 1) / 2] + sorted[len / 2]) / 2.0 + end + + def mean + self.inject{ |sum, el| sum + el }.to_f / self.size + end + + def sample_variance + m = self.mean + sum = self.inject(0){|accum, i| accum +(i-m)**2 } + sum/(self.length - 1).to_f + end + + def standard_deviation + Math.sqrt(self.sample_variance) + end + +end + +module URI + + def self.ssl? uri + URI.parse(uri).instance_of? URI::HTTPS + end + + # @return [Boolean] checks if resource exists by making a HEAD-request + def self.accessible?(uri) + parsed_uri = URI.parse(uri + (OpenTox::RestClientWrapper.subjectid ? "?subjectid=#{CGI.escape OpenTox::RestClientWrapper.subjectid}" : "")) + http_code = URI.task?(uri) ? 600 : 400 + http = Net::HTTP.new(parsed_uri.host, parsed_uri.port) + unless (URI.ssl? uri) == true + http = Net::HTTP.new(parsed_uri.host, parsed_uri.port) + request = Net::HTTP::Head.new(parsed_uri.request_uri) + http.request(request).code.to_i < http_code + else + http = Net::HTTP.new(parsed_uri.host, parsed_uri.port) + http.use_ssl = true + http.verify_mode = OpenSSL::SSL::VERIFY_NONE + request = Net::HTTP::Head.new(parsed_uri.request_uri) + http.request(request).code.to_i < http_code + end + rescue + false + end + + def self.valid? uri + u = URI.parse(uri) + u.scheme!=nil and u.host!=nil + rescue URI::InvalidURIError + false + end + + def self.task? uri + uri =~ /task/ and URI.valid? 
uri + end + +end diff --git a/lib/regression.rb b/lib/regression.rb new file mode 100644 index 0000000..868c25f --- /dev/null +++ b/lib/regression.rb @@ -0,0 +1,262 @@ +# TODO install R packages kernlab, caret, doMC, class, e1071 + + + # log transform activities (create new dataset) + # scale, normalize features, might not be necessary + # http://stats.stackexchange.com/questions/19216/variables-are-often-adjusted-e-g-standardised-before-making-a-model-when-is + # http://stats.stackexchange.com/questions/7112/when-and-how-to-use-standardized-explanatory-variables-in-linear-regression + # zero-order correlation and the semi-partial correlation + # seems to be necessary for svm + # http://stats.stackexchange.com/questions/77876/why-would-scaling-features-decrease-svm-performance?lq=1 + # http://stackoverflow.com/questions/15436367/svm-scaling-input-values + # use lasso or elastic net?? + # select relevant features + # remove features with a single value + # remove correlated features + # remove features not correlated with endpoint +module OpenTox + module Algorithm + + class Regression + + def self.weighted_average compound, params + weighted_sum = 0.0 + sim_sum = 0.0 + confidence = 0.0 + neighbors = params[:neighbors] + activities = [] + neighbors.each do |row| + n,sim,acts = row + confidence = sim if sim > confidence # distance to nearest neighbor + # TODO add LOO errors + acts.each do |act| + weighted_sum += sim*Math.log10(act) + activities << act + sim_sum += sim + end + end + #R.assign "activities", activities + #R.eval "cv = cv(activities)" + #confidence /= activities.standard_deviation#/activities.mean + #confidence = sim_sum*neighbors.size.to_f/params[:training_dataset_size] + #confidence = sim_sum/neighbors.size.to_f + #confidence = neighbors.size.to_f + confidence = 0 if confidence.nan? + sim_sum == 0 ? 
prediction = nil : prediction = 10**(weighted_sum/sim_sum) + {:value => prediction,:confidence => confidence} + end + + def self.local_linear_regression compound, neighbors + p neighbors.size + return nil unless neighbors.size > 0 + features = neighbors.collect{|n| Compound.find(n.first).fp4}.flatten.uniq + p features + training_data = Array.new(neighbors.size){Array.new(features.size,0)} + neighbors.each_with_index do |n,i| + #p n.first + neighbor = Compound.find n.first + features.each_with_index do |f,j| + training_data[i][j] = 1 if neighbor.fp4.include? f + end + end + p training_data + + R.assign "activities", neighbors.collect{|n| n[2].median} + R.assign "features", training_data + R.eval "model <- lm(activities ~ features)" + R.eval "summary <- summary(model)" + p R.summary + compound_features = features.collect{|f| compound.fp4.include? f ? 1 : 0} + R.assign "compound_features", compound_features + R.eval "prediction <- predict(model,compound_features)" + p R.prediction + + end + + def self.weighted_average_with_relevant_fingerprints neighbors + weighted_sum = 0.0 + sim_sum = 0.0 + fingerprint_features = [] + neighbors.each do |row| + n,sim,acts = row + neighbor = Compound.find n + fingerprint_features += neighbor.fp4 + end + fingerprint_features.uniq! + p fingerprint_features +=begin + p n + acts.each do |act| + weighted_sum += sim*Math.log10(act) + sim_sum += sim + end + end +=end + confidence = sim_sum/neighbors.size.to_f + sim_sum == 0 ? prediction = nil : prediction = 10**(weighted_sum/sim_sum) + {:value => prediction,:confidence => confidence} + end + + # Local support vector regression from neighbors + # @param [Hash] params Keys `:props, :activities, :sims, :min_train_performance` are required + # @return [Numeric] A prediction value. + def self.local_svm_regression neighbors, params={:min_train_performance => 0.1} + + confidence = 0.0 + prediction = nil + + $logger.debug "Local SVM." 
+ props = neighbors.collect{|row| row[3] } + neighbors.shift + activities = neighbors.collect{|n| n[2]} + prediction = self.local_svm_prop( props, activities, params[:min_train_performance]) # params[:props].nil? signals non-prop setting + prediction = nil if (!prediction.nil? && prediction.infinite?) + $logger.debug "Prediction: '#{prediction}' ('#{prediction.class}')." + if prediction + confidence = get_confidence({:sims => neighbors.collect{|n| n[1]}, :activities => activities}) + else + confidence = nil if prediction.nil? + end + [prediction, confidence] + + end + + + # Local support vector prediction from neighbors. + # Uses propositionalized setting. + # Not to be called directly (use local_svm_regression or local_svm_classification). + # @param [Array] props, propositionalization of neighbors and query structure e.g. [ Array_for_q, two-nested-Arrays_for_n ] + # @param [Array] activities, activities for neighbors. + # @param [Float] min_train_performance, parameter to control censoring + # @return [Numeric] A prediction value. + def self.local_svm_prop(props, activities, min_train_performance) + + $logger.debug "Local SVM (Propositionalization / Kernlab Kernel)." + n_prop = props[1..-1] # is a matrix, i.e. two nested Arrays. + q_prop = props[0] # is an Array. 
+ + prediction = nil + if activities.uniq.size == 1 + prediction = activities[0] + else + t = Time.now + #$logger.debug gram_matrix.to_yaml + #@r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests + @r = Rserve::Connection.new#(true,false) # global R instance leads to Socket errors after a large number of requests + rs = [] + ["caret", "doMC", "class"].each do |lib| + #raise "failed to load R-package #{lib}" unless @r.void_eval "suppressPackageStartupMessages(library('#{lib}'))" + rs << "suppressPackageStartupMessages(library('#{lib}'))" + end + #@r.eval "registerDoMC()" # switch on parallel processing + rs << "registerDoMC()" # switch on parallel processing + #@r.eval "set.seed(1)" + rs << "set.seed(1)" + $logger.debug "Loading R packages: #{Time.now-t}" + t = Time.now + p n_prop + begin + + # set data + rs << "n_prop <- c(#{n_prop.flatten.join(',')})" + rs << "n_prop <- c(#{n_prop.flatten.join(',')})" + rs << "n_prop_x_size <- c(#{n_prop.size})" + rs << "n_prop_y_size <- c(#{n_prop[0].size})" + rs << "y <- c(#{activities.join(',')})" + rs << "q_prop <- c(#{q_prop.join(',')})" + rs << "y = matrix(y)" + rs << "prop_matrix = matrix(n_prop, n_prop_x_size, n_prop_y_size, byrow=T)" + rs << "q_prop = matrix(q_prop, 1, n_prop_y_size, byrow=T)" + + $logger.debug "Setting R data: #{Time.now-t}" + t = Time.now + # prepare data + rs << " + weights=NULL + if (!(class(y) == 'numeric')) { + y = factor(y) + weights=unlist(as.list(prop.table(table(y)))) + weights=(weights-1)^2 + } + " + + rs << " + rem = nearZeroVar(prop_matrix) + if (length(rem) > 0) { + prop_matrix = prop_matrix[,-rem,drop=F] + q_prop = q_prop[,-rem,drop=F] + } + rem = findCorrelation(cor(prop_matrix)) + if (length(rem) > 0) { + prop_matrix = prop_matrix[,-rem,drop=F] + q_prop = q_prop[,-rem,drop=F] + } + " + + #p @r.eval("y").to_ruby + #p "weights" + #p @r.eval("weights").to_ruby + $logger.debug "Preparing R data: #{Time.now-t}" + t = Time.now + # model + 
support vectors
        # NOTE(review): R commands are accumulated in `rs` and executed via an
        # external `Rscript` call below; the commented-out @r.eval/void_eval
        # lines are remains of the previous in-process Rserve approach.
        #train_success = @r.eval <<-EOR
        rs << '
          model = train(prop_matrix,y,
                        method="svmRadial",
                        preProcess=c("center", "scale"),
                        class.weights=weights,
                        trControl=trainControl(method="LGOCV",number=10),
                        tuneLength=8
                       )
          perf = ifelse ( class(y)!="numeric", max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
        '
        # Debug aids: dump the accumulated R script to /tmp and run it out of process.
        File.open("/tmp/r.r","w+"){|f| f.puts rs.join("\n")}
        p rs.join("\n")
        p `Rscript /tmp/r.r`
=begin
        @r.void_eval <<-EOR
          model = train(prop_matrix,y,
                        method="svmRadial",
                        #preProcess=c("center", "scale"),
                        #class.weights=weights,
                        #trControl=trainControl(method="LGOCV",number=10),
                        #tuneLength=8
                       )
          perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
        EOR
=end

        $logger.debug "Creating R SVM model: #{Time.now-t}"
        t = Time.now
        # FIXME(review): `train_success` is never assigned in this version (its
        # assignment above is commented out), so this line raises NameError,
        # which is swallowed by the `rescue Exception` clause below — the
        # prediction therefore always stays nil on this path.
        if train_success
          # prediction
          @r.eval "predict(model,q_prop); p = predict(model,q_prop)" # kernlab bug: predict twice
          #@r.eval "p = predict(model,q_prop)" # kernlab bug: predict twice
          @r.eval "if (class(y)!='numeric') p = as.character(p)"
          prediction = @r.p

          # censoring: discard the prediction when training performance is NaN
          # or below the caller-supplied min_train_performance threshold
          prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance.to_f )
          prediction = nil if prediction =~ /NA/
          $logger.debug "Performance: '#{sprintf("%.2f", @r.perf)}'"
        else
          $logger.debug "Model creation failed."
+ prediction = nil + end + $logger.debug "R Prediction: #{Time.now-t}" + rescue Exception => e + $logger.debug "#{e.class}: #{e.message}" + $logger.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + ensure + #puts @r.inspect + #TODO: broken pipe + #@r.quit # free R + end + end + prediction + end + end + + end +end + diff --git a/lib/rest-client-wrapper.rb b/lib/rest-client-wrapper.rb new file mode 100644 index 0000000..de1b74f --- /dev/null +++ b/lib/rest-client-wrapper.rb @@ -0,0 +1,98 @@ +module OpenTox + + class RestClientWrapper + + attr_accessor :request, :response + + @@subjectid = nil + + def self.subjectid=(subjectid) + @@subjectid = subjectid + end + + def self.subjectid + @@subjectid + end + + # REST methods + # Raises OpenTox::Error if call fails (rescued in overwrite.rb -> halt 502) + # Does not wait for task to finish and returns task uri + # @param [String] destination URI + # @param [optional,Hash|String] Payload data posted to the service + # @param [optional,Hash] Headers with params like :accept, :content_type, :subjectid, :verify_ssl + # @return [RestClient::Response] REST call response + [:head,:get,:post,:put,:delete].each do |method| + + define_singleton_method method do |uri,payload={},headers={},waiting_task=nil| + + # check input + bad_request_error "Headers are not a hash: #{headers.inspect}", uri unless headers==nil or headers.is_a?(Hash) + headers[:subjectid] ||= @@subjectid + bad_request_error "Invalid URI: '#{uri}'", uri unless URI.valid? uri + #resource_not_found_error "URI '#{uri}' not found.", uri unless URI.accessible?(uri, @subjectid) unless URI.ssl?(uri) + # make sure that no header parameters are set in the payload + [:accept,:content_type,:subjectid].each do |header| + if defined? 
$aa || URI(uri).host == URI($aa[:uri]).host + else + bad_request_error "#{header} should be submitted in the headers", uri if payload and payload.is_a?(Hash) and payload[header] + end + end + + # create request + args={} + args[:method] = method + args[:url] = uri + args[:verify_ssl] = 0 if headers[:verify_ssl].nil? || headers[:verify_ssl].empty? + args[:timeout] = 1800 + args[:payload] = payload + headers.each{ |k,v| headers.delete(k) if v==nil } if headers #remove keys with empty values, as this can cause problems + args[:headers] = headers + + $logger.debug "post to #{uri} with params #{payload.inspect.to_s[0..1000]}" if method.to_s=="post" + + @request = RestClient::Request.new(args) + # ignore error codes from Task services (may return error codes >= 400 according to API, which causes exceptions in RestClient and RDF::Reader) + @response = @request.execute do |response, request, result| + if [301, 302, 307].include? response.code and request.method == :get + response.follow_redirection(request, result) + elsif response.code >= 400 and !URI.task?(uri) + #TODO add parameters to error-report + #parameters = request.args + #parameters[:headers][:subjectid] = "REMOVED" if parameters[:headers] and parameters[:headers][:subjectid] + #parameters[:url] = parameters[:url].gsub(/(http|https|)\:\/\/[a-zA-Z0-9\-]+\:[a-zA-Z0-9]+\@/, "REMOVED@") if parameters[:url] + #message += "\nREST parameters:\n#{parameters.inspect}" + error = known_errors.collect{|e| e if e[:code] == response.code}.compact.first + begin # errors are returned as error reports in json, try to parse + # TODO: may be the reason for failure of task.rb -n test_11_wait_for_error_task + content = JSON.parse(response) + msg = content["message"].to_s + cause = content["errorCause"].to_s + raise if msg.size==0 && cause.size==0 # parsing failed + rescue # parsing error failed, use complete content as message + msg = "Could not parse error response from rest call '#{method}' to '#{uri}':\n#{response}" + cause = nil 
          end
          # Dispatch to the matching OpenTox error helper built by known_errors
          # (e.g. bad_request_error) with the parsed message and cause.
          Object.method(error[:method]).call msg, uri, cause # call error method
        else
          response
        end
      end
    end
  end

  #@return [Array] of hashes with error code, method and class
  # Builds the table of known HTTP error statuses (code >= 400) from
  # RestClient::STATUSES; each entry maps the numeric code to the OpenTox
  # error helper method name (snake_case, "_error" suffix) and the
  # CamelCase error class name derived from the status phrase.
  def self.known_errors
    errors = []
    RestClient::STATUSES.each do |code,k|
      if code >= 400
        method = k.underscore.gsub(/ |'/,'_')
        method += "_error" unless method.match(/_error$/)
        klass = method.split("_").collect{|s| s.capitalize}.join("")
        errors << {:code => code, :method => method.to_sym, :class => klass}
      end
    end
    errors
  end

  end
end
diff --git a/lib/similarity.rb b/lib/similarity.rb
new file mode 100644
index 0000000..91e18db
--- /dev/null
+++ b/lib/similarity.rb
@@ -0,0 +1,58 @@
=begin
* Name: similarity.rb
* Description: Similarity algorithms
* Author: Andreas Maunz <andreas@maunz.de>
* Date: 10/2012
=end

module OpenTox
  module Algorithm

    class Similarity

      #TODO weighted tanimoto

      # Tanimoto similarity
      # Computed as sum(min(a_i,b_i)) / sum(max(a_i,b_i)), i.e. the
      # generalized (real-valued) Tanimoto/Jaccard coefficient, which
      # reduces to the classical Tanimoto index for 0/1 fingerprints.
      # @param [Array] a fingerprints of first compound
      # @param [Array] b fingerprints of second compound
      # @return [Float] Tanimoto similarity
      def self.tanimoto(a,b)
        bad_request_error "fingerprints #{a} and #{b} don't have equal size" unless a.size == b.size
        #common = 0.0
        #a.each_with_index do |n,i|
        #common += 1 if n == b[i]
        #end
        #common/a.size
        # TODO check if calculation speed can be improved
        common_p_sum = 0.0
        all_p_sum = 0.0
        (0...a.size).each { |idx|
          common_p_sum += [ a[idx], b[idx] ].min
          all_p_sum += [ a[idx], b[idx] ].max
        }
        common_p_sum/all_p_sum
      end


      # Cosine similarity
      # NOTE(review): when both vectors are longer than 12 elements, only the
      # first 12 components are compared — presumably a fixed descriptor
      # subset; confirm against callers before relying on full-length input.
      # @param [Array] a fingerprints of first compound
      # @param [Array] b fingerprints of second compound
      # @return [Float] Cosine similarity, the cosine of angle enclosed between vectors a and b
      def self.cosine(a, b)
        val = 0.0
        if a.size>0 and b.size>0
          if a.size>12 && b.size>12
            a = a[0..11]
            b = b[0..11]
          end
          # to_gv: conversion to a GSL vector — assumed to be provided by an
          # Array extension elsewhere in the project (TODO confirm)
          a_vec = a.to_gv
          b_vec = b.to_gv
          val = a_vec.dot(b_vec) / (a_vec.norm * b_vec.norm)
        end
        val
      end
+ end + + end +end diff --git a/lib/unique_descriptors.rb b/lib/unique_descriptors.rb new file mode 100644 index 0000000..cf9cbf3 --- /dev/null +++ b/lib/unique_descriptors.rb @@ -0,0 +1,120 @@ +# set of non redundant descriptors, faster algorithms are preferred +# TODO: +# select logP algorithm +# select l5 algorithm +# use smarts matcher for atom counts +# check correlations +UNIQUEDESCRIPTORS = [ + "Openbabel.abonds", #Number of aromatic bonds + "Openbabel.atoms", #Number of atoms + "Openbabel.bonds", #Number of bonds + "Openbabel.dbonds", #Number of double bonds + "Openbabel.HBA1", #Number of Hydrogen Bond Acceptors 1 (JoelLib) + "Openbabel.HBA2", #Number of Hydrogen Bond Acceptors 2 (JoelLib) + "Openbabel.HBD", #Number of Hydrogen Bond Donors (JoelLib) + #"Openbabel.L5", #Lipinski Rule of Five# TODO Openbabel.L5 returns nil, investigate!!! + "Openbabel.logP", #octanol/water partition coefficient + "Openbabel.MP", #Melting point + "Openbabel.MR", #molar refractivity + "Openbabel.MW", #Molecular Weight filter + "Openbabel.nF", #Number of Fluorine Atoms + "Openbabel.sbonds", #Number of single bonds + "Openbabel.tbonds", #Number of triple bonds + "Openbabel.TPSA", #topological polar surface area + "Cdk.ALOGP", #Calculates atom additive logP and molar refractivity values as described by Ghose and Crippen and + "Cdk.APol", #Descriptor that calculates the sum of the atomic polarizabilities (including implicit hydrogens). + "Cdk.AcidicGroupCount", #Returns the number of acidic groups. + "Cdk.AminoAcidCount", #Returns the number of amino acids found in the system + #"Cdk.AromaticAtomsCount", #Descriptor based on the number of aromatic atoms of a molecule. + #"Cdk.AromaticBondsCount", #Descriptor based on the number of aromatic bonds of a molecule. + #"Cdk.AtomCount", #Descriptor based on the number of atoms of a certain element type. 
+ "Cdk.AutocorrelationCharge", #The Moreau-Broto autocorrelation descriptors using partial charges + "Cdk.AutocorrelationMass", #The Moreau-Broto autocorrelation descriptors using atomic weight + "Cdk.AutocorrelationPolarizability", #The Moreau-Broto autocorrelation descriptors using polarizability + "Cdk.BCUT", #Eigenvalue based descriptor noted for its utility in chemical diversity described by Pearlman et al. . + "Cdk.BPol", #Descriptor that calculates the sum of the absolute value of the difference between atomic polarizabilities of all bonded atoms in the molecule (including implicit hydrogens). + "Cdk.BasicGroupCount", #Returns the number of basic groups. + #"Cdk.BondCount", #Descriptor based on the number of bonds of a certain bond order. + "Cdk.CPSA", #A variety of descriptors combining surface area and partial charge information + "Cdk.CarbonTypes", #Characterizes the carbon connectivity in terms of hybridization + "Cdk.ChiChain", #Evaluates the Kier & Hall Chi chain indices of orders 3,4,5 and 6 + "Cdk.ChiCluster", #Evaluates the Kier & Hall Chi cluster indices of orders 3,4,5,6 and 7 + "Cdk.ChiPathCluster", #Evaluates the Kier & Hall Chi path cluster indices of orders 4,5 and 6 + "Cdk.ChiPath", #Evaluates the Kier & Hall Chi path indices of orders 0,1,2,3,4,5,6 and 7 + "Cdk.EccentricConnectivityIndex", #A topological descriptor combining distance and adjacency information. + "Cdk.FMF", #Descriptor characterizing molecular complexity in terms of its Murcko framework + "Cdk.FragmentComplexity", #Class that returns the complexity of a system. The complexity is defined as @cdk.cite{Nilakantan06} + "Cdk.GravitationalIndex", #Descriptor characterizing the mass distribution of the molecule. + #"Cdk.HBondAcceptorCount", #Descriptor that calculates the number of hydrogen bond acceptors. + #"Cdk.HBondDonorCount", #Descriptor that calculates the number of hydrogen bond donors. 
+ "Cdk.HybridizationRatio", #Characterizes molecular complexity in terms of carbon hybridization states. + "Cdk.IPMolecularLearning", #Descriptor that evaluates the ionization potential. + "Cdk.KappaShapeIndices", #Descriptor that calculates Kier and Hall kappa molecular shape indices. + "Cdk.KierHallSmarts", #Counts the number of occurrences of the E-state fragments + "Cdk.LargestChain", #Returns the number of atoms in the largest chain + "Cdk.LargestPiSystem", #Returns the number of atoms in the largest pi chain + "Cdk.LengthOverBreadth", #Calculates the ratio of length to breadth. + "Cdk.LongestAliphaticChain", #Returns the number of atoms in the longest aliphatic chain + "Cdk.MDE", #Evaluate molecular distance edge descriptors for C, N and O + #"Cdk.MannholdLogP", #Descriptor that calculates the LogP based on a simple equation using the number of carbons and hetero atoms . + "Cdk.MomentOfInertia", #Descriptor that calculates the principal moments of inertia and ratios of the principal moments. Als calculates the radius of gyration. + "Cdk.PetitjeanNumber", #Descriptor that calculates the Petitjean Number of a molecule. + "Cdk.PetitjeanShapeIndex", #The topological and geometric shape indices described Petitjean and Bath et al. respectively. Both measure the anisotropy in a molecule. + "Cdk.RotatableBondsCount", #Descriptor that calculates the number of nonrotatable bonds on a molecule. + #"Cdk.RuleOfFive", #This Class contains a method that returns the number failures of the Lipinski's Rule Of Five. + #"Cdk.TPSA", #Calculation of topological polar surface area based on fragment contributions . + "Cdk.VABC", #Describes the volume of a molecule. + "Cdk.VAdjMa", #Descriptor that calculates the vertex adjacency information of a molecule. + "Cdk.WHIM", #Holistic descriptors described by Todeschini et al . + #"Cdk.Weight", #Descriptor based on the weight of atoms of a certain element type. 
If no element is specified, the returned value is the Molecular Weight + "Cdk.WeightedPath", #The weighted path (molecular ID) descriptors described by Randic. They characterize molecular branching. + "Cdk.WienerNumbers", #This class calculates Wiener path number and Wiener polarity number. + "Cdk.XLogP", #Prediction of logP based on the atom-type method called XLogP. + "Cdk.ZagrebIndex", #The sum of the squared atom degrees of all heavy atoms. + "Joelib.count.NumberOfS", #no description available + "Joelib.count.NumberOfP", #no description available + "Joelib.count.NumberOfO", #no description available + "Joelib.count.NumberOfN", #no description available + #"Joelib.count.AromaticBonds", #no description available + "Joelib.count.NumberOfI", #no description available + "Joelib.count.NumberOfF", #no description available + "Joelib.count.NumberOfC", #no description available + "Joelib.count.NumberOfB", #no description available + "Joelib.count.HydrophobicGroups", #no description available + #"Joelib.KierShape3", #no description available + #"Joelib.KierShape2", #no description available + #"Joelib.KierShape1", #no description available + #"Joelib.count.AcidicGroups", #no description available + "Joelib.count.AliphaticOHGroups", #no description available + #"Joelib.count.NumberOfAtoms", #no description available + "Joelib.TopologicalRadius", #no description available + "Joelib.GeometricalShapeCoefficient", #no description available + #"Joelib.MolecularWeight", #no description available + "Joelib.FractionRotatableBonds", #no description available + #"Joelib.count.HBD2", #no description available + #"Joelib.count.HBD1", #no description available + "Joelib.LogP", #no description available + "Joelib.GraphShapeCoefficient", #no description available + "Joelib.count.BasicGroups", #no description available + #"Joelib.count.RotatableBonds", #no description available + "Joelib.count.HeavyBonds", #no description available + "Joelib.PolarSurfaceArea", #no description available + 
  #"Joelib.ZagrebIndex1", #no description available
  "Joelib.GeometricalRadius", #no description available
  "Joelib.count.SO2Groups", #no description available
  "Joelib.count.AromaticOHGroups", #no description available
  "Joelib.GeometricalDiameter", #no description available
  #"Joelib.MolarRefractivity", #no description available
  "Joelib.count.NumberOfCl", #no description available
  "Joelib.count.OSOGroups", #no description available
  "Joelib.count.NumberOfBr", #no description available
  "Joelib.count.NO2Groups", #no description available
  "Joelib.count.HeteroCycles", #no description available
  #"Joelib.count.HBA2", #no description available
  #"Joelib.count.HBA1", #no description available
  #"Joelib.count.NumberOfBonds", #no description available
  "Joelib.count.SOGroups", #no description available
  "Joelib.TopologicalDiameter", #no description available
  "Joelib.count.NumberOfHal", #no description available

].sort
diff --git a/lib/validation.rb b/lib/validation.rb
new file mode 100644
index 0000000..c52ffc0
--- /dev/null
+++ b/lib/validation.rb
@@ -0,0 +1,68 @@
module OpenTox

  # Result of validating a model against an independent test dataset.
  # NOTE(review): uses Mongoid-style `field` macros — the persistence mixin is
  # presumably provided by a base class/inclusion not visible here; confirm.
  class Validation

    field :model_id, type: BSON::ObjectId
    field :prediction_dataset_id, type: BSON::ObjectId
    field :crossvalidation_id, type: BSON::ObjectId
    field :test_dataset_id, type: BSON::ObjectId
    field :nr_instances, type: Integer
    field :nr_unpredicted, type: Integer
    field :predictions, type: Array

    # @return [OpenTox::Dataset] dataset holding the model's predictions
    def prediction_dataset
      Dataset.find prediction_dataset_id
    end

    # @return [OpenTox::Dataset] test dataset (with measured activities)
    def test_dataset
      Dataset.find test_dataset_id
    end

    # @return [OpenTox::Model::Lazar] the validated model
    def model
      Model::Lazar.find model_id
    end

    # Clones `model`, trains the clone on `training_set`, predicts `test_set`
    # (with activities stripped) and stores one
    # [compound_id, measured_activity, prediction, confidence] row per
    # successfully predicted compound.
    # @param model [Model::Lazar] template model (its attributes are copied, not modified)
    # @param training_set [Dataset] training data for the validation model
    # @param test_set [Dataset] test data with measured activities
    # @param crossvalidation [CrossValidation, nil] optional parent crossvalidation
    # @return [Validation] the saved validation record
    def self.create model, training_set, test_set, crossvalidation=nil

      atts = model.attributes.dup # do not modify attributes from original model
      atts["_id"] = BSON::ObjectId.new
      atts[:training_dataset_id] = training_set.id
      validation_model = model.class.create training_set, atts
      validation_model.save
      test_set_without_activities = Dataset.new(:compound_ids => test_set.compound_ids) # just to be sure that activities cannot be used
      prediction_dataset = validation_model.predict test_set_without_activities
      predictions = []
      nr_unpredicted = 0
      activities = test_set.data_entries.collect{|de| de.first}
      prediction_dataset.data_entries.each_with_index do |de,i|
        # de[0]: prediction value, de[1]: confidence — assumed layout of
        # prediction data_entries; TODO confirm against Model#predict
        if de[0] and de[1] and de[1].numeric?
          activity = activities[i]
          prediction = de.first
          confidence = de[1] # NOTE(review): unused local — the row below reads de[1] directly
          predictions << [prediction_dataset.compound_ids[i], activity, prediction, de[1]]
        else
          # missing prediction or non-numeric confidence counts as unpredicted
          nr_unpredicted += 1
        end
      end
      validation = self.new(
        :model_id => validation_model.id,
        :prediction_dataset_id => prediction_dataset.id,
        :test_dataset_id => test_set.id,
        :nr_instances => test_set.compound_ids.size,
        :nr_unpredicted => nr_unpredicted,
        :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence
      )
      validation.crossvalidation_id = crossvalidation.id if crossvalidation
      validation.save
      validation
    end

  end

  class ClassificationValidation < Validation
  end

  class RegressionValidation < Validation
  end

end