summary refs log tree commit diff
path: root/java/ApplyCDKDescriptors.java
diff options
context:
space:
mode:
Diffstat (limited to 'java/ApplyCDKDescriptors.java')
-rw-r--r-- java/ApplyCDKDescriptors.java 307
1 files changed, 307 insertions, 0 deletions
diff --git a/java/ApplyCDKDescriptors.java b/java/ApplyCDKDescriptors.java
new file mode 100644
index 0000000..6207031
--- /dev/null
+++ b/java/ApplyCDKDescriptors.java
@@ -0,0 +1,307 @@
+import java.io.*;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Arrays;
+import java.util.Vector;
+import java.util.Map;
+
+import org.openscience.cdk.ChemFile;
+import org.openscience.cdk.aromaticity.CDKHueckelAromaticityDetector;
+import org.openscience.cdk.interfaces.IAtomContainer;
+import org.openscience.cdk.interfaces.IChemFile;
+import org.openscience.cdk.interfaces.IChemObject;
+import org.openscience.cdk.interfaces.IMolecule;
+import org.openscience.cdk.io.ISimpleChemObjectReader;
+import org.openscience.cdk.io.ReaderFactory;
+import org.openscience.cdk.qsar.DescriptorEngine;
+import org.openscience.cdk.qsar.IDescriptor;
+import org.openscience.cdk.qsar.IMolecularDescriptor;
+import org.openscience.cdk.qsar.result.DoubleArrayResult;
+import org.openscience.cdk.qsar.result.DoubleArrayResultType;
+import org.openscience.cdk.qsar.result.DoubleResult;
+import org.openscience.cdk.qsar.result.IDescriptorResult;
+import org.openscience.cdk.qsar.result.IntegerArrayResult;
+import org.openscience.cdk.qsar.result.IntegerArrayResultType;
+import org.openscience.cdk.qsar.result.IntegerResult;
+import org.openscience.cdk.tools.manipulator.AtomContainerManipulator;
+import org.openscience.cdk.tools.manipulator.ChemFileManipulator;
+import org.openscience.cdk.qsar.descriptors.molecular.IPMolecularLearningDescriptor;
+import org.openscience.cdk.smiles.SmilesGenerator;
+
+
+/**
+ * ApplyCDKDescriptors.java
+ * Purpose: Calculate CDK descriptors in CSV format from input SDF files.
+ * The output has one row per compound (preceded by its SMILES code) and one
+ * column per descriptor value.
+ *
+ * For ease of use, the design is completely static, i.e. all functionality
+ * is reachable through static methods. Calling the constructor executes the
+ * algorithm.
+ *
+ * @author Martin Guetlein, Andreas Maunz
+ * @version 1.0 20/9/2012
+ */
+public class ApplyCDKDescriptors {
+    /** Shared engine used to enumerate the available molecular descriptors. */
+    private static final DescriptorEngine ENGINE = new DescriptorEngine(DescriptorEngine.MOLECULAR);
+
+
+    /**
+     * Constructor, executing the algorithm (delegates to
+     * {@link #getDescriptorCSV(String, String, String)}).
+     *
+     * @param inpath       The path to the input SDF file
+     * @param outpath      The path to the output CSV file
+     * @param descNamesStr Comma-separated list of descriptor class names
+     *                     (if empty or null, all descriptors are calculated)
+     * @throws java.io.IOException if the input cannot be read or the output cannot be written
+     */
+    public ApplyCDKDescriptors(String inpath, String outpath, String descNamesStr) throws java.io.IOException {
+        getDescriptorCSV(inpath, outpath, descNamesStr);
+    }
+
+
+    /**
+     * Example main: computes all descriptors for the given input file.
+     *
+     * @param args exactly two entries: input file path and output CSV path
+     * @throws java.io.IOException if the input cannot be read or the output cannot be written
+     */
+    public static void main(String args[]) throws java.io.IOException
+    {
+        if (args.length != 2) {
+            System.out.println("Enter two file names: <input.sdf> <output.csv>");
+            System.exit(1);
+        }
+        getDescriptorCSV(args[0], args[1], "");
+    }
+
+
+    /**
+     * Calculate descriptors and write them as CSV. Omits IPMolecularLearningDescriptor.
+     *
+     * Column names have the form {@code <DescriptorClass>-<valueName>}.
+     * Values that could not be computed are written as the text {@code null}.
+     *
+     * @param sdfInputPath  path to SDF input file
+     * @param csvOutputPath path to CSV output file
+     * @param descNamesStr  comma-separated list of descriptor class names
+     *                      (if empty or null, all descriptors will be calculated)
+     * @throws java.io.IOException if the molecules cannot be read or the CSV cannot be written
+     * @throws IllegalArgumentException if the input file does not exist
+     */
+    public static void getDescriptorCSV(String sdfInputPath, String csvOutputPath, String descNamesStr) throws java.io.IOException
+    {
+        List<IMolecule> mols = readMolecules(sdfInputPath);
+        if (mols == null) {
+            // readMolecules signals a failed read with null; fail with a clear
+            // message instead of a NullPointerException further down.
+            throw new IOException("could not read molecules from: " + sdfInputPath);
+        }
+        System.err.println("read " + mols.size() + " compounds");
+        List<IDescriptor> descriptors = ENGINE.getDescriptorInstances();
+        System.err.println("found " + descriptors.size() + " descriptors");
+
+        // null means "no filter": every descriptor is calculated
+        List<String> wanted = parseDescriptorNames(descNamesStr);
+        List<String> colNames = new ArrayList<String>();
+        List<Double[]> values = new ArrayList<Double[]>();
+        for (IDescriptor desc : descriptors) {
+            if (desc instanceof IPMolecularLearningDescriptor)
+                continue;
+            if (!(desc instanceof IMolecularDescriptor))
+                continue;
+            String tname = simpleClassName(desc);
+            if (wanted != null && !wanted.contains(tname))
+                continue;
+            List<Double[]> descValues = computeLists(mols, (IMolecularDescriptor) desc);
+            // Names are derived per value column so that header and data can
+            // never drift apart, and the descriptor's own name array is left
+            // untouched (it may be shared between calls).
+            colNames.addAll(columnNames(tname, desc.getDescriptorNames(), descValues.size()));
+            values.addAll(descValues);
+        }
+
+        int ncol = values.size();
+        int nrow = mols.size();
+        BufferedWriter out = new BufferedWriter(new FileWriter(csvOutputPath));
+        try {
+            out.write("SMILES,");
+            for (int c = 0; c < ncol; c++) {
+                if (c != 0) out.write(",");
+                out.write(colNames.get(c));
+            }
+            out.write("\n");
+            for (int r = 0; r < nrow; r++) {
+                out.write(getSmiles(mols.get(r)) + ",");
+                for (int c = 0; c < ncol; c++) {
+                    if (c != 0) out.write(",");
+                    out.write("" + values.get(c)[r]);
+                }
+                out.write("\n");
+            }
+            out.flush();
+        } finally {
+            // always release the file handle, also when writing fails
+            out.close();
+        }
+    }
+
+
+    /**
+     * Split the comma-separated descriptor name list.
+     *
+     * @param descNamesStr comma-separated names, may be null or empty
+     * @return the trimmed names, or null if no filter was requested
+     */
+    private static List<String> parseDescriptorNames(String descNamesStr)
+    {
+        if (descNamesStr == null || descNamesStr.length() == 0)
+            return null;
+        List<String> names = new ArrayList<String>();
+        for (String name : descNamesStr.split(","))
+            names.add(name.trim());
+        return names;
+    }
+
+
+    /**
+     * Get the class name of a descriptor without its package prefix.
+     *
+     * @param desc The descriptor
+     * @return everything after the last '.' of the fully qualified class name
+     */
+    private static String simpleClassName(IDescriptor desc)
+    {
+        String fqn = desc.getClass().getName();
+        return fqn.substring(fqn.lastIndexOf('.') + 1);
+    }
+
+
+    /**
+     * Build exactly one prefixed column name per value column without
+     * modifying the array handed in.
+     *
+     * @param prefix   The descriptor class name used as prefix
+     * @param rawNames The value names reported by the descriptor (may be null)
+     * @param ncols    The number of value columns actually computed
+     * @return ncols names of the form prefix-name; missing names are replaced
+     *         by the 1-based column index, surplus names are dropped
+     */
+    private static List<String> columnNames(String prefix, String[] rawNames, int ncols)
+    {
+        int available = (rawNames == null) ? 0 : rawNames.length;
+        if (available != ncols)
+            System.err.println("descriptor " + prefix + " reports " + available
+                + " names but " + ncols + " value columns");
+        List<String> names = new ArrayList<String>(ncols);
+        for (int idx = 0; idx < ncols; idx++) {
+            String suffix = (idx < available) ? rawNames[idx] : String.valueOf(idx + 1);
+            names.add(prefix + "-" + suffix);
+        }
+        return names;
+    }
+
+
+    /**
+     * Get SMILES code for a molecule. A property named "STRUCTURE_SMILES" or
+     * "SMILES" takes precedence; otherwise the SMILES is generated.
+     *
+     * @param m The molecule
+     * @return The SMILES code
+     */
+    public static String getSmiles(IMolecule m)
+    {
+        Map<Object, Object> props = m.getProperties();
+        if (props != null) {
+            for (Map.Entry<Object, Object> entry : props.entrySet()) {
+                String key = String.valueOf(entry.getKey());
+                boolean isSmilesKey = key.equals("STRUCTURE_SMILES") || key.equals("SMILES");
+                // a SMILES property without a value is ignored instead of failing
+                if (isSmilesKey && entry.getValue() != null)
+                    return entry.getValue().toString();
+            }
+        }
+        SmilesGenerator g = new SmilesGenerator();
+        return g.createSMILES(m);
+    }
+
+
+    /**
+     * Compute descriptor values, convert to list. Announces the descriptor
+     * on standard output before computing.
+     *
+     * @param mols The molecules
+     * @param desc The descriptor
+     * @return The descriptor values as list: one array per value column,
+     *         each array indexed by molecule
+     */
+    public static List<Double[]> computeLists(List<IMolecule> mols, IMolecularDescriptor desc )
+    {
+        System.out.println("computing descriptor " + getName(desc));
+        return computeDescriptors(mols, desc);
+    }
+
+
+    /**
+     * Read in molecules, using any format the CDK ReaderFactory recognises.
+     * Hydrogens are removed, atom types are perceived and aromaticity is
+     * detected; failures of the latter two steps are printed and the molecule
+     * is kept. Molecules without atoms are dropped.
+     *
+     * @param filepath The input file
+     * @return The molecules, or null if the file could not be parsed
+     *         (the cause is printed to standard error)
+     * @throws IllegalArgumentException if the file does not exist
+     */
+    public static List<IMolecule> readMolecules(String filepath)
+    {
+        File file = new File(filepath);
+        if (!file.exists())
+            throw new IllegalArgumentException("file not found: " + filepath);
+
+        List<IAtomContainer> list;
+        InputStream in = null;
+        ISimpleChemObjectReader reader = null;
+        try
+        {
+            in = new FileInputStream(file);
+            reader = new ReaderFactory().createReader(new InputStreamReader(in));
+            if (reader == null)
+                throw new IllegalArgumentException("Could not determine input file type");
+            IChemFile content = (IChemFile) reader.read((IChemObject) new ChemFile());
+            list = ChemFileManipulator.getAllAtomContainers(content);
+        }
+        catch (Exception e)
+        {
+            e.printStackTrace();
+            return null;
+        }
+        finally
+        {
+            // release reader and stream on every path; a failing close must
+            // not hide the outcome of the read itself
+            if (reader != null) {
+                try { reader.close(); } catch (Exception ignored) { /* best effort */ }
+            }
+            if (in != null) {
+                try { in.close(); } catch (Exception ignored) { /* best effort */ }
+            }
+        }
+
+        List<IMolecule> mols = new ArrayList<IMolecule>();
+        for (IAtomContainer container : list)
+        {
+            IMolecule mol = (IMolecule) container;
+            mol = (IMolecule) AtomContainerManipulator.removeHydrogens(mol);
+            try
+            {
+                AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol);
+            }
+            catch (Exception e)
+            {
+                e.printStackTrace();
+            }
+            try
+            {
+                CDKHueckelAromaticityDetector.detectAromaticity(mol);
+            }
+            catch (Exception e)
+            {
+                e.printStackTrace();
+            }
+            if (mol.getAtomCount() == 0)
+                System.err.println("molecule has no atoms");
+            else
+                mols.add(mol);
+        }
+        return mols;
+    }
+
+
+    /**
+     * Compute descriptors. A value is null when the molecule has no atoms,
+     * when the calculation failed (the error is printed), or when the result
+     * is NaN or infinite.
+     *
+     * @param mols       The molecules
+     * @param descriptor The descriptor
+     * @return The results as list: one array per value column, each array
+     *         holding one entry per molecule
+     */
+    public static List<Double[]> computeDescriptors(List<IMolecule> mols, IMolecularDescriptor descriptor)
+    {
+        // the declared result length does not change between molecules
+        final int size = getSize(descriptor);
+        final int nmols = mols.size();
+
+        List<Double[]> vv = new ArrayList<Double[]>();
+        for (int j = 0; j < size; j++)
+            vv.add(new Double[nmols]);
+
+        for (int i = 0; i < nmols; i++)
+        {
+            IMolecule mol = mols.get(i);
+            if (mol.getAtomCount() == 0)
+            {
+                for (int j = 0; j < size; j++)
+                    vv.get(j)[i] = null;
+            }
+            else
+            {
+                try
+                {
+                    IDescriptorResult res = descriptor.calculate(mol).getValue();
+                    if (res instanceof IntegerResult)
+                        vv.get(0)[i] = (double) ((IntegerResult) res).intValue();
+                    else if (res instanceof DoubleResult)
+                        vv.get(0)[i] = ((DoubleResult) res).doubleValue();
+                    else if (res instanceof DoubleArrayResult)
+                        for (int j = 0; j < size; j++)
+                            vv.get(j)[i] = ((DoubleArrayResult) res).get(j);
+                    else if (res instanceof IntegerArrayResult)
+                        for (int j = 0; j < size; j++)
+                            vv.get(j)[i] = (double) ((IntegerArrayResult) res).get(j);
+                    else
+                        throw new IllegalStateException("Unknown idescriptor result value for '" + descriptor + "' : "
+                            + res.getClass());
+                }
+                catch (Throwable e)
+                {
+                    // deliberate best effort: one failing molecule must not
+                    // abort the whole run, its values are marked as missing
+                    System.err.println("Could not compute cdk feature " + descriptor);
+                    e.printStackTrace();
+                    for (int j = 0; j < size; j++)
+                        vv.get(j)[i] = null;
+                }
+            }
+            // NaN and infinite results are treated as missing values
+            for (int j = 0; j < size; j++)
+            {
+                Double v = vv.get(j)[i];
+                if (v != null && (v.isNaN() || v.isInfinite()))
+                    vv.get(j)[i] = null;
+            }
+        }
+
+        return vv;
+    }
+
+
+    /**
+     * Get length of result for a given descriptor.
+     *
+     * @param descriptor The descriptor
+     * @return The declared array length for array result types, otherwise 1
+     */
+    private static int getSize(IMolecularDescriptor descriptor)
+    {
+        IDescriptorResult r = descriptor.getDescriptorResultType();
+        if (r instanceof DoubleArrayResultType)
+            return ((DoubleArrayResultType) r).length();
+        else if (r instanceof IntegerArrayResultType)
+            return ((IntegerArrayResultType) r).length();
+        else
+            return 1;
+    }
+
+
+    /**
+     * Get name for a given descriptor.
+     *
+     * @param descriptor The descriptor
+     * @return The trimmed dictionary title the engine reports for the
+     *         descriptor's specification
+     */
+    private static String getName(IDescriptor descriptor)
+    {
+        return ENGINE.getDictionaryTitle(descriptor.getSpecification()).trim();
+    }
+
+
+}