import java.io.*; import java.util.ArrayList; import java.util.List; import java.util.Arrays; import java.util.Vector; import java.util.Map; import org.openscience.cdk.ChemFile; import org.openscience.cdk.aromaticity.CDKHueckelAromaticityDetector; import org.openscience.cdk.interfaces.IAtomContainer; import org.openscience.cdk.interfaces.IChemFile; import org.openscience.cdk.interfaces.IChemObject; import org.openscience.cdk.interfaces.IMolecule; import org.openscience.cdk.io.ISimpleChemObjectReader; import org.openscience.cdk.io.ReaderFactory; import org.openscience.cdk.qsar.DescriptorEngine; import org.openscience.cdk.qsar.IDescriptor; import org.openscience.cdk.qsar.IMolecularDescriptor; import org.openscience.cdk.qsar.result.DoubleArrayResult; import org.openscience.cdk.qsar.result.DoubleArrayResultType; import org.openscience.cdk.qsar.result.DoubleResult; import org.openscience.cdk.qsar.result.IDescriptorResult; import org.openscience.cdk.qsar.result.IntegerArrayResult; import org.openscience.cdk.qsar.result.IntegerArrayResultType; import org.openscience.cdk.qsar.result.IntegerResult; import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; import org.openscience.cdk.tools.manipulator.ChemFileManipulator; import org.openscience.cdk.qsar.descriptors.molecular.IPMolecularLearningDescriptor; import org.openscience.cdk.smiles.SmilesGenerator; /** * ApplyCDKDescriptors.java * Purpose: Calculate CDK descriptors in CSV format from input SDF files * For ease of use, the design is completely static, i.e. no member functions * Calling the constructor executes the algorithm * * @author Martin Guetlein, Andreas Maunz * @version 1.0 20/9/2012 */ public class ApplyCDKDescriptors { private static DescriptorEngine ENGINE = new DescriptorEngine(DescriptorEngine.MOLECULAR); /** * Constructor, executing the algorithm * * @param: string The path to the input SDF file * @param: string The path to the output CSV file */ public ApplyCDKDescriptors(String inpath, String outpath, String descNamesStr) throws java.io.IOException { getDescriptorCSV(inpath,outpath,descNamesStr); } /** * Example main * */ public static void main(String args[]) throws java.io.IOException { String inpath = "hamster_3d.sdf"; String outpath = "hamster_desc.csv"; getDescriptorCSV(inpath,outpath,""); } /** * Calculate descriptors. Omits IPMolecularLearningDescriptor * * @param string path to SDF input file * @param string path to CSV output file * @param string comma-seperated list of descriptor names (if empty, all descriptors will be calculated) */ public static void getDescriptorCSV(String sdfInputPath, String csvOutputPath, String descNamesStr) throws java.io.IOException { List mols = readMolecules(sdfInputPath); System.err.println("read " + mols.size() + " compounds"); List descriptors = ENGINE.getDescriptorInstances(); System.err.println("found " + descriptors.size() + " descriptors"); List descNames = Arrays.asList(descNamesStr.split(",")); ArrayList colNames = new ArrayList(); ArrayList values = new ArrayList(); for (IDescriptor desc : descriptors) { if (desc instanceof IPMolecularLearningDescriptor) continue; String tname = desc.getClass().getName(); String[] tnamebits = tname.split("\\."); tname = tnamebits[tnamebits.length-1]; if ((descNamesStr.length()>0) && (!descNames.contains(tname))) continue; String[] colNamesArr = desc.getDescriptorNames(); for (int idx=0; idx valuesList = computeLists(mols, (IMolecularDescriptor) desc); values.addAll(valuesList); } int ncol = values.size(); int nrow = mols.size(); FileWriter fstream = new FileWriter(csvOutputPath); BufferedWriter out = new BufferedWriter(fstream); out.write("SMILES,"); for (int c=0; c props = m.getProperties(); for (Object key : props.keySet()) { if (key.toString().equals("STRUCTURE_SMILES") || key.toString().equals("SMILES")) return props.get(key).toString(); } SmilesGenerator g = new SmilesGenerator(); return g.createSMILES(m); } /** * Compute descriptor values, convert to list * * @param List The molecules * @param IMoleculeDescriptor The descriptor * @return List The descriptor values as list */ public static List computeLists(List mols, IMolecularDescriptor desc ) { System.out.println("computing descriptor " + getName(desc)); List values = computeDescriptors(mols, (IMolecularDescriptor) desc); return values; } /** * Read in molecules, using any supported format * * @param string The input file * @return Vector The molecules */ public static List readMolecules(String filepath) { Vector mols = new Vector(); File file = new File(filepath); if (!file.exists()) throw new IllegalArgumentException("file not found: " + filepath); List list; try { ISimpleChemObjectReader reader = new ReaderFactory().createReader(new InputStreamReader( new FileInputStream(file))); if (reader == null) throw new IllegalArgumentException("Could not determine input file type"); IChemFile content = (IChemFile) reader.read((IChemObject) new ChemFile()); list = ChemFileManipulator.getAllAtomContainers(content); reader.close(); } catch (Exception e) { e.printStackTrace(); return null; } for (IAtomContainer iAtomContainer : list) { IMolecule mol = (IMolecule) iAtomContainer; mol = (IMolecule) AtomContainerManipulator.removeHydrogens(mol); try { AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol); } catch (Exception e) { e.printStackTrace(); } try { CDKHueckelAromaticityDetector.detectAromaticity(mol); } catch (Exception e) { e.printStackTrace(); } if (mol.getAtomCount() == 0) System.err.println("molecule has no atoms"); else mols.add(mol); } return mols; } /** * Compute descriptors * * @param List The molecules * @param IMoleculeDescriptor The descriptor * @return List The results as list */ public static List computeDescriptors(List mols, IMolecularDescriptor descriptor) { List vv = new ArrayList(); for (int j = 0; j < getSize(descriptor); j++) vv.add(new Double[mols.size()]); for (int i = 0; i < mols.size(); i++) { if (mols.get(i).getAtomCount() == 0) { for (int j = 0; j < getSize(descriptor); j++) vv.get(j)[i] = null; } else { try { IDescriptorResult res = descriptor.calculate(mols.get(i)).getValue(); if (res instanceof IntegerResult) vv.get(0)[i] = (double) ((IntegerResult) res).intValue(); else if (res instanceof DoubleResult) vv.get(0)[i] = ((DoubleResult) res).doubleValue(); else if (res instanceof DoubleArrayResult) for (int j = 0; j < getSize(descriptor); j++) vv.get(j)[i] = ((DoubleArrayResult) res).get(j); else if (res instanceof IntegerArrayResult) for (int j = 0; j < getSize(descriptor); j++) vv.get(j)[i] = (double) ((IntegerArrayResult) res).get(j); else throw new IllegalStateException("Unknown idescriptor result value for '" + descriptor + "' : " + res.getClass()); } catch (Throwable e) { System.err.println("Could not compute cdk feature " + descriptor); e.printStackTrace(); for (int j = 0; j < getSize(descriptor); j++) vv.get(j)[i] = null; } } for (int j = 0; j < getSize(descriptor); j++) if (vv.get(j)[i] != null && (vv.get(j)[i].isNaN() || vv.get(j)[i].isInfinite())) vv.get(j)[i] = null; } return vv; } /** * Get length of result for a given descriptor * * @param IMolecularDescriptor The descriptor * @return int The length */ private static int getSize(IMolecularDescriptor descriptor) { IDescriptorResult r = descriptor.getDescriptorResultType(); if (r instanceof DoubleArrayResultType) return ((DoubleArrayResultType) r).length(); else if (r instanceof IntegerArrayResultType) return ((IntegerArrayResultType) r).length(); else return 1; } /** * Get name for a given descriptor * * @param IMolecularDescriptor The descriptor */ private static String getName(IDescriptor descriptor) { return ENGINE.getDictionaryTitle(descriptor.getSpecification()).trim(); } }