|
| 1 | +#!/usr/bin/python |
| 2 | + |
| 3 | +#given a smarts rxn file, a core scaffold smarts file (with name containing connecting atoms) |
| 4 | +# and an sdf file, extract the matching scaffolds from the sdf file and cluster |
| 5 | +#them greedily to identify a set of scaffold conformations |
| 6 | + |
| 7 | +import sys,gzip,argparse |
| 8 | +from rdkit.Chem import AllChem |
| 9 | + |
def subMol(mol, match):
    """Extract the part of mol covered by a substructure match.

    PathToSubmol is driven by bond indices, so the matched atom indices are
    first translated into the bonds whose two end atoms are both matched.

    mol   -- molecule the match refers to
    match -- iterable of atom indices into mol
    Returns the result of AllChem.PathToSubmol for those bonds.
    """
    matched = set(match)
    # a bond is internal to the match when the atom at its far end is matched too
    internal_bonds = {
        bond.GetIdx()
        for idx in matched
        for bond in mol.GetAtomWithIdx(idx).GetBonds()
        if bond.GetOtherAtomIdx(idx) in matched
    }
    return AllChem.PathToSubmol(mol, list(internal_bonds))
| 21 | + |
def checkConnect(center, cmatch, mol, match, connectIndices, connect):
    """Check that the connecting atoms of two scaffolds lie close together.

    For every index i in connectIndices, the position of atom cmatch[i] in
    conformer 0 of center is compared with the position of atom match[i] in
    conformer 0 of mol.

    center, mol    -- molecules providing GetConformer(0)
    cmatch, match  -- atom index sequences for center and mol respectively
    connectIndices -- positions within the matches that are connecting atoms
    connect        -- largest distance allowed between a pair

    Returns False as soon as one pair is farther apart than connect,
    True otherwise (including when connectIndices is empty).
    """
    center_conf = center.GetConformer(0)
    mol_conf = mol.GetConformer(0)

    def deviation(i):
        # distance between the i-th matched atom of center and of mol
        c_idx, m_idx = cmatch[i], match[i]
        center_pt = center_conf.GetAtomPosition(c_idx)
        mol_pt = mol_conf.GetAtomPosition(m_idx)
        return center_pt.Distance(mol_pt)

    return not any(deviation(i) > connect for i in connectIndices)
| 36 | + |
def createCluster(center, cmatch, mols, pattern, core, rmsd, connectIndices, connect):
    """Split mols into the scaffolds that belong to center's cluster and the rest.

    A scaffold joins the cluster when its best RMS against center is below
    rmsd and checkConnect accepts its connecting atoms.

    center, cmatch -- cluster center molecule and its core atom match
    mols           -- list of (mol, match) candidates
    pattern, core  -- not used here; kept so existing callers keep working
    rmsd           -- exclusive upper bound on the RMS for membership
    connectIndices -- positions in the matches that are connecting atoms
    connect        -- largest allowed distance between connecting atoms

    Returns (cluster, newmols): cluster is a list of (mol, match, rms)
    sorted by ascending rms, newmols is the list of (mol, match) pairs that
    were not accepted, in their original order.
    """
    cluster = []
    newmols = []
    for mol, match in mols:
        # Materialise the pairing: on Python 3 zip() is a lazy iterator,
        # while the atom map is handed over as a concrete list of pairs.
        atom_map = list(zip(cmatch, match))
        # NOTE(review): the connecting-atom check that follows compares raw
        # coordinates, so it presumably relies on GetBestRMS leaving the two
        # molecules superimposed -- confirm for the RDKit version in use.
        r = AllChem.GetBestRMS(mol, center, maps=[atom_map])
        if r < rmsd and checkConnect(center, cmatch, mol, match, connectIndices, connect):
            cluster.append((mol, match, r))
        else:
            newmols.append((mol, match))
    # Index the rms directly; tuple parameters in a lambda signature are a
    # syntax error on Python 3.
    cluster.sort(key=lambda member: member[2])
    return (cluster, newmols)
| 50 | + |
def computeNext(clusters, mols):
    """Choose the scaffold that seeds the next cluster.

    The first remaining scaffold is used. A farthest-first choice (the
    scaffold whose minimum RMS to the existing cluster centers is largest)
    used to follow the return statements as unreachable code; for the tight
    tolerances needed it made no real difference and only slowed things
    down, so it has been removed.

    clusters -- existing clusters; not consulted, kept for the interface
    mols     -- remaining (mol, match) pairs

    Returns mols[0], or (None, None) when mols is empty.
    """
    if len(mols) > 0:
        return mols[0]
    return (None, None)
| 74 | + |
#MAIN
# Reads a reaction SMARTS, a core scaffold SMARTS and an SDF of conformers,
# extracts the core scaffold from every sampled conformer, clusters the
# scaffolds greedily and writes one representative per cluster.
parser = argparse.ArgumentParser()
parser.add_argument('-r','--rxn', help="Reaction file")
parser.add_argument('-c','--core',help="Core scaffold with connecting atoms in name")
parser.add_argument('-i','--input',help="Input conformers")
parser.add_argument('-o','--output',help="Clustered core scaffold output")
parser.add_argument("--rmsd",type=float,default=0.5,help="Maximum RMSD for cluster membership")
parser.add_argument("--connect",type=float,default=0.1,help="Maximum allowed deviation of connecting atoms for cluster membership")
parser.add_argument("--sample",type=int,default=1,help="Amount to sample conformations")
args = parser.parse_args()

# Validate after parsing rather than by counting sys.argv: this way -h/--help
# is still reachable and a missing option is reported instead of crashing
# later on a None filename.
if None in (args.rxn, args.core, args.input, args.output):
    print("Need reaction file, core scaffold file, sdf input file and sdf output")
    sys.exit(1)
if args.sample < 1:
    # cnt % args.sample below would divide by zero
    parser.error("--sample must be a positive integer")

# only the first whitespace-separated token of the first line is the SMARTS
with open(args.rxn) as rxnf:
    rxnsm = rxnf.readline().split()[0] #ignore any name
rxn = AllChem.ReactionFromSmarts(rxnsm)
rxn.Initialize()

# Whichever side of the reaction has exactly one template is treated as the
# product; the templates of the other side are collected as reactants.
if rxn.GetNumProductTemplates() == 1:
    product = rxn.GetProductTemplate(0)
    reactants = [rxn.GetReactantTemplate(i) for i in range(rxn.GetNumReactantTemplates())]
elif rxn.GetNumReactantTemplates() == 1:
    product = rxn.GetReactantTemplate(0)
    reactants = [rxn.GetProductTemplate(i) for i in range(rxn.GetNumProductTemplates())]
else:
    print("Can have only one product")
    sys.exit(1)

# core file: first token is the core SMARTS, remaining tokens are the
# connecting atoms
with open(args.core) as coref:
    corel = coref.readline()
corefields = corel.split()
coreconnects = corefields[1:]
core = AllChem.MolFromSmarts(corefields[0])

inmols = AllChem.SDMolSupplier(args.input)
if inmols is None:
    print("Could not open  %s" % (args.input,))
    sys.exit(-1)

sdwriter = AllChem.SDWriter(args.output)
if sdwriter is None:
    print("Could not open  %s" % (args.output,))
    sys.exit(-1)

# round-trip the product template through SMARTS to get a query pattern
smart = AllChem.MolToSmarts(product)
pattern = AllChem.MolFromSmarts(smart)

#figure out the indices of connected atoms in the smart core pattern
connectIndices = list()
for c in coreconnects:
    cm = AllChem.MolFromSmarts(c)
    a = cm.GetAtoms()[0]
    if a.HasProp('molAtomMapNumber'):
        mapnum = a.GetProp('molAtomMapNumber')
        # every core atom carrying the same atom-map number is a connecting atom
        for sma in core.GetAtoms():
            if sma.HasProp('molAtomMapNumber') and sma.GetProp('molAtomMapNumber') == mapnum:
                connectIndices.append(sma.GetIdx())

#read all core scaffold molecules into memory
mols = list()
cnt = 0
for mol in inmols:
    # keep every args.sample-th record; unparsable records come back as None
    if cnt % args.sample == 0 and mol is not None:
        try:
            mol = AllChem.AddHs(mol)
            match = mol.GetSubstructMatch(pattern) #just one? why not, we're only sampling
            if match:
                sub = subMol(mol, match)
                cmatch = sub.GetSubstructMatch(core)
                if cmatch:
                    # cut down to the core and re-match, since atom indices
                    # refer to the new sub-molecule
                    sub = subMol(sub,cmatch)
                    mols.append((sub,sub.GetSubstructMatch(core)))
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            # a record without a name must not turn the report into a crash
            name = mol.GetProp('_Name') if mol.HasProp('_Name') else '<unnamed>'
            print("Exception occurred %s %s" % (name, e))
    cnt += 1

if len(mols) == 0:
    print("No molecules!")
    sys.exit(-1)
print("Done reading")
clusters = list() #these are just defined by a list of all the scaffolds assigned to the cluster
(center, cmatch) = mols[0]

while len(mols) > 0:
    (cluster, mols) = createCluster(center,cmatch,mols, pattern, core, args.rmsd, connectIndices, args.connect)
    if not cluster:
        # The center failed its own membership test (e.g. --rmsd 0), so
        # nothing was removed from mols and the loop would never end.
        # Keep the center as a singleton cluster and drop it from the pool.
        cluster = [(center, cmatch, 0.0)]
        mols = [(m, mt) for (m, mt) in mols if m is not center]
    clusters.append(cluster)
    (center, cmatch) = computeNext(clusters,mols)

print(len(clusters))
first = clusters[0][0][0]
for cl in clusters:
    # the first member of each cluster is written as its representative
    cmol = cl[0][0]
    cmol.SetProp("ClusterSize",str(len(cl)))
    AllChem.GetBestRMS(first,cmol) #align to very first
    sdwriter.write(cmol)
sdwriter.close()
| 177 | + |
0 commit comments