Skip to content

Commit b580b64

Browse files
committed
useful(?) scripts that use rdkit
1 parent bb9b91e commit b580b64

11 files changed

+1326
-0
lines changed

applysdfcoordstopdb.py

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#!/usr/bin/env python
2+
3+
'''Given a pdb file and an sdf file of the same ligand, find the correspondance
4+
between the atoms and output a pdb with the same atom names as the pdb input
5+
but with the coordinates of the sdf file. Atoms missing in the sdf
6+
will not have their coordinates changed, but note that hydrogens are stripped'''
7+
8+
import sys
9+
from rdkit.Chem import AllChem
10+
11+
if len(sys.argv) != 3:
12+
print "Need sdf and pdb files"
13+
sys.exit(1)
14+
15+
sdf = sys.argv[1]
16+
pdb = sys.argv[2]
17+
18+
#figure out starting atom number
19+
startnum = 1
20+
for line in open(pdb):
21+
if line.startswith('ATOM') or line.startswith('HETATM'):
22+
startnum = int(line[6:11])
23+
break
24+
25+
pmol = AllChem.MolFromPDBFile(pdb)
26+
smol = AllChem.SDMolSupplier("newsnappy.sdf").next()
27+
pmol = AllChem.AssignBondOrdersFromTemplate(smol,pmol)
28+
29+
if smol.HasSubstructMatch(pmol):
30+
m = smol.GetSubstructMatch(pmol)
31+
pconf = pmol.GetConformer()
32+
sconf = smol.GetConformer()
33+
for (pi, si) in enumerate(m):
34+
pconf.SetAtomPosition(pi, sconf.GetAtomPosition(si))
35+
pdbout = AllChem.MolToPDBBlock(pmol, flavor=2).split('\n')
36+
num = startnum
37+
for line in pdbout:
38+
if line.startswith('ATOM') or line.startswith('HETATM'):
39+
print '%s%5d%s' % (line[:6],num,line[11:])
40+
num += 1
41+
else:
42+
print "Could not matchup pdb and sdf"
43+
sys.exit(1)

clustermatches.py

+177
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#!/usr/bin/python
2+
3+
#given a smarts rxn file, a core scaffold smarts file (with name containing connecting atoms)
4+
# and an sdf file, extract the matching scaffolds from of the sdf file and cluster
5+
#them greedily to identify a set of scaffold conformations
6+
7+
import sys,gzip,argparse
8+
from rdkit.Chem import AllChem
9+
10+
def subMol(mol, match):
11+
#not sure why this functionality isn't implemented natively
12+
#but get the interconnected bonds for the match
13+
atoms = set(match)
14+
bonds = set()
15+
for a in atoms:
16+
atom = mol.GetAtomWithIdx(a)
17+
for b in atom.GetBonds():
18+
if b.GetOtherAtomIdx(a) in atoms:
19+
bonds.add(b.GetIdx())
20+
return AllChem.PathToSubmol(mol,list(bonds))
21+
22+
#compute the distances between the matching connecting atoms
23+
#and return true if all distance are small enough
24+
def checkConnect(center, cmatch, mol, match, connectIndices, connect):
25+
cconf = center.GetConformer(0)
26+
mconf = mol.GetConformer(0)
27+
for i in connectIndices:
28+
cidx = cmatch[i]
29+
midx = match[i]
30+
cpt = cconf.GetAtomPosition(cidx)
31+
mpt = mconf.GetAtomPosition(midx)
32+
dist = cpt.Distance(mpt)
33+
if dist > connect:
34+
return False
35+
return True
36+
37+
#find all the scaffolds in mols that are within rmsd of center and where the connecting
38+
#atoms are within connect, return new cluster and new mols
39+
def createCluster(center,cmatch, mols, pattern, core, rmsd, connectIndices, connect):
40+
cluster = list()
41+
newmols = list()
42+
for (mol,match) in mols:
43+
r = AllChem.GetBestRMS(mol,center,maps=[zip(cmatch,match)])
44+
if r < rmsd and checkConnect(center, cmatch, mol,match,connectIndices, connect):
45+
cluster.append((mol,match,r))
46+
else:
47+
newmols.append((mol,match))
48+
cluster.sort(key = lambda (m,mtch,r): r )
49+
return (cluster, newmols)
50+
51+
#find the mol in mols that has the maximum minimum distance between the first
52+
#mol in each cluster
53+
#ACTUALLY, for the tight tolerances we need, this really doesn't make a difference
54+
#and just slows things down, so just pick the first available conformer
55+
def computeNext(clusters,mols):
56+
if len(mols) > 0:
57+
return mols[0]
58+
else:
59+
return (None,None)
60+
max = 0
61+
best = (None,None)
62+
for (mol,match) in mols:
63+
min = float('inf')
64+
for cl in clusters:
65+
cmol = cl[0][0]
66+
cmatch = cl[0][1]
67+
r = AllChem.GetBestRMS(cmol,mol,maps=[zip(match,cmatch)])
68+
if r < min:
69+
min = r
70+
if min > max:
71+
max = min
72+
best = (mol,match)
73+
return best
74+
75+
#MAIN
76+
if len(sys.argv) < 5:
77+
print "Need reaction file, core scaffold file, sdf input file and sdf output"
78+
sys.exit(1)
79+
80+
parser = argparse.ArgumentParser()
81+
parser.add_argument('-r','--rxn', help="Reaction file")
82+
parser.add_argument('-c','--core',help="Core scaffold with connecting atoms in name")
83+
parser.add_argument('-i','--input',help="Input conformers")
84+
parser.add_argument('-o','--output',help="Clustered core scaffold output")
85+
parser.add_argument("--rmsd",type=float,default=0.5,help="Maximum RMSD for cluster membership")
86+
parser.add_argument("--connect",type=float,default=0.1,help="Maximum allowed deviation of connecting atoms for cluster membership")
87+
parser.add_argument("--sample",type=int,default=1,help="Amount to sample conformations")
88+
args = parser.parse_args()
89+
90+
rxnf = open(args.rxn)
91+
rxnsm = rxnf.readline().split()[0] #ignore any name
92+
rxn = AllChem.ReactionFromSmarts(rxnsm)
93+
rxn.Initialize()
94+
95+
if rxn.GetNumProductTemplates() == 1:
96+
product = rxn.GetProductTemplate(0)
97+
reactants = list()
98+
for i in xrange(rxn.GetNumReactantTemplates()):
99+
reactants.append(rxn.GetReactantTemplate(i))
100+
elif rxn.GetNumReactantTemplates() == 1:
101+
product = rxn.GetReactantTemplate(0)
102+
reactants = list()
103+
for i in xrange(rxn.GetNumProductTemplates()):
104+
reactants.append(rxn.GetProductTemplate(i))
105+
else:
106+
print "Can have only one product"
107+
sys.exit(1)
108+
109+
coref = open(args.core)
110+
corel = coref.readline()
111+
coreconnects = corel.split()[1:]
112+
core = AllChem.MolFromSmarts(corel.split()[0])
113+
114+
inmols = AllChem.SDMolSupplier(args.input)
115+
if inmols is None:
116+
print "Could not open ",args.input
117+
sys.exit(-1)
118+
119+
sdwriter = AllChem.SDWriter(args.output)
120+
if sdwriter is None:
121+
print "Could not open ",args.output
122+
sys.exit(-1)
123+
124+
smart = AllChem.MolToSmarts(product)
125+
pattern = AllChem.MolFromSmarts(smart)
126+
127+
#figure out the indices of connected atoms in the smart core pattern
128+
connectIndices = list()
129+
for c in coreconnects:
130+
cm = AllChem.MolFromSmarts(c)
131+
a = cm.GetAtoms()[0]
132+
if a.HasProp('molAtomMapNumber'):
133+
mapnum = a.GetProp('molAtomMapNumber')
134+
for sma in core.GetAtoms():
135+
if sma.HasProp('molAtomMapNumber') and sma.GetProp('molAtomMapNumber') == mapnum:
136+
connectIndices.append(sma.GetIdx())
137+
138+
#read all core scaffold molecules into memory
139+
mols = list()
140+
cnt = 0
141+
for mol in inmols:
142+
if cnt % args.sample == 0 and mol is not None:
143+
try:
144+
mol = AllChem.AddHs(mol)
145+
match = mol.GetSubstructMatch(pattern) #just one? why not, we're only sampling
146+
if match:
147+
sub = subMol(mol, match)
148+
cmatch = sub.GetSubstructMatch(core)
149+
if cmatch:
150+
sub = subMol(sub,cmatch)
151+
mols.append((sub,sub.GetSubstructMatch(core)))
152+
except (KeyboardInterrupt, SystemExit):
153+
raise
154+
except Exception as e:
155+
print "Exception occurred",mol.GetProp('_Name'),e
156+
cnt += 1
157+
158+
if len(mols) == 0:
159+
print "No molecules!"
160+
sys.exit(-1)
161+
print "Done reading"
162+
clusters = list() #these are just defined by a list of all the scffolds assigned to the cluster
163+
(center, cmatch) = mols[0]
164+
165+
while len(mols) > 0:
166+
(cluster, mols) = createCluster(center,cmatch,mols, pattern, core, args.rmsd, connectIndices, args.connect)
167+
clusters.append(cluster)
168+
(center, cmatch) = computeNext(clusters,mols)
169+
170+
print len(clusters)
171+
for cl in clusters:
172+
cmol = cl[0][0]
173+
cmol.SetProp("ClusterSize",str(len(cl)))
174+
AllChem.GetBestRMS(clusters[0][0][0],cmol) #align to very first
175+
sdwriter.write(cmol)
176+
sdwriter.close()
177+

corerxnscaffold.py

+110
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
#!/usr/bin/python
2+
3+
#given a rxn smarts (either forward or backwards, but there is assumed to be
4+
#only one product) identify a minimal core scaffold that has every connecting atom
5+
#from each reactant and only those scaffold atoms that are necessary to minimally
6+
#connect these atoms
7+
#this assume the input smarts is fully annoated with atom mappings for the heavy atoms
8+
#output the smarts (with atom numbers) of the core scaffold
9+
10+
import sys
11+
from rdkit.Chem import AllChem
12+
import igraph
13+
14+
Debug = False
15+
16+
if len(sys.argv) < 1:
17+
print "Need reaction file"
18+
sys.exit(1)
19+
if len(sys.argv) > 2:
20+
Debug = True
21+
22+
rxnf = open(sys.argv[1])
23+
rxnsm = rxnf.readline()
24+
rxn = AllChem.ReactionFromSmarts(rxnsm)
25+
rxn.Initialize()
26+
27+
if rxn.GetNumProductTemplates() == 1:
28+
product = rxn.GetProductTemplate(0)
29+
reactants = list()
30+
for i in xrange(rxn.GetNumReactantTemplates()):
31+
reactants.append(rxn.GetReactantTemplate(i))
32+
elif rxn.GetNumReactantTemplates() == 1:
33+
product = rxn.GetReactantTemplate(0)
34+
reactants = list()
35+
for i in xrange(rxn.GetNumProductTemplates()):
36+
reactants.append(rxn.GetProductTemplate(i))
37+
else:
38+
print "Can have only one product"
39+
sys.exit(1)
40+
41+
#record what atom mapping number belong to each reactant
42+
reactantMapNumbers = dict()
43+
for (i, react) in enumerate(reactants):
44+
for a in react.GetAtoms():
45+
if a.HasProp('molAtomMapNumber'):
46+
mapnum = a.GetProp('molAtomMapNumber')
47+
reactantMapNumbers[mapnum] = i
48+
49+
#now create an igraph from the product
50+
molgraph = igraph.Graph()
51+
molgraph.add_vertices(product.GetNumAtoms())
52+
53+
for a in product.GetAtoms():
54+
s = a.GetIdx()
55+
v = molgraph.vs[s]
56+
v['react'] = -1 #reactant this atom belongs to, -1 if none
57+
v['heavy'] = a.GetAtomicNum() != 1 #note this things [#1,#6] is H only, but that should be okay - if H can go there, no need to worry about scaffold connectivity
58+
if a.HasProp('molAtomMapNumber'):
59+
mapnum = a.GetProp('molAtomMapNumber')
60+
if mapnum in reactantMapNumbers:
61+
v['react'] = reactantMapNumbers[mapnum]
62+
for na in a.GetNeighbors():
63+
t = na.GetIdx()
64+
molgraph.add_edge(s,t)
65+
66+
#now identify the connecting heavy atoms for reactants
67+
connecting = list()
68+
for v in molgraph.vs:
69+
if v['react'] >= 0 and v['heavy']:
70+
r = v['react']
71+
for nv in v.neighbors():
72+
if nv['react'] != r and nv['heavy']:
73+
connecting.append(v.index)
74+
break
75+
76+
if Debug:
77+
for i in connecting:
78+
print product.GetAtomWithIdx(i).GetSmarts(),molgraph.vs[i]['react']
79+
80+
#compute the shortest paths between all these connecting atoms
81+
#the atoms along these paths make the minimal scaffold
82+
scaffold = set()
83+
for i in connecting:
84+
paths = molgraph.get_all_shortest_paths(i,connecting,mode=igraph.ALL)
85+
for p in paths:
86+
for v in p:
87+
scaffold.add(v)
88+
89+
if Debug:
90+
print "\nScaffold"
91+
for i in scaffold:
92+
print product.GetAtomWithIdx(i).GetSmarts(),molgraph.vs[i]['react']
93+
94+
#need to identify the bonds between the scaffold atoms
95+
bonds = set()
96+
for i in scaffold:
97+
a = product.GetAtomWithIdx(i)
98+
for b in a.GetBonds():
99+
end = b.GetOtherAtomIdx(i)
100+
if end in scaffold:
101+
bonds.add(b.GetIdx())
102+
103+
if Debug:
104+
print "Bonds",list(bonds)
105+
106+
subMol = AllChem.PathToSubmol(product,list(bonds))
107+
print AllChem.MolToSmarts(subMol),
108+
for i in connecting:
109+
print product.GetAtomWithIdx(i).GetSmarts(),
110+
print ""

0 commit comments

Comments
 (0)