-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy pathgen_stereo_rdkit.py
executable file
·256 lines (207 loc) · 10.9 KB
/
gen_stereo_rdkit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
#!/usr/bin/env python3
__author__ = 'pavel'
import argparse
import sys
from rdkit import Chem
from rdkit.Chem import AllChem
from itertools import product
from copy import deepcopy
from multiprocessing import Pool, cpu_count
from read_input import read_input
def prep_input(fname, id_field_name, tetrahedral, double_bond, max_attempts, max_undef):
for mol, mol_name in read_input(fname, id_field_name=id_field_name):
yield mol, mol_name, tetrahedral, double_bond, max_attempts, max_undef
def map_enumerate_stereo(args):
return enumerate_stereo(*args)
def get_unspec_double_bonds(m):
def check_nei_bonds(bond):
a1, a2 = bond.GetBeginAtom(), bond.GetEndAtom()
a1_bonds_single = [b.GetBondType() == Chem.BondType.SINGLE for b in a1.GetBonds() if b.GetIdx() != bond.GetIdx()]
a2_bonds_single = [b.GetBondType() == Chem.BondType.SINGLE for b in a2.GetBonds() if b.GetIdx() != bond.GetIdx()]
# if there are two identical substituents in one side then the bond is unsteric (no stereoisomers possible)
ranks = list(Chem.CanonicalRankAtoms(m, breakTies=False))
a1_nei = [a.GetIdx() for a in a1.GetNeighbors() if a.GetIdx() != a2.GetIdx()]
if len(a1_nei) == 2 and \
all(m.GetBondBetweenAtoms(i, a1.GetIdx()).GetBondType() == Chem.BondType.SINGLE for i in a1_nei) and \
ranks[a1_nei[0]] == ranks[a1_nei[1]]:
return False
a2_nei = [a.GetIdx() for a in a2.GetNeighbors() if a.GetIdx() != a1.GetIdx()]
if len(a2_nei) == 2 and \
all(m.GetBondBetweenAtoms(i, a2.GetIdx()).GetBondType() == Chem.BondType.SINGLE for i in a2_nei) and \
ranks[a2_nei[0]] == ranks[a2_nei[1]]:
return False
# if list is empty this is a terminal atom, e.g. O in C=O
if a1_bonds_single and a2_bonds_single and \
all(a1_bonds_single) and all(a2_bonds_single):
return True
else:
return False
res = []
for b in m.GetBonds():
if b.GetBondType() == Chem.BondType.DOUBLE and \
b.GetStereo() == Chem.BondStereo.STEREONONE and \
(not b.IsInRing() or not (b.IsInRingSize(3) or b.IsInRingSize(4) or b.IsInRingSize(5) or b.IsInRingSize(6) or b.IsInRingSize(7))) and \
check_nei_bonds(b):
res.append(b.GetIdx())
return res
def set_double_bond_stereo(bond, value):
# value can be 1 or 0
def set_bond(fixed_bonds, changed_bonds, bond_value):
if bond_value:
if any(b.GetBondDir() == Chem.rdchem.BondDir.ENDDOWNRIGHT for b in fixed_bonds):
changed_bonds[0].SetBondDir(Chem.rdchem.BondDir.ENDDOWNRIGHT)
elif any(b.GetBondDir() == Chem.rdchem.BondDir.ENDUPRIGHT for b in fixed_bonds):
changed_bonds[0].SetBondDir(Chem.rdchem.BondDir.ENDUPRIGHT)
else:
if any(b.GetBondDir() == Chem.rdchem.BondDir.ENDDOWNRIGHT for b in fixed_bonds):
changed_bonds[0].SetBondDir(Chem.rdchem.BondDir.ENDUPRIGHT)
elif any(b.GetBondDir() == Chem.rdchem.BondDir.ENDUPRIGHT for b in fixed_bonds):
changed_bonds[0].SetBondDir(Chem.rdchem.BondDir.ENDDOWNRIGHT)
a1, a2 = bond.GetBeginAtom(), bond.GetEndAtom()
a1_bonds = tuple(b for b in a1.GetBonds() if b.GetIdx() != bond.GetIdx())
a2_bonds = tuple(b for b in a2.GetBonds() if b.GetIdx() != bond.GetIdx())
if a1_bonds and a2_bonds: # if double bond of carbonyl one atom does not have any other bonds - skip
if all(b.GetBondDir() == Chem.BondDir.NONE for b in a1_bonds) and \
all(b.GetBondDir() == Chem.BondDir.NONE for b in a2_bonds):
a1_bonds[0].SetBondDir(Chem.rdchem.BondDir.ENDDOWNRIGHT)
if value:
a2_bonds[0].SetBondDir(Chem.rdchem.BondDir.ENDDOWNRIGHT)
else:
a2_bonds[0].SetBondDir(Chem.rdchem.BondDir.ENDUPRIGHT)
elif all(b.GetBondDir() == Chem.BondDir.NONE for b in a1_bonds):
set_bond(a2_bonds, a1_bonds, value)
else:
set_bond(a1_bonds, a2_bonds, value)
def enumerate_double_bond_stereo(mol):
bonds = get_unspec_double_bonds(mol)
res = []
for p in product([0, 1], repeat=len(bonds)):
m = deepcopy(mol)
for value, bond in zip(p, bonds):
set_double_bond_stereo(m.GetBondWithIdx(bond), value)
Chem.AssignStereochemistry(m, force=True, cleanIt=True)
res.append(m)
return res
def enumerate_tetrahedral_stereo(mol, n_attempts=0):
output = []
chiral_undef = tuple(i[0] for i in Chem.FindMolChiralCenters(mol, includeUnassigned=True) if i[1] == '?')
# without Hs sometimes wrong stereochemistry can be optimized without errors CC(C)=CCN1CCC2(C)c3cc(O)ccc3CC1C2C
mol = Chem.AddHs(mol)
num = 0
for p in product([Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW, Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW], repeat=len(chiral_undef)):
for tag, i in zip(p, chiral_undef):
mol.GetAtomWithIdx(i).SetChiralTag(tag)
conf_id = AllChem.EmbedMolecule(mol, maxAttempts=n_attempts)
if conf_id != -1: # -1 means attempt was failed
try:
if AllChem.UFFHasAllMoleculeParams(mol):
AllChem.UFFOptimizeMolecule(mol, maxIters=10)
num += 1
output.append(deepcopy(Chem.RemoveHs(mol)))
else:
sys.stderr.write('No UFF parameters for %s\n' % (Chem.MolToSmiles(Chem.RemoveHs(mol))))
except ValueError:
continue
else:
continue
return output
def enumerate_stereo(mol, mol_name, tetrahedral, double_bond, max_attempts, max_undef):
if max_undef != -1:
undef = 0
if tetrahedral:
undef += sum(i[1] == '?' for i in Chem.FindMolChiralCenters(mol, includeUnassigned=True))
if double_bond:
undef += len(get_unspec_double_bonds(mol))
if undef > max_undef:
return []
mols = [mol]
if double_bond:
mols = enumerate_double_bond_stereo(mol)
if tetrahedral:
tmp = []
for m in mols:
tmp.extend(enumerate_tetrahedral_stereo(m, max_attempts))
mols = tmp
# filter possible duplicates and keep the order, not sure whether it is really needed
output = []
for m in mols:
smi = Chem.MolToSmiles(m, isomericSmiles=True)
if smi not in output:
output.append(smi)
return [(smi, "%s_%i" % (mol_name, i+1)) for i, smi in enumerate(output)]
def main_params(in_fname, out_fname, tetrahedral, double_bond, max_attempts, max_undef, id_field_name, ncpu, verbose):
if out_fname is not None:
fout = open(out_fname, 'wt')
nprocess = min(cpu_count(), max(ncpu, 1))
p = Pool(nprocess)
try:
for i, res in enumerate(p.imap_unordered(map_enumerate_stereo,
prep_input(in_fname, id_field_name, tetrahedral, double_bond, max_attempts, max_undef),
chunksize=10)):
if out_fname is None:
for smi, mol_name in res:
print(smi + '\t' + mol_name)
sys.stdout.flush()
else:
for smi, mol_name in res:
fout.write('%s\t%s\n' % (smi, mol_name))
if verbose and i % 100 == 0:
sys.stderr.write('\r%i molecules were processed' % i)
sys.stderr.flush()
finally:
p.close()
if out_fname is not None:
fout.close()
def main():
parser = argparse.ArgumentParser(description='Generation of stereoisomers with RDKit.')
parser.add_argument('-i', '--input', metavar='input.sdf', required=True,
help='input file in SDF or SMILES format. SMILES input should have no header, '
'the first column is SMILES string and the second column with ID is optional.')
parser.add_argument('-o', '--output', metavar='output.smi', required=False, default=None,
help='output file in SMILES format. If omitted output will be made in STDOUT.')
parser.add_argument('-t', '--tetrahedral', required=False, action='store_true', default=False,
help='generate stereoisomers for unspecified tetrahedral centers.')
parser.add_argument('-d', '--double_bond', required=False, action='store_true', default=False,
help='generate stereoisomers for unspecified double bonds.')
parser.add_argument('-a', '--max_attempts', metavar='INTEGER', default=0,
help='maximum number of attempts to embed molecule when testing its 3D structure. '
'0 means maximum possible number of attempts which will help to dentify all possible '
'stereoisomers. Reasonable values for very flexible and complex molecules 100-200 to '
'shorten search time and lost only few stereoisomers. Default: 0.')
parser.add_argument('-u', '--max_undef', metavar='INTEGER', default=-1,
help='maximum allowed number of unspecified stereocenters and/or double bonds. '
'if compound contains greater number of them it will be discarded. '
'Default: all possible stereoisomers will be enumerated '
'(beware of combinatorial explosion).')
parser.add_argument('-f', '--id_field_name', metavar='field_name', default=None,
help='SDF input: field name of compound ID. '
'If omitted molecule titles will be used or SMILES string as name.')
parser.add_argument('-c', '--ncpu', metavar='cpu_number', default=1,
help='number of cpus to use for calculation. Default: 1.')
parser.add_argument('-v', '--verbose', action='store_true', default=False,
help='print progress to STDERR.')
args = vars(parser.parse_args())
for o, v in args.items():
if o == "input": in_fname = v
if o == "output": out_fname = v
if o == "id_field_name": id_field_name = v
if o == "ncpu": ncpu = int(v)
if o == "max_undef": max_undef = int(v)
if o == "tetrahedral": tetrahedral = v
if o == "double_bond": double_bond = v
if o == "verbose": verbose = v
if o == "max_attempts": max_attempts = int(v)
if not tetrahedral and not double_bond:
print("You should specify at least one option -t or -d. Revise you command line arguments.")
exit()
main_params(in_fname=in_fname,
out_fname=out_fname,
tetrahedral=tetrahedral,
double_bond=double_bond,
max_undef=max_undef,
id_field_name=id_field_name,
max_attempts=max_attempts,
ncpu=ncpu,
verbose=verbose)
if __name__ == '__main__':
main()