Skip to content

Commit faafe25

Browse files
committed
Add filter_conf_adv.py, which makes conformer selection based on RMSD and clustering with some advanced features
1 parent dba91d6 commit faafe25

File tree

2 files changed

+132
-2
lines changed

2 files changed

+132
-2
lines changed

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,19 +68,21 @@ Some scripts may require further dependencies.
6868

6969
`filter_conf` - filter conformers by RMS value.
7070

71+
`filter_conf_adv` - selection of representative conformers by RMS value using clustering and advanced features (e.g. preferable selection of specifically labeled conformers).
72+
7173
`gen_conf_rdkit` - generate conformers.
7274

7375
`gen_stereo_rdkit` - enumerate stereoisomers (tetrahedral and double bond).
7476

75-
`get_mol_center.py` - returns a geometrical center of a molecule
77+
`get_mol_center` - returns a geometrical center of a molecule
7678

7779
`get_substr` - filter input molecules by SMARTS, multiple SMARTS are allowed, negative matching is possible.
7880

7981
`get_total_charge` - calculate total formal charge of input MOL files.
8082

8183
`keep_largest` - keep the largest fragment by the number of heavy atoms in each compound record. If components have the same number a random one will be selected.
8284

83-
`mirror_mols.py` - return mirrored 3D input structure and optionally rename it. Useful for generation of enantiomers of molecules with axial/planar chirality.
85+
`mirror_mols` - return mirrored 3D input structure and optionally rename it. Useful for generation of enantiomers of molecules with axial/planar chirality.
8486

8587
`murcko` - return Murcko scaffolds ignoring stereoconfiguration.
8688

filter_conf_adv.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
#!/usr/bin/env python3
2+
3+
__author__ = 'Pavel Polishchuk'
4+
5+
import argparse
6+
import numpy as np
7+
import sys
8+
from functools import partial
9+
from read_input import read_input, assign_conf_props_to_mol
10+
from rdkit import Chem
11+
from rdkit.Chem.AllChem import GetConformerRMSMatrix
12+
from multiprocessing import Pool, cpu_count
13+
from sklearn.cluster import AgglomerativeClustering
14+
15+
16+
def remove_confs_rms(mol, rms=0.25, keep_nconf=None, preferably_keep=None):
17+
"""
18+
The function uses AgglomerativeClustering to select conformers.
19+
20+
:param mol: input molecule with multiple conformers
21+
:param rms: discard conformers which are closer than given value to a kept conformer
22+
:param keep_nconf: keep at most the given number of conformers. This parameter has precedence over rms
23+
:param preferably_keep: a tuple with the field name and the value to preferably keep conformers having this value
24+
:return:
25+
"""
26+
27+
def gen_ids(ids):
28+
for i in range(1, len(ids)):
29+
for j in range(0, i):
30+
yield j, i
31+
32+
if keep_nconf and mol.GetNumConformers() <= keep_nconf:
33+
return mol
34+
35+
if mol.GetNumConformers() <= 1:
36+
return mol
37+
38+
mol_tmp = Chem.RemoveHs(mol) # calc rms for heavy atoms only
39+
rms_ = GetConformerRMSMatrix(mol_tmp, prealigned=True)
40+
41+
cids = [c.GetId() for c in mol_tmp.GetConformers()]
42+
arr = np.zeros((len(cids), len(cids)))
43+
for (i, j), v in zip(gen_ids(cids), rms_):
44+
arr[i, j] = v
45+
arr[j, i] = v
46+
if keep_nconf:
47+
cl = AgglomerativeClustering(n_clusters=keep_nconf, linkage='complete', metric='precomputed').fit(arr)
48+
else:
49+
cl = AgglomerativeClustering(n_clusters=None, linkage='complete', metric='precomputed', distance_threshold=rms).fit(arr)
50+
51+
keep_cids = []
52+
for i in set(cl.labels_):
53+
ids = np.where(cl.labels_ == i)[0]
54+
if preferably_keep is not None:
55+
ids_keep = [] # these are sequential ids in ids var
56+
for k, conf_id in enumerate(ids):
57+
try:
58+
if mol_tmp.GetConformer(int(conf_id)).GetProp(preferably_keep[0]) == preferably_keep[1]:
59+
ids_keep.append(k)
60+
except KeyError:
61+
print(f'{mol_tmp.GetProp("_Name")} {conf_id}')
62+
print(f'\n{mol_tmp.GetConformer(int(conf_id)).GetPropsAsDict()}')
63+
if ids_keep:
64+
j = arr[np.ix_(ids, ids)].mean(axis=0)[ids_keep].argmin()
65+
j = ids_keep[j]
66+
else:
67+
j = arr[np.ix_(ids, ids)].mean(axis=0).argmin()
68+
else:
69+
j = arr[np.ix_(ids, ids)].mean(axis=0).argmin()
70+
keep_cids.append(cids[ids[j]])
71+
remove_ids = set(cids) - set(keep_cids)
72+
73+
for cid in sorted(remove_ids, reverse=True):
74+
mol_tmp.RemoveConformer(cid)
75+
76+
return mol_tmp
77+
78+
79+
def calc(input_fname, output_fname, rms, nconf, preferably_keep, ncpu, verbose):
80+
81+
Chem.SetDefaultPickleProperties(Chem.PropertyPickleOptions.AllProps)
82+
83+
pool = Pool(max(min(cpu_count(), ncpu), 1))
84+
85+
w = Chem.SDWriter(output_fname)
86+
for i, mol in enumerate(pool.imap(partial(remove_confs_rms,
87+
rms=rms,
88+
keep_nconf=nconf,
89+
preferably_keep=preferably_keep),
90+
(mol for mol, mol_id in read_input(input_fname, sdf_confs=True))),
91+
1):
92+
for conf in mol.GetConformers():
93+
assign_conf_props_to_mol(conf, mol)
94+
w.write(mol, confId=conf.GetId())
95+
if verbose:
96+
sys.stderr.write(f'\rMolecules passed: {i}')
97+
if verbose:
98+
sys.stderr.write('\n')
99+
100+
101+
def main():
102+
parser = argparse.ArgumentParser(description='Filter out conformers in the input file. Conformers are clustered '
103+
'by RMSD and from each cluster a representative conformer is '
104+
'selected. Selected conformers can be biased by specifying '
105+
'a particular value of a conformer property (data field).')
106+
parser.add_argument('-i', '--input', metavar='FILENAME', required=True, type=str,
107+
help='input SDF.')
108+
parser.add_argument('-o', '--output', metavar='FILENAME', required=True, type=str,
109+
help='output SDF file.')
110+
parser.add_argument('-r', '--rms', metavar='NUMERIC', default=1, type=float,
111+
help='RMS threshold to discard conformers. Default: 1 (Angstrom).')
112+
parser.add_argument('-n', '--nconf', metavar='INTEGER', default=None, type=int,
113+
help='number of selected conformers. It has a precedence over an RMS argument. '
114+
'Default: None (no filtering).')
115+
parser.add_argument('-k', '--preferably_keep', metavar=('STRING', 'STRING'), default=None, nargs=2,
116+
help='name of the property field and its value to use to preferably keep corresponding '
117+
'conformers having this property value.')
118+
parser.add_argument('-c', '--ncpu', metavar='INTEGER', default=1, type=int,
119+
help='number of cpu to use for calculation. Default: 1.')
120+
parser.add_argument('-v', '--verbose', action='store_true', default=False,
121+
help='print progress to STDERR.')
122+
args = parser.parse_args()
123+
124+
calc(args.input, args.output, args.rms, args.nconf, args.preferably_keep, args.ncpu, args.verbose)
125+
126+
127+
if __name__ == '__main__':
128+
main()

0 commit comments

Comments
 (0)