-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy pathsanitize_rdkit.py
executable file
·133 lines (104 loc) · 5.48 KB
/
sanitize_rdkit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python3
__author__ = 'Pavel Polishchuk'
import sys
import argparse
from rdkit import Chem
from read_input import read_input
def get_unspec_double_bonds(m):
def check_nei_bonds(bond):
a1, a2 = bond.GetBeginAtom(), bond.GetEndAtom()
a1_bonds_single = [b.GetBondType() == Chem.BondType.SINGLE for b in a1.GetBonds() if b.GetIdx() != bond.GetIdx()]
a2_bonds_single = [b.GetBondType() == Chem.BondType.SINGLE for b in a2.GetBonds() if b.GetIdx() != bond.GetIdx()]
# if there are two identical substituents in one side then the bond is unsteric (no stereoisomers possible)
ranks = list(Chem.CanonicalRankAtoms(m, breakTies=False))
a1_nei = [a.GetIdx() for a in a1.GetNeighbors() if a.GetIdx() != a2.GetIdx()]
if len(a1_nei) == 2 and \
all(m.GetBondBetweenAtoms(i, a1.GetIdx()).GetBondType() == Chem.BondType.SINGLE for i in a1_nei) and \
ranks[a1_nei[0]] == ranks[a1_nei[1]]:
return False
a2_nei = [a.GetIdx() for a in a2.GetNeighbors() if a.GetIdx() != a1.GetIdx()]
if len(a2_nei) == 2 and \
all(m.GetBondBetweenAtoms(i, a2.GetIdx()).GetBondType() == Chem.BondType.SINGLE for i in a2_nei) and \
ranks[a2_nei[0]] == ranks[a2_nei[1]]:
return False
# if list is empty this is a terminal atom, e.g. O in C=O
if a1_bonds_single and a2_bonds_single and \
all(a1_bonds_single) and all(a2_bonds_single):
return True
else:
return False
res = []
for b in m.GetBonds():
if b.GetBondType() == Chem.BondType.DOUBLE and \
b.GetStereo() == Chem.BondStereo.STEREONONE and \
(not b.IsInRing() or not (b.IsInRingSize(3) or b.IsInRingSize(4) or b.IsInRingSize(5) or b.IsInRingSize(6) or b.IsInRingSize(7))) and \
check_nei_bonds(b):
res.append(b.GetIdx())
return res
def calc(in_fname, out_fname, error_fname, input_format, output_format):
if out_fname is None:
out_fname = '/dev/stdout'
if error_fname is None:
error_fname = '/dev/stderr'
if output_format == 'sdf':
wout = Chem.SDWriter(out_fname)
else:
print("Error output format.")
exit()
werr = Chem.SmilesWriter(error_fname, delimiter='\t', isomericSmiles=True)
werr.SetProps(['Sanitization_error'])
for mol, mol_name in read_input(in_fname, input_format=input_format, sanitize=False):
if mol is not None:
err = Chem.SanitizeMol(mol, catchErrors=True)
if err:
sys.stderr.write('Error %i sanitizing molecule %s\n' % (err, mol_name))
sys.stderr.flush()
mol.SetProp('Sanitization_error', str(err))
werr.write(mol)
else:
try:
Chem.Kekulize(mol)
double = len(get_unspec_double_bonds(mol))
if double:
mol.SetProp("Double bonds", str(double))
tetra = sum(i[1] == '?' for i in Chem.FindMolChiralCenters(mol, includeUnassigned=True))
if tetra:
mol.SetProp("Undefined stereocenters", str(tetra))
chg = Chem.GetFormalCharge(mol)
if chg:
mol.SetProp("Charge", str(chg))
wout.write(mol)
except:
mol.SetProp('Sanitization_error', 'Kekulization failed')
werr.write(mol)
werr.close()
wout.close()
def main():
parser = argparse.ArgumentParser(description='Removes compounds with rdkit sanitization errors and add to output '
'molecules number of double bonds, unspecified stereocenters and '
'total charge.')
parser.add_argument('-i', '--input', metavar='input.sdf', required=False, default=None,
help='input file in SDF or SMILES format. SMILES input should have no header, '
'the first column is SMILES string and the second column with ID is optional. '
'If omitted STDIN will be read as SDF format.')
parser.add_argument('-f', '--input_format', metavar='sdf', required=False, default='sdf',
help='input file format. Default: sdf.')
parser.add_argument('-o', '--output', metavar='output.sdf', required=False, default=None,
help='output file in SDF format. If omitted output will be redirected to STDOUT.')
parser.add_argument('-g', '--output_format', metavar='sdf', required=False, default='sdf',
help='output file format. Default: sdf.')
parser.add_argument('-e', '--error_output', metavar='output.smi', required=False, default=None,
help='output SMILES file for compounds having sanitization errors. '
'If omitted output will be redirected to STDERR.')
args = vars(parser.parse_args())
for o, v in args.items():
if o == "input": in_fname = v
if o == "output": out_fname = v
if o == "error_output": error_fname = v
if o == "input_format": input_format = v
if o == "output_format": output_format = v
if in_fname == "/dev/stdin":
in_fname = None
calc(in_fname, out_fname, error_fname, input_format, output_format)
if __name__ == '__main__':
main()