Skip to content

Commit b38e9db

Browse files
committed
fix inclusion of PRO in secondary structure
- fix #4913 - ported fix from PyDSSP 0.9.1 by @ShintaroMinami to analysis.dssp.DSSP (see also ShintaroMinami/PyDSSP#2) - new kwarg ignore_proline_donor=True for DSSP (the new default changes the behavior and implements the fix, False recovers old behavior); the kwarg also exists in PyDSSP - updated docs - minimal regression tests - updated CHANGELOG
1 parent 0dc5a1b commit b38e9db

File tree

4 files changed

+83
-18
lines changed

4 files changed

+83
-18
lines changed

package/CHANGELOG

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ The rules for this file:
2121
* 2.10.0
2222

2323
Fixes
24+
* Fixes incorrect assignment of secondary structure to proline residues in
25+
DSSP by porting upstream PyDSSP 0.9.1 fix (Issue #4913)
2426
* Fix incorrect `self.atom` assignment in SingleFrameReaderBase (Issue #5052, PR #5055)
2527
* Fixes bug in `analysis/hydrogenbonds.py`: `_donors` and `_hydrogens`
2628
were not updated when the `acceptors_sel` and `hydrogens_sel` were provided
@@ -36,6 +38,9 @@ Fixes
3638
directly passing them. (Issue #3520, PR #5006)
3739

3840
Enhancements
41+
* Adds new kwarg `ignore_proline_donor=True` to analysis.dssp.DSSP so that
42+
proline HN are not used as H-bond donors in secondary structure
43+
calculation. Set to `False` to recover behavior from <2.10.0 (Issue #4913)
3944
* Improve parsing of topology information from LAMMPS dump files to allow
4045
reading of mass, charge and element attributes. (Issue #3449, PR #4995)
4146
* Added timestep support for XDR writers and readers (Issue #4905, PR #4908)

package/MDAnalysis/analysis/dssp/dssp.py

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -112,27 +112,27 @@
112112
:inherited-members:
113113
114114
.. attribute:: results.dssp
115-
116-
Contains the time series of the DSSP assignment as a
115+
116+
Contains the time series of the DSSP assignment as a
117117
:class:`numpy.ndarray` array of shape ``(n_frames, n_residues)`` where each row
118-
contains the assigned secondary structure character for each residue (whose
118+
contains the assigned secondary structure character for each residue (whose
119119
corresponding resid is stored in :attr:`results.resids`). The three characters
120120
are ['H', 'E', '-'] and representi alpha-helix, sheet and loop, respectively.
121121
122122
.. attribute:: results.dssp_ndarray
123-
123+
124124
Contains the one-hot encoding of the time series of the DSSP assignment
125-
as a :class:`numpy.ndarray` Boolean array of shape ``(n_frames, n_residues, 3)``
125+
as a :class:`numpy.ndarray` Boolean array of shape ``(n_frames, n_residues, 3)``
126126
where for each residue the encoding is stored as ``(3,)`` shape
127-
:class:`numpy.ndarray` of Booleans so that ``True`` at index 0 represents loop
128-
('-'), ``True`` at index 1 represents helix ('H'), and ``True`` at index 2
127+
:class:`numpy.ndarray` of Booleans so that ``True`` at index 0 represents loop
128+
('-'), ``True`` at index 1 represents helix ('H'), and ``True`` at index 2
129129
represents sheet 'E'.
130130
131131
.. SeeAlso:: :func:`translate`
132-
132+
133133
134134
.. attribute:: results.resids
135-
135+
136136
A :class:`numpy.ndarray` of length ``n_residues`` that contains the residue IDs
137137
(resids) for the protein residues that were assigned a secondary structure.
138138
@@ -144,7 +144,7 @@
144144
.. autofunction:: translate
145145
"""
146146

147-
from typing import Union
147+
from typing import Union, Optional
148148
import numpy as np
149149
from MDAnalysis import Universe, AtomGroup
150150

@@ -234,6 +234,15 @@ class DSSP(AnalysisBase):
234234
(e.g., a :exc:`ValueError` is raised) you must customize `hydrogen_name`
235235
for your specific case.
236236
237+
ignore_proline_donor : bool, optional, default ``True``
238+
If ``True`` (the default) do not consider HN on proline to be a hydrogen
239+
donor for the purpose of calculating secondary structure.
240+
241+
.. versionadded:: 2.10.0
242+
The default ``True`` is the correct DSSP behavior and fixes the earlier
243+
implementation. Setting to ``False`` recovers behavior from 2.9.x and
244+
earlier.
245+
237246
238247
Raises
239248
------
@@ -280,6 +289,10 @@ class DSSP(AnalysisBase):
280289
Enabled **parallel execution** with the ``multiprocessing`` and ``dask``
281290
backends; use the new method :meth:`get_supported_backends` to see all
282291
supported backends.
292+
293+
.. versionchanged:: 2.10.0
294+
Change treatment of proline and follow pydssp 0.9.1 (with
295+
``ignore_proline_donor=True``).
283296
"""
284297

285298
_analysis_algorithm_is_parallelizable = True
@@ -299,6 +312,7 @@ def __init__(
299312
*,
300313
heavyatom_names: tuple[str] = ("N", "CA", "C", "O O1 OT1"),
301314
hydrogen_name: str = "H HN HT1 HT2 HT3",
315+
ignore_proline_donor: bool = True,
302316
):
303317
self._guess_hydrogens = guess_hydrogens
304318

@@ -315,6 +329,7 @@ def __init__(
315329
]
316330
for t in heavyatom_names
317331
}
332+
self._donor_mask: Optional[np.ndarray] = ag.residues.resnames != 'PRO' if ignore_proline_donor else None
318333
self._hydrogens: list["AtomGroup"] = [
319334
res.atoms.select_atoms(f"name {hydrogen_name}")
320335
for res in ag.residues
@@ -391,7 +406,7 @@ def _get_coords(self) -> np.ndarray:
391406

392407
def _single_frame(self):
393408
coords = self._get_coords()
394-
dssp = assign(coords)
409+
dssp = assign(coords, donor_mask=self._donor_mask)
395410
self.results.dssp_ndarray.append(dssp)
396411

397412
def _conclude(self):

package/MDAnalysis/analysis/dssp/pydssp_numpy.py

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def _unfold(a: np.ndarray, window: int, axis: int):
7474

7575

7676
def _get_hydrogen_atom_position(coord: np.ndarray) -> np.ndarray:
77-
"""Fills in hydrogen atoms positions if they are abscent, under the
77+
"""Fills in hydrogen atoms positions if they are absent, under the
7878
assumption that C-N-H and H-N-CA angles are perfect 120 degrees,
7979
and N-H bond length is 1.01 A.
8080
@@ -118,6 +118,7 @@ def _get_hydrogen_atom_position(coord: np.ndarray) -> np.ndarray:
118118

119119
def get_hbond_map(
120120
coord: np.ndarray,
121+
donor_mask: np.ndarray = None,
121122
cutoff: float = DEFAULT_CUTOFF,
122123
margin: float = DEFAULT_MARGIN,
123124
return_e: bool = False,
@@ -130,6 +131,12 @@ def get_hbond_map(
130131
input coordinates in either (n, 4, 3) or (n, 5, 3) shape
131132
(without or with hydrogens). If hydrogens are not present, then
132133
ideal positions (see :func:_get_hydrogen_atom_positions) are used.
134+
donor_mask : np.array
135+
Mask out any hydrogens that should not be considered (in particular HN
136+
in PRO). If ``None`` then all H will be used (behavior up to 2.9.0).
137+
138+
.. versionadded:: 2.10.0
139+
133140
cutoff : float, optional
134141
cutoff, by default DEFAULT_CUTOFF
135142
margin : float, optional
@@ -144,6 +151,10 @@ def get_hbond_map(
144151
145152
146153
.. versionadded:: 2.8.0
154+
155+
.. versionchanged:: 2.10.0
156+
Support masking of hydrogen donors via `donor_mask` (especially needed
157+
for ignoring HN on proline residues). Backport of PRO fix from pydssp 0.9.1.
147158
"""
148159
n_atoms, n_atom_types, _ = coord.shape
149160
assert n_atom_types in (
@@ -194,15 +205,21 @@ def get_hbond_map(
194205
local_mask = ~np.eye(n_atoms, dtype=bool)
195206
local_mask *= ~np.diag(np.ones(n_atoms - 1, dtype=bool), k=-1)
196207
local_mask *= ~np.diag(np.ones(n_atoms - 2, dtype=bool), k=-2)
208+
# mask for donor H absence (Proline)
209+
donor_mask = np.array(donor_mask).astype(float) if donor_mask is not None else np.ones(n_atoms, dtype=float)
210+
donor_mask = np.tile(donor_mask[:, np.newaxis], (1, n_atoms))
197211
# hydrogen bond map (continuous value extension of original definition)
198212
hbond_map = np.clip(cutoff - margin - e, a_min=-margin, a_max=margin)
199213
hbond_map = (np.sin(hbond_map / margin * np.pi / 2) + 1.0) / 2
200-
hbond_map = hbond_map * local_mask
214+
hbond_map *= local_mask
215+
hbond_map *= donor_mask
201216

202217
return hbond_map
203218

204219

205-
def assign(coord: np.ndarray) -> np.ndarray:
220+
def assign(coord: np.ndarray,
221+
donor_mask: np.ndarray = None,
222+
) -> np.ndarray:
206223
"""Assigns secondary structure for a given coordinate array,
207224
either with or without assigned hydrogens
208225
@@ -214,6 +231,12 @@ def assign(coord: np.ndarray) -> np.ndarray:
214231
(N, CA, C, O) atoms coordinates (if k=4), or (N, CA, C, O, H) coordinates
215232
(when k=5).
216233
234+
donor_mask : np.array
235+
Mask out any hydrogens that should not be considered (in particular HN
236+
in PRO). If ``None`` then all H will be used (behavior up to 2.9.0).
237+
238+
.. versionadded:: 2.10.0
239+
217240
Returns
218241
-------
219242
np.ndarray
@@ -222,9 +245,13 @@ def assign(coord: np.ndarray) -> np.ndarray:
222245
223246
224247
.. versionadded:: 2.8.0
248+
249+
.. versionchanged:: 2.10.0
250+
Support masking of donors.
251+
225252
"""
226253
# get hydrogen bond map
227-
hbmap = get_hbond_map(coord)
254+
hbmap = get_hbond_map(coord, donor_mask=donor_mask)
228255
hbmap = np.swapaxes(hbmap, -1, -2) # convert into "i:C=O, j:N-H" form
229256

230257
# identify turn 3, 4, 5

testsuite/MDAnalysisTests/analysis/test_dssp.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,35 @@
1313
"pdb_filename", glob.glob(f"{DSSP_FOLDER}/?????.pdb.gz")
1414
)
1515
def test_file_guess_hydrogens(pdb_filename, client_DSSP):
16+
# run 2.9.0 tests (which include PRO)
17+
# ignore_proline_donor=False
18+
# TODO: update reference data for ignore_proline_donor=True
1619
u = mda.Universe(pdb_filename)
1720
with open(f"{pdb_filename.rstrip('.gz')}.dssp", "r") as fin:
1821
correct_answ = fin.read().strip().split()[0]
1922

20-
run = DSSP(u, guess_hydrogens=True).run(**client_DSSP)
23+
run = DSSP(u, guess_hydrogens=True, ignore_proline_donor=False).run(**client_DSSP)
2124
answ = "".join(run.results.dssp[0])
2225
assert answ == correct_answ
2326

2427

28+
@pytest.mark.parametrize(
29+
"pdb_filename", [f"{DSSP_FOLDER}/{PDBID}.pdb.gz" for PDBID in ("1eteA", "3aqgA", "3gknA", "3nzmA", "3a4rA", "3l4rA", "2j49A", "3gfsA",)]
30+
)
31+
def test_file_ignore_proline_donor(pdb_filename, client_DSSP):
32+
# weak test: just check that for some structures the two methods give different results
33+
u = mda.Universe(pdb_filename)
34+
35+
run_with_pro = DSSP(u, guess_hydrogens=True, ignore_proline_donor=False).run(**client_DSSP)
36+
answ_with_pro = "".join(run_with_pro.results.dssp[0])
37+
38+
# ignore_proline_donor=True is the default:
39+
run_ignore_pro = DSSP(u, guess_hydrogens=True).run(**client_DSSP)
40+
answ_ignore_pro = "".join(run_ignore_pro.results.dssp[0])
41+
42+
assert answ_ignore_pro != answ_with_pro
43+
44+
2545
def test_trajectory(client_DSSP):
2646
u = mda.Universe(TPR, XTC).select_atoms("protein").universe
2747
run = DSSP(u).run(**client_DSSP, stop=10)
@@ -32,8 +52,6 @@ def test_trajectory(client_DSSP):
3252
assert (
3353
first_frame[:10] != last_frame[:10] == avg_frame[:10] == "-EEEEEE---"
3454
)
35-
protein = mda.Universe(TPR, XTC).select_atoms("protein")
36-
run = DSSP(protein).run(**client_DSSP, stop=10)
3755

3856

3957
def test_atomgroup(client_DSSP):

0 commit comments

Comments
 (0)