fix inclusion of PRO in secondary structure

orbeckst · orbeckst · commit b38e9dbd8657 · 2025-06-11T17:21:34.000-07:00
- fix #4913 - ported fix from PyDSSP 0.9.1 by @ShintaroMinami to analysis.dssp.DSSP (see also ShintaroMinami/PyDSSP#2) - new kwarg ignore_proline_donor=True for DSSP (the new default changes the behavior and implements the fix, False recovers old behavior); the kwarg also exists in PyDSSP - updated docs - minimal regression tests - updated CHANGELOG
diff --git a/package/CHANGELOG b/package/CHANGELOG
@@ -21,6 +21,8 @@ The rules for this file:
  * 2.10.0
 
 Fixes
+ * Fixes incorrect assignment of secondary structure to proline residues in
+   DSSP by porting upstream PyDSSP 0.9.1 fix (Issue #4913)   
  * Fix incorrect `self.atom` assignment in SingleFrameReaderBase (Issue #5052, PR #5055)
  * Fixes bug in `analysis/hydrogenbonds.py`: `_donors` and `_hydrogens`
    were not updated when the `acceptors_sel` and `hydrogens_sel` were provided
@@ -36,6 +38,9 @@ Fixes
    directly passing them. (Issue #3520, PR #5006)
 
 Enhancements
+ * Adds new kwarg `ignore_proline_donor=True` to analysis.dssp.DSSP so that
+   proline HN are not used as H-bond donors in secondary structure
+   calculation. Set to `False` to recover behavior from <2.10.0 (Issue #4913)   
  * Improve parsing of topology information from LAMMPS dump files to allow
    reading of mass, charge and element attributes. (Issue #3449, PR #4995)
  * Added timestep support for XDR writers and readers (Issue #4905, PR #4908)
diff --git a/package/MDAnalysis/analysis/dssp/dssp.py b/package/MDAnalysis/analysis/dssp/dssp.py
@@ -112,27 +112,27 @@
    :inherited-members:
 
    .. attribute:: results.dssp
-   
-      Contains the time series of the DSSP assignment as a 
+
+      Contains the time series of the DSSP assignment as a
       :class:`numpy.ndarray` array of shape ``(n_frames, n_residues)`` where each row
-      contains the assigned secondary structure character for each residue (whose 
+      contains the assigned secondary structure character for each residue (whose
       corresponding resid is stored in :attr:`results.resids`). The three characters
       are ['H', 'E', '-'] and representi alpha-helix, sheet and loop, respectively.
 
    .. attribute:: results.dssp_ndarray
-   
+
       Contains the one-hot encoding of the time series of the DSSP assignment
-      as a :class:`numpy.ndarray` Boolean array of shape ``(n_frames, n_residues, 3)`` 
+      as a :class:`numpy.ndarray` Boolean array of shape ``(n_frames, n_residues, 3)``
       where for each residue the encoding is stored as ``(3,)`` shape
-      :class:`numpy.ndarray` of Booleans so that ``True`` at index 0 represents loop 
-      ('-'), ``True`` at index 1 represents helix ('H'), and ``True`` at index 2 
+      :class:`numpy.ndarray` of Booleans so that ``True`` at index 0 represents loop
+      ('-'), ``True`` at index 1 represents helix ('H'), and ``True`` at index 2
       represents sheet 'E'.
 
       .. SeeAlso:: :func:`translate`
-      
+
 
    .. attribute:: results.resids
-   
+
       A :class:`numpy.ndarray` of length ``n_residues`` that contains the residue IDs
       (resids) for the protein residues that were assigned a secondary structure.
 
@@ -144,7 +144,7 @@
 .. autofunction:: translate
 """
 
-from typing import Union
+from typing import Union, Optional
 import numpy as np
 from MDAnalysis import Universe, AtomGroup
 
@@ -234,6 +234,15 @@ class DSSP(AnalysisBase):
            (e.g., a :exc:`ValueError` is raised) you must customize `hydrogen_name`
            for your specific case.
 
+    ignore_proline_donor : bool, optional, default ``True``
+         If ``True`` (the default) do not consider HN on proline to be a hydrogen
+         donor for the purpose of calculating secondary structure.
+
+         .. versionadded:: 2.10.0
+            The default ``True`` is the correct DSSP behavior and fixes the earlier
+            implementation. Setting to ``False`` recovers behavior from 2.9.x and
+            earlier.
+
 
     Raises
     ------
@@ -280,6 +289,10 @@ class DSSP(AnalysisBase):
        Enabled **parallel execution** with the ``multiprocessing`` and ``dask``
        backends; use the new method :meth:`get_supported_backends` to see all
        supported backends.
+
+    .. versionchanged:: 2.10.0
+       Change treatment of proline and follow pydssp 0.9.1 (with
+       ``ignore_proline_donor=True``).
     """
 
     _analysis_algorithm_is_parallelizable = True
@@ -299,6 +312,7 @@ def __init__(
         *,
         heavyatom_names: tuple[str] = ("N", "CA", "C", "O O1 OT1"),
         hydrogen_name: str = "H HN HT1 HT2 HT3",
+        ignore_proline_donor: bool = True,
     ):
         self._guess_hydrogens = guess_hydrogens
 
@@ -315,6 +329,7 @@ def __init__(
             ]
             for t in heavyatom_names
         }
+        self._donor_mask: Optional[np.ndarray] = ag.residues.resnames != 'PRO' if ignore_proline_donor else None
         self._hydrogens: list["AtomGroup"] = [
             res.atoms.select_atoms(f"name {hydrogen_name}")
             for res in ag.residues
@@ -391,7 +406,7 @@ def _get_coords(self) -> np.ndarray:
 
     def _single_frame(self):
         coords = self._get_coords()
-        dssp = assign(coords)
+        dssp = assign(coords, donor_mask=self._donor_mask)
         self.results.dssp_ndarray.append(dssp)
 
     def _conclude(self):
diff --git a/package/MDAnalysis/analysis/dssp/pydssp_numpy.py b/package/MDAnalysis/analysis/dssp/pydssp_numpy.py
@@ -74,7 +74,7 @@ def _unfold(a: np.ndarray, window: int, axis: int):
 
 
 def _get_hydrogen_atom_position(coord: np.ndarray) -> np.ndarray:
-    """Fills in hydrogen atoms positions if they are abscent, under the
+    """Fills in hydrogen atoms positions if they are absent, under the
     assumption that C-N-H and H-N-CA angles are perfect 120 degrees,
     and N-H bond length is 1.01 A.
 
@@ -118,6 +118,7 @@ def _get_hydrogen_atom_position(coord: np.ndarray) -> np.ndarray:
 
 def get_hbond_map(
     coord: np.ndarray,
+    donor_mask: np.ndarray = None,
     cutoff: float = DEFAULT_CUTOFF,
     margin: float = DEFAULT_MARGIN,
     return_e: bool = False,
@@ -130,6 +131,12 @@ def get_hbond_map(
         input coordinates in either (n, 4, 3) or (n, 5, 3) shape
         (without or with hydrogens). If hydrogens are not present, then
         ideal positions (see :func:_get_hydrogen_atom_positions) are used.
+    donor_mask : np.array
+         Mask out any hydrogens that should not be considered (in particular HN
+         in PRO). If ``None`` then all H will be used (behavior up to 2.9.0).
+
+         .. versionadded:: 2.10.0
+
     cutoff : float, optional
         cutoff, by default DEFAULT_CUTOFF
     margin : float, optional
@@ -144,6 +151,10 @@ def get_hbond_map(
 
 
     .. versionadded:: 2.8.0
+
+    .. versionchanged:: 2.10.0
+       Support masking of hydrogen donors via `donor_mask` (especially needed
+       for ignoring HN on proline residues). Backport of PRO fix from pydssp 0.9.1.
     """
     n_atoms, n_atom_types, _ = coord.shape
     assert n_atom_types in (
@@ -194,15 +205,21 @@ def get_hbond_map(
     local_mask = ~np.eye(n_atoms, dtype=bool)
     local_mask *= ~np.diag(np.ones(n_atoms - 1, dtype=bool), k=-1)
     local_mask *= ~np.diag(np.ones(n_atoms - 2, dtype=bool), k=-2)
+    # mask for donor H absence (Proline)
+    donor_mask = np.array(donor_mask).astype(float) if donor_mask is not None else np.ones(n_atoms, dtype=float)
+    donor_mask = np.tile(donor_mask[:, np.newaxis], (1, n_atoms))
     # hydrogen bond map (continuous value extension of original definition)
     hbond_map = np.clip(cutoff - margin - e, a_min=-margin, a_max=margin)
     hbond_map = (np.sin(hbond_map / margin * np.pi / 2) + 1.0) / 2
-    hbond_map = hbond_map * local_mask
+    hbond_map *= local_mask
+    hbond_map *= donor_mask
 
     return hbond_map
 
 
-def assign(coord: np.ndarray) -> np.ndarray:
+def assign(coord: np.ndarray,
+           donor_mask: np.ndarray = None,
+) -> np.ndarray:
     """Assigns secondary structure for a given coordinate array,
     either with or without assigned hydrogens
 
@@ -214,6 +231,12 @@ def assign(coord: np.ndarray) -> np.ndarray:
         (N, CA, C, O) atoms coordinates (if k=4), or (N, CA, C, O, H) coordinates
         (when k=5).
 
+    donor_mask : np.array
+         Mask out any hydrogens that should not be considered (in particular HN
+         in PRO). If ``None`` then all H will be used (behavior up to 2.9.0).
+
+         .. versionadded:: 2.10.0
+
     Returns
     -------
     np.ndarray
@@ -222,9 +245,13 @@ def assign(coord: np.ndarray) -> np.ndarray:
 
 
     .. versionadded:: 2.8.0
+
+    .. versionchanged:: 2.10.0
+       Support masking of donors.
+
     """
     # get hydrogen bond map
-    hbmap = get_hbond_map(coord)
+    hbmap = get_hbond_map(coord, donor_mask=donor_mask)
     hbmap = np.swapaxes(hbmap, -1, -2)  # convert into "i:C=O, j:N-H" form
 
     # identify turn 3, 4, 5
diff --git a/testsuite/MDAnalysisTests/analysis/test_dssp.py b/testsuite/MDAnalysisTests/analysis/test_dssp.py
@@ -13,15 +13,35 @@
     "pdb_filename", glob.glob(f"{DSSP_FOLDER}/?????.pdb.gz")
 )
 def test_file_guess_hydrogens(pdb_filename, client_DSSP):
+    # run 2.9.0 tests (which include PRO)
+    # ignore_proline_donor=False
+    # TODO: update reference data for ignore_proline_donor=True
     u = mda.Universe(pdb_filename)
     with open(f"{pdb_filename.rstrip('.gz')}.dssp", "r") as fin:
         correct_answ = fin.read().strip().split()[0]
 
-    run = DSSP(u, guess_hydrogens=True).run(**client_DSSP)
+    run = DSSP(u, guess_hydrogens=True, ignore_proline_donor=False).run(**client_DSSP)
     answ = "".join(run.results.dssp[0])
     assert answ == correct_answ
 
 
+@pytest.mark.parametrize(
+    "pdb_filename", [f"{DSSP_FOLDER}/{PDBID}.pdb.gz" for PDBID in ("1eteA", "3aqgA", "3gknA", "3nzmA", "3a4rA", "3l4rA", "2j49A", "3gfsA",)]
+)
+def test_file_ignore_proline_donor(pdb_filename, client_DSSP):
+    # weak test: just check that for some structures the two methods give different results
+    u = mda.Universe(pdb_filename)
+
+    run_with_pro = DSSP(u, guess_hydrogens=True, ignore_proline_donor=False).run(**client_DSSP)
+    answ_with_pro = "".join(run_with_pro.results.dssp[0])
+
+    # ignore_proline_donor=True is the default:
+    run_ignore_pro = DSSP(u, guess_hydrogens=True).run(**client_DSSP)
+    answ_ignore_pro = "".join(run_ignore_pro.results.dssp[0])
+
+    assert answ_ignore_pro != answ_with_pro
+
+
 def test_trajectory(client_DSSP):
     u = mda.Universe(TPR, XTC).select_atoms("protein").universe
     run = DSSP(u).run(**client_DSSP, stop=10)
@@ -32,8 +52,6 @@ def test_trajectory(client_DSSP):
     assert (
         first_frame[:10] != last_frame[:10] == avg_frame[:10] == "-EEEEEE---"
     )
-    protein = mda.Universe(TPR, XTC).select_atoms("protein")
-    run = DSSP(protein).run(**client_DSSP, stop=10)
 
 
 def test_atomgroup(client_DSSP):