Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions package/CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ The rules for this file:
* 2.10.0

Fixes
* Fixes incorrect assignment of secondary structure to proline residues in
DSSP by porting upstream PyDSSP 0.9.1 fix (Issue #4913)
* Fix incorrect `self.atom` assignment in SingleFrameReaderBase (Issue #5052, PR #5055)
* Fixes bug in `analysis/hydrogenbonds.py`: `_donors` and `_hydrogens`
were not updated when the `acceptors_sel` and `hydrogens_sel` were provided
Expand All @@ -36,6 +38,9 @@ Fixes
directly passing them. (Issue #3520, PR #5006)

Enhancements
* Adds new kwarg `ignore_proline_donor=True` to analysis.dssp.DSSP so that
proline HN are not used as H-bond donors in secondary structure
calculation. Set to `False` to recover behavior from <2.10.0 (Issue #4913)
* Improve parsing of topology information from LAMMPS dump files to allow
reading of mass, charge and element attributes. (Issue #3449, PR #4995)
* Added timestep support for XDR writers and readers (Issue #4905, PR #4908)
Expand Down
39 changes: 28 additions & 11 deletions package/MDAnalysis/analysis/dssp/dssp.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,27 +112,27 @@
:inherited-members:

.. attribute:: results.dssp
Contains the time series of the DSSP assignment as a

Contains the time series of the DSSP assignment as a
:class:`numpy.ndarray` array of shape ``(n_frames, n_residues)`` where each row
contains the assigned secondary structure character for each residue (whose
contains the assigned secondary structure character for each residue (whose
corresponding resid is stored in :attr:`results.resids`). The three characters
are ['H', 'E', '-'] and representi alpha-helix, sheet and loop, respectively.

.. attribute:: results.dssp_ndarray

Contains the one-hot encoding of the time series of the DSSP assignment
as a :class:`numpy.ndarray` Boolean array of shape ``(n_frames, n_residues, 3)``
as a :class:`numpy.ndarray` Boolean array of shape ``(n_frames, n_residues, 3)``
where for each residue the encoding is stored as ``(3,)`` shape
:class:`numpy.ndarray` of Booleans so that ``True`` at index 0 represents loop
('-'), ``True`` at index 1 represents helix ('H'), and ``True`` at index 2
:class:`numpy.ndarray` of Booleans so that ``True`` at index 0 represents loop
('-'), ``True`` at index 1 represents helix ('H'), and ``True`` at index 2
represents sheet 'E'.

.. SeeAlso:: :func:`translate`


.. attribute:: results.resids

A :class:`numpy.ndarray` of length ``n_residues`` that contains the residue IDs
(resids) for the protein residues that were assigned a secondary structure.

Expand All @@ -144,7 +144,7 @@
.. autofunction:: translate
"""

from typing import Union
from typing import Union, Optional
import numpy as np
from MDAnalysis import Universe, AtomGroup

Expand Down Expand Up @@ -234,6 +234,15 @@ class DSSP(AnalysisBase):
(e.g., a :exc:`ValueError` is raised) you must customize `hydrogen_name`
for your specific case.

ignore_proline_donor : bool, optional, default ``True``
If ``True`` (the default) do not consider HN on proline to be a hydrogen
donor for the purpose of calculating secondary structure.

.. versionadded:: 2.10.0
The default ``True`` is the correct DSSP behavior and fixes the earlier
implementation. Setting to ``False`` recovers behavior from 2.9.x and
earlier.


Raises
------
Expand Down Expand Up @@ -280,6 +289,10 @@ class DSSP(AnalysisBase):
Enabled **parallel execution** with the ``multiprocessing`` and ``dask``
backends; use the new method :meth:`get_supported_backends` to see all
supported backends.

.. versionchanged:: 2.10.0
Change treatment of proline and follow pydssp 0.9.1 (with
``ignore_proline_donor=True``).
"""

_analysis_algorithm_is_parallelizable = True
Expand All @@ -299,6 +312,7 @@ def __init__(
*,
heavyatom_names: tuple[str] = ("N", "CA", "C", "O O1 OT1"),
hydrogen_name: str = "H HN HT1 HT2 HT3",
ignore_proline_donor: bool = True,
):
self._guess_hydrogens = guess_hydrogens

Expand All @@ -315,6 +329,9 @@ def __init__(
]
for t in heavyatom_names
}
self._donor_mask: Optional[np.ndarray] = (
ag.residues.resnames != "PRO" if ignore_proline_donor else None
)
Comment on lines +332 to +334
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This may not be correct. The code runs ... but I am not sure if I should be masking corresponding atoms.

self._hydrogens: list["AtomGroup"] = [
res.atoms.select_atoms(f"name {hydrogen_name}")
for res in ag.residues
Expand Down Expand Up @@ -391,7 +408,7 @@ def _get_coords(self) -> np.ndarray:

def _single_frame(self):
coords = self._get_coords()
dssp = assign(coords)
dssp = assign(coords, donor_mask=self._donor_mask)
self.results.dssp_ndarray.append(dssp)

def _conclude(self):
Expand Down
40 changes: 36 additions & 4 deletions package/MDAnalysis/analysis/dssp/pydssp_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def _unfold(a: np.ndarray, window: int, axis: int):


def _get_hydrogen_atom_position(coord: np.ndarray) -> np.ndarray:
"""Fills in hydrogen atoms positions if they are abscent, under the
"""Fills in hydrogen atoms positions if they are absent, under the
Copy link

Copilot AI Sep 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed spelling error: 'abscent' should be 'absent'.

Copilot uses AI. Check for mistakes.
assumption that C-N-H and H-N-CA angles are perfect 120 degrees,
and N-H bond length is 1.01 A.

Expand Down Expand Up @@ -118,6 +118,7 @@ def _get_hydrogen_atom_position(coord: np.ndarray) -> np.ndarray:

def get_hbond_map(
coord: np.ndarray,
donor_mask: np.ndarray = None,
cutoff: float = DEFAULT_CUTOFF,
margin: float = DEFAULT_MARGIN,
return_e: bool = False,
Expand All @@ -130,6 +131,12 @@ def get_hbond_map(
input coordinates in either (n, 4, 3) or (n, 5, 3) shape
(without or with hydrogens). If hydrogens are not present, then
ideal positions (see :func:_get_hydrogen_atom_positions) are used.
donor_mask : np.array
Mask out any hydrogens that should not be considered (in particular HN
in PRO). If ``None`` then all H will be used (behavior up to 2.9.0).
Comment on lines +135 to +136
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These docs should be more specific and state the shape. I just quickly guessed the shape from https://github.com/ShintaroMinami/PyDSSP/blob/e251a43ff8622fe0a555313b1567edce45e789e8/scripts/pydssp#L30

donor_mask = sequence != 'PRO' if args.ignore_proline_donor else None


.. versionadded:: 2.10.0

cutoff : float, optional
cutoff, by default DEFAULT_CUTOFF
margin : float, optional
Expand All @@ -144,6 +151,10 @@ def get_hbond_map(


.. versionadded:: 2.8.0

.. versionchanged:: 2.10.0
Support masking of hydrogen donors via `donor_mask` (especially needed
for ignoring HN on proline residues). Backport of PRO fix from pydssp 0.9.1.
"""
n_atoms, n_atom_types, _ = coord.shape
assert n_atom_types in (
Expand Down Expand Up @@ -194,15 +205,26 @@ def get_hbond_map(
local_mask = ~np.eye(n_atoms, dtype=bool)
local_mask *= ~np.diag(np.ones(n_atoms - 1, dtype=bool), k=-1)
local_mask *= ~np.diag(np.ones(n_atoms - 2, dtype=bool), k=-2)
# mask for donor H absence (Proline)
donor_mask = (
np.array(donor_mask).astype(float)
if donor_mask is not None
else np.ones(n_atoms, dtype=float)
)
donor_mask = np.tile(donor_mask[:, np.newaxis], (1, n_atoms))
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the donor_mask (one element for each residue) really correct for this tiling????

# hydrogen bond map (continuous value extension of original definition)
hbond_map = np.clip(cutoff - margin - e, a_min=-margin, a_max=margin)
hbond_map = (np.sin(hbond_map / margin * np.pi / 2) + 1.0) / 2
hbond_map = hbond_map * local_mask
hbond_map *= local_mask
hbond_map *= donor_mask
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this correct? The original code uses https://github.com/ShintaroMinami/PyDSSP/blob/e251a43ff8622fe0a555313b1567edce45e789e8/pydssp/pydssp_numpy.py#L72

hbond_map = hbond_map * repeat(donor_mask, 'l1 l2 -> b l1 l2', b=b)

(with einops.repeat()). Note that we create our donor_mask with tile so it may already be the right size and shape.


return hbond_map


def assign(coord: np.ndarray) -> np.ndarray:
def assign(
coord: np.ndarray,
donor_mask: np.ndarray = None,
) -> np.ndarray:
"""Assigns secondary structure for a given coordinate array,
either with or without assigned hydrogens

Expand All @@ -214,6 +236,12 @@ def assign(coord: np.ndarray) -> np.ndarray:
(N, CA, C, O) atoms coordinates (if k=4), or (N, CA, C, O, H) coordinates
(when k=5).

donor_mask : np.array
Mask out any hydrogens that should not be considered (in particular HN
in PRO). If ``None`` then all H will be used (behavior up to 2.9.0).

.. versionadded:: 2.10.0

Returns
-------
np.ndarray
Expand All @@ -222,9 +250,13 @@ def assign(coord: np.ndarray) -> np.ndarray:


.. versionadded:: 2.8.0

.. versionchanged:: 2.10.0
Support masking of donors.

"""
# get hydrogen bond map
hbmap = get_hbond_map(coord)
hbmap = get_hbond_map(coord, donor_mask=donor_mask)
hbmap = np.swapaxes(hbmap, -1, -2) # convert into "i:C=O, j:N-H" form

# identify turn 3, 4, 5
Expand Down
41 changes: 38 additions & 3 deletions testsuite/MDAnalysisTests/analysis/test_dssp.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,52 @@
"pdb_filename", glob.glob(f"{DSSP_FOLDER}/?????.pdb.gz")
)
def test_file_guess_hydrogens(pdb_filename, client_DSSP):
# run 2.9.0 tests (which include PRO)
# ignore_proline_donor=False
# TODO: update reference data for ignore_proline_donor=True
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should really have correct reference data. About half of the files do not show a difference between ignore_proline_donor=True and ignore_proline_donor=False.

u = mda.Universe(pdb_filename)
with open(f"{pdb_filename.rstrip('.gz')}.dssp", "r") as fin:
correct_answ = fin.read().strip().split()[0]

run = DSSP(u, guess_hydrogens=True).run(**client_DSSP)
run = DSSP(u, guess_hydrogens=True, ignore_proline_donor=False).run(
**client_DSSP
)
answ = "".join(run.results.dssp[0])
assert answ == correct_answ


@pytest.mark.parametrize(
"pdb_filename",
[
f"{DSSP_FOLDER}/{PDBID}.pdb.gz"
for PDBID in (
"1eteA",
"3aqgA",
"3gknA",
"3nzmA",
"3a4rA",
"3l4rA",
"2j49A",
"3gfsA",
)
],
)
def test_file_ignore_proline_donor(pdb_filename, client_DSSP):
# weak test: just check that for some structures the two methods give different results
u = mda.Universe(pdb_filename)

run_with_pro = DSSP(
u, guess_hydrogens=True, ignore_proline_donor=False
).run(**client_DSSP)
answ_with_pro = "".join(run_with_pro.results.dssp[0])

# ignore_proline_donor=True is the default:
run_ignore_pro = DSSP(u, guess_hydrogens=True).run(**client_DSSP)
answ_ignore_pro = "".join(run_ignore_pro.results.dssp[0])

assert answ_ignore_pro != answ_with_pro


def test_trajectory(client_DSSP):
u = mda.Universe(TPR, XTC).select_atoms("protein").universe
run = DSSP(u).run(**client_DSSP, stop=10)
Expand All @@ -32,8 +69,6 @@ def test_trajectory(client_DSSP):
assert (
first_frame[:10] != last_frame[:10] == avg_frame[:10] == "-EEEEEE---"
)
Comment on lines 69 to 71
Copy link

Copilot AI Sep 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The removal of these duplicate lines that created a new protein AtomGroup and ran DSSP again appears to be cleanup of unused code, but the test logic should be verified to ensure the test still properly validates the trajectory functionality.

Suggested change
assert (
first_frame[:10] != last_frame[:10] == avg_frame[:10] == "-EEEEEE---"
)
assert first_frame[:10] != last_frame[:10]
assert last_frame[:10] == avg_frame[:10] == "-EEEEEE---"

Copilot uses AI. Check for mistakes.
protein = mda.Universe(TPR, XTC).select_atoms("protein")
run = DSSP(protein).run(**client_DSSP, stop=10)
Comment on lines -35 to -36
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These lines seemed superfluous as the atomgroup approach is tested separately.



def test_atomgroup(client_DSSP):
Expand Down
Loading