MDAnalysis · jauy123 · Feb 25, 2025 · Mar 3, 2025 · Mar 3, 2025 · Mar 3, 2025
diff --git a/.github/actions/setup-deps/action.yaml b/.github/actions/setup-deps/action.yaml
@@ -72,6 +72,8 @@ inputs:
     default: 'networkx'
   openmm:
     default: 'openmm'
+  pooch:
+    default: 'pooch'
   pytng:
     default: 'pytng>=0.2.3'
   rdkit:
@@ -142,6 +144,7 @@ runs:
           ${{ inputs.netcdf4 }}
           ${{ inputs.networkx }}
           ${{ inputs.openmm }}
+          ${{ inputs.pooch }}
           ${{ inputs.pytng }}
           ${{ inputs.rdkit }}
           ${{ inputs.scikit-learn }}

diff --git a/package/MDAnalysis/__init__.py b/package/MDAnalysis/__init__.py
@@ -221,6 +221,9 @@
 
 from .due import due, Doi, BibTeX
 
+# TODO: temporary top-level import so that fetch_pdb is reachable as MDAnalysis.fetch_pdb
+from .topology.PDBParser import fetch_pdb
+
 due.cite(
     Doi("10.25080/majora-629e541a-00e"),
     description="Molecular simulation analysis library",

diff --git a/package/MDAnalysis/topology/PDBParser.py b/package/MDAnalysis/topology/PDBParser.py
@@ -515,3 +515,112 @@ def _parse_conect(conect):
     bond_atoms = (int(conect[11 + i * 5: 16 + i * 5]) for i in
                   range(n_bond_atoms))
     return atom_id, bond_atoms
+
+
+def fetch_pdb(
+    PDB_IDS=None,
+    cache_path=None,
+    progressbar=False,
+    file_format="pdb.gz",
+):
+    """Download structure files from the wwPDB and cache them locally.
+
+    Given one or several PDB IDs, download the corresponding structure
+    files in the requested format into a local cache directory. Files that
+    are already present in the cache are not downloaded again.
+
+    Parameters
+    ----------
+    PDB_IDS : str or iterable of str
+        A single PDB ID, or an iterable of PDB IDs to fetch.
+    cache_path : str or pathlib.Path, optional
+        Directory in which the downloaded files are cached. If ``None``
+        (default), ``pooch.os_cache('MDAnalysis')`` is used.
+    progressbar : bool, optional
+        If ``True``, display a progress bar during downloads. Default is
+        ``False``.
+    file_format : str, optional
+        File extension of the files to download, e.g. ``"cif"``, ``"pdb"``
+        or ``"pdb.gz"`` (default).
+
+    Returns
+    -------
+    str or list of str
+        Path to the downloaded file if `PDB_IDS` is a single string or
+        contains exactly one ID, otherwise a list with one path per
+        requested ID, in input order.
+
+    Raises
+    ------
+    ModuleNotFoundError
+        If the optional dependency ``pooch`` is not installed.
+    TypeError
+        If `PDB_IDS` is ``None``.
+    requests.exceptions.HTTPError
+        If an invalid PDB code or file format is specified.
+
+    Notes
+    -----
+    Files are requested from ``https://files.wwpdb.org/download/``; the
+    file download services are described at
+    https://www.rcsb.org/docs/programmatic-access/file-download-services.
+
+    Examples
+    --------
+    Download a single file in ``cif`` format:
+
+    >>> mda.fetch_pdb("1AKE", cache_path="pdb_cache", file_format="cif")
+    '.../pdb_cache/1AKE.cif'
+
+    Download multiple PDB files with a progress bar:
+
+    >>> mda.fetch_pdb(["1AKE", "4BWZ"], cache_path="pdb_cache",
+    ...               progressbar=True)
+    ['.../pdb_cache/1AKE.pdb.gz', '.../pdb_cache/4BWZ.pdb.gz']
+
+    Download a single PDB file and load it into a universe:
+
+    >>> mda.Universe(mda.fetch_pdb("1AKE", file_format="pdb.gz"))
+    <Universe with 3816 atoms>
+
+    Download multiple PDB files and convert each of them into a universe:
+
+    >>> [mda.Universe(mda.fetch_pdb(PDB_ID, file_format="pdb.gz"))
+    ...  for PDB_ID in ("1AKE", "4BWZ")]
+    [<Universe with 3816 atoms>, <Universe with 2824 atoms>]
+
+    """
+    try:
+        import pooch
+    except ImportError as err:
+        raise ModuleNotFoundError(
+            'pooch is needed as a dependency for fetch_pdb()'
+        ) from err
+
+    if PDB_IDS is None:
+        raise TypeError(
+            'PDB_IDS must be a PDB ID or an iterable of PDB IDs, not None'
+        )
+    if isinstance(PDB_IDS, str):
+        PDB_IDS = (PDB_IDS,)
+
+    if cache_path is None:
+        cache_path = pooch.os_cache('MDAnalysis')
+
+    # One file name per requested ID, kept in input order so that the
+    # returned paths line up with PDB_IDS even if an ID is repeated.
+    file_names = [f'{PDB_ID}.{file_format}' for PDB_ID in PDB_IDS]
+
+    # A registry with ``None`` hashes is used instead of pooch.retrieve
+    # to prevent the hardcoded known_hash warning from showing up.
+    downloader = pooch.create(
+        path=cache_path,
+        base_url="https://files.wwpdb.org/download/",
+        registry=dict.fromkeys(file_names),
+    )
+
+    paths = [
+        str(downloader.fetch(fname=file_name, progressbar=progressbar))
+        for file_name in file_names
+    ]
+    return paths[0] if len(paths) == 1 else paths
diff --git a/package/pyproject.toml b/package/pyproject.toml
@@ -40,6 +40,7 @@ dependencies = [
     'packaging',
     'filelock',
     'mda-xdrlib',
+    'requests',
 ]
 keywords = [
     "python", "science", "chemistry", "biophysics", "molecular-dynamics",
@@ -72,6 +73,7 @@ extra_formats = [
     "h5py>=2.10",
     "chemfiles>=0.10",
     "parmed",
+    "pooch",
     "pyedr>=0.7.0",
     "pytng>=0.2.3",
     "gsd>3.0.0",

diff --git a/package/requirements.txt b/package/requirements.txt
@@ -13,7 +13,9 @@ networkx
 numpy>=1.23.2
 packaging
 parmed
+pooch
 pytest
+requests
 scikit-learn
 scipy
 seaborn>=0.7.0,<=0.9

diff --git a/testsuite/MDAnalysisTests/topology/test_fetch_pdb.py b/testsuite/MDAnalysisTests/topology/test_fetch_pdb.py
@@ -0,0 +1,112 @@
+# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
+# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 fileencoding=utf-8
+#
+# MDAnalysis --- https://www.mdanalysis.org
+# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors
+# (see the file AUTHORS for the full list of names)
+#
+# Released under the Lesser GNU Public Licence, v2.1 or any higher version
+#
+# Please cite your use of MDAnalysis in published work:
+#
+# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler,
+# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein.
+# MDAnalysis: A Python package for the rapid analysis of molecular dynamics
+# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th
+# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy.
+# doi: 10.25080/majora-629e541a-00e
+#
+# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein.
+# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations.
+# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787
+#
+
+"""Tests for ``MDAnalysis.fetch_pdb``.
+
+The tests download files from https://files.wwpdb.org/ and are skipped when
+pooch is not installed or that server cannot be reached.
+"""
+
+from pathlib import Path
+from urllib import request
+
+import pytest
+from requests.exceptions import HTTPError
+
+import MDAnalysis as mda
+
+
+def has_pooch():
+    """Return True if the optional dependency pooch can be imported."""
+    try:
+        import pooch  # noqa: F401
+    except ModuleNotFoundError:
+        return False
+    return True
+
+
+def check_internet():
+    """Return True if https://files.wwpdb.org/ responds within 2 seconds."""
+    try:
+        with request.urlopen('https://files.wwpdb.org/', timeout=2):
+            return True
+    except OSError:
+        # URLError is a subclass of OSError; catching OSError also covers
+        # timeouts that are raised directly as socket errors.
+        return False
+
+
+# Evaluated once at import so that the server is probed a single time.
+requires_pooch_and_network = pytest.mark.skipif(
+    not (has_pooch() and check_internet()),
+    reason="requires pooch and a connection to https://files.wwpdb.org/",
+)
+
+
+@requires_pooch_and_network
+class TestDocstringExamples:
+    """Exercise the examples given in the fetch_pdb docstring."""
+
+    def test_one_file_download(self, tmp_path):
+        path = mda.fetch_pdb("1AKE", cache_path=tmp_path, file_format="cif")
+        assert isinstance(path, str)
+        assert Path(path).is_file()
+
+    def test_multiple_files_download(self, tmp_path):
+        paths = mda.fetch_pdb(
+            ["1AKE", "4BWZ"], cache_path=tmp_path, progressbar=True
+        )
+        assert len(paths) == 2
+        assert all(isinstance(path, str) for path in paths)
+
+    def test_one_file_to_universe(self, tmp_path):
+        path = mda.fetch_pdb(
+            "1AKE", cache_path=tmp_path, file_format="pdb.gz"
+        )
+        assert isinstance(mda.Universe(path), mda.Universe)
+
+    def test_multiple_files_to_universe(self, tmp_path):
+        universes = [
+            mda.Universe(
+                mda.fetch_pdb(
+                    PDB_ID, cache_path=tmp_path, file_format="pdb.gz"
+                )
+            )
+            for PDB_ID in ("1AKE", "4BWZ")
+        ]
+        assert all(isinstance(u, mda.Universe) for u in universes)
+
+
+@requires_pooch_and_network
+class TestExpectedErrors:
+    """Invalid requests surface as HTTP errors."""
+
+    def test_invalid_pdb(self, tmp_path):
+        with pytest.raises(HTTPError):
+            mda.fetch_pdb(PDB_IDS='foobar', cache_path=tmp_path)
+
+    def test_invalid_file_format(self, tmp_path):
+        with pytest.raises(HTTPError):
+            mda.fetch_pdb(
+                PDB_IDS='1AKE', cache_path=tmp_path, file_format='barfoo'
+            )