Skip to content
Open
Show file tree
Hide file tree
Changes from 55 commits
Commits
Show all changes
107 commits
Select commit Hold shift + click to select a range
7899f3d
Added requests as a dependency
jauy123 Feb 25, 2025
44393be
Inital download code
jauy123 Mar 3, 2025
b1f6002
fixed typo
jauy123 Mar 3, 2025
9c6e87a
cleaner convert_to_universe()
jauy123 Mar 3, 2025
aecefc9
Added abc module and allowed closing of file stream for downloaded te…
jauy123 Mar 4, 2025
9510cc6
Fixed __all__ -- should fixed pull request test on github
jauy123 Mar 4, 2025
8c1a196
refactored cache logic
jauy123 Mar 5, 2025
f0e30ed
Initial tests
jauy123 Mar 5, 2025
1c7d909
Added __init.py to make tests work
jauy123 Mar 5, 2025
eb23ed1
typos fixed
jauy123 Mar 6, 2025
b0c7f5a
Refactored Tests -- put them in classes!
jauy123 Mar 7, 2025
f2ec203
PdbDownloader().download() now downloads in binary rather than text (…
jauy123 Mar 7, 2025
a21fd94
Updated Tests to comply with pdb.gz
jauy123 Mar 7, 2025
d58bed9
Added Progress bar to PdbDownloader().download()
jauy123 Mar 8, 2025
1147b6d
Added a few clarifications to _requests_progress_bar
jauy123 Mar 8, 2025
91feb16
Added filename attribute() to BaseDownloader()
jauy123 Mar 8, 2025
ddcef9e
made _requests_progress_bar a private method of PdbDownloader
jauy123 Mar 9, 2025
bf3e07f
minor comments
jauy123 Mar 9, 2025
09cc409
Added Buffer as default option for PdbDownloader.download()
jauy123 Mar 9, 2025
d78a954
Renamed PdbDownloader to PDBDownloader to match PDBReader()
jauy123 Mar 9, 2025
560e1c2
better __str__ method for BaseDownloader()
jauy123 Mar 9, 2025
c43c10d
Enhanced tests
jauy123 Mar 9, 2025
e6a0f05
Added TODO list for future me
jauy123 Mar 10, 2025
ada1b38
Added requests as optional dep to pyproject.toml
jauy123 Mar 18, 2025
043c006
update todo list
jauy123 Jun 26, 2025
ea5c5b7
minor cleanup
jauy123 Jun 26, 2025
5d6d3e8
Ran black on package/
jauy123 Jun 26, 2025
6590c42
Ran black on tests
jauy123 Jun 26, 2025
6e9b9f3
updated TODO
jauy123 Jun 26, 2025
252b23c
attempt to fix mypy issue
jauy123 Jun 26, 2025
440e3b8
inital working pooch-based implementation of fetch_pdb()
jauy123 Aug 20, 2025
c3f74f9
merge from working pooch branch
jauy123 Aug 20, 2025
10f66be
removed all of old non-pooch based fetch_pdb() implementation and tests
jauy123 Aug 20, 2025
cda3559
cleaned up __init__.py of old non-pooched based fetch_pdb
jauy123 Aug 20, 2025
cecd570
fetch_pdb() now returns paths instead of universes
jauy123 Aug 22, 2025
03638c8
Cleaned up return logic with syntactic sugar
jauy123 Aug 22, 2025
544de38
package/MDAnalysis/coordinates/fetch_pdb.py
jauy123 Aug 22, 2025
fdaacf1
ok, this is proper version of the cleaned up fetch_pdb()
jauy123 Aug 22, 2025
215ee43
cleaned up return logic with Syntactic sugar
jauy123 Aug 22, 2025
5990939
Made explicit that in the one pdb case that the return type is a string
jauy123 Aug 22, 2025
7f7387f
remove redundant comments
jauy123 Aug 22, 2025
f3456a5
Moved fetch_pdb() to PDBParser
jauy123 Aug 22, 2025
8b8492f
Added fetch_pdb() docstring
jauy123 Aug 22, 2025
3fea571
Added default cache folder to fetch_pdb()
jauy123 Aug 22, 2025
f5d6a9f
Added Unit Test for fetch_pdb()'s docstring
jauy123 Aug 22, 2025
64ac4e5
Finalized tests and docstring
jauy123 Aug 22, 2025
0f54e8e
Spagetti fingered fetch_pdb() docstring
jauy123 Aug 22, 2025
867614a
Added pooch to requirements.txt in order to get github's test to work…
jauy123 Aug 22, 2025
c85fd75
Added pooch to pyproject.toml to get github's online test
jauy123 Aug 22, 2025
96dbf05
Added pooch to pyproject.toml to get github's online test to work
jauy123 Aug 22, 2025
b15d148
Merge branch 'downloads' of github.com:jauy123/mdanalysis into downloads
jauy123 Aug 22, 2025
2d10ad3
Modified the action.yaml to HOPEFULLY get github's online test to work
jauy123 Aug 22, 2025
ab7bc8a
i have fat fingers
jauy123 Aug 22, 2025
9289792
action.yaml attempt number 2
jauy123 Aug 22, 2025
96d7341
An attempt to make pooch optional
jauy123 Aug 22, 2025
c74a46e
Cleaned up fetch_pdb(), added Universe PDB assertions -- still need t…
jauy123 Aug 23, 2025
6b20e86
Pre-black tests
jauy123 Aug 23, 2025
0d793e9
Moved requests' HTTPExeception inside has_pooch() since requests is a…
jauy123 Aug 23, 2025
d964bc5
Ran Black on test_fetch_pdb.py
jauy123 Aug 23, 2025
124d06a
Added test to catch missing pooch dependency
jauy123 Aug 23, 2025
b8f7a81
Renamed pooch dependency test
jauy123 Aug 23, 2025
d78bae6
removed requests from pyproject.toml and requirements.txt
jauy123 Aug 23, 2025
577ac9d
PDBParser.py pre-black
jauy123 Aug 23, 2025
608d991
post black PDBParser.py
jauy123 Aug 23, 2025
07d124c
Removed err assignment from has_internet()
jauy123 Aug 23, 2025
8a9ac84
Merge remote-tracking branch 'upstream/develop' into downloads
jauy123 Aug 23, 2025
939d5f0
modified in pytest fixtures
jauy123 Aug 25, 2025
7107aa4
Made pooch import global
jauy123 Aug 26, 2025
c869bbc
added pytest fixtures to test_fetch_pdb
jauy123 Aug 26, 2025
557b1e9
Modified test_pooch_installation to be like coordinates/test_gcd.py/t…
jauy123 Aug 26, 2025
f3a4d7b
Restored previous backup
jauy123 Aug 26, 2025
2a97d9b
Made test_fetch_pdb() more topology/ test_gsd()
jauy123 Aug 26, 2025
9d0f53a
Rewrote test skip if condition
jauy123 Aug 27, 2025
595423a
renamed HAS_INTERNET() to HAS_ACCESS_TO_WWPDB
jauy123 Aug 27, 2025
eed80ed
Changed wwPDB download url to be a module level variable
jauy123 Aug 27, 2025
802183f
moved url backed into fetch_pdb()
jauy123 Sep 3, 2025
bf9292c
remove comment in inital __init__.py
jauy123 Sep 3, 2025
b595f09
minor cleanup in PDBParser.py
jauy123 Sep 3, 2025
e93c73a
Updated CHANGELOG
jauy123 Sep 3, 2025
5f407ba
oops, put the text in the wrong section in CHANGELOG
jauy123 Sep 3, 2025
f09115a
Added no cache test
jauy123 Oct 21, 2025
e2141a8
merged with upstream
jauy123 Oct 21, 2025
bf81128
Revert "Added no cache test"
jauy123 Oct 21, 2025
934eda3
Copy and Pasted from old changelog
jauy123 Oct 21, 2025
9b8da31
uunrevert commit bf81128
jauy123 Oct 21, 2025
0b80840
Split pytest fixtures into two
jauy123 Oct 22, 2025
a7519af
renamed pdb_cache to MDAnalysis_pdbs
jauy123 Oct 22, 2025
72c24e0
Added supported file_formats to code.
jauy123 Oct 22, 2025
ffcc270
Added documentation and update test for file formats
jauy123 Oct 22, 2025
b07a16d
post darker pre flake8
jauy123 Oct 22, 2025
a2aff4c
Updated fetch_pdb return types
jauy123 Oct 22, 2025
98fa75b
Manual flake 8 on fetch_pdb
jauy123 Oct 22, 2025
1e635c4
Addressing obeckset's fetch_pdb's docstring changes
jauy123 Oct 22, 2025
447de56
Changed 'Multiple PDBs example in docstring
jauy123 Oct 22, 2025
c28110f
wrote tests for str/tuple arguement -- need to refactor however
jauy123 Oct 23, 2025
02d81a3
Updated PDBParser.py and applied flake8
jauy123 Oct 23, 2025
adbfabb
Modified Tests per orbeckst's requests
jauy123 Oct 23, 2025
31a8f7b
Modified fetch_pdb()'s logic and added cache docs
jauy123 Oct 23, 2025
e2a28ec
Changed check on fetch_pdb() return
jauy123 Oct 24, 2025
546538a
Modified fetch_pdb() and its test per github feedback
jauy123 Oct 24, 2025
735586f
Added autodocumenation, so fetch_pdb is visible to sphinx
jauy123 Oct 27, 2025
b5da9be
Change spacing under "Classes and Functions"
jauy123 Oct 28, 2025
b0c808a
added autodata for DEFAULT_CACHE_NAME_DOWNLOADER
jauy123 Oct 28, 2025
ab0f635
Modified docs per suggestions and did a general pass with adding sphi…
jauy123 Oct 28, 2025
d2f7857
doc grammar
jauy123 Oct 28, 2025
09e7ef5
Got sphinx markup to work
jauy123 Oct 28, 2025
6beffd8
Added markup to file formats
jauy123 Oct 29, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/actions/setup-deps/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ inputs:
default: 'networkx'
openmm:
default: 'openmm'
pooch:
default: 'pooch'
pytng:
default: 'pytng>=0.2.3'
rdkit:
Expand Down Expand Up @@ -142,6 +144,7 @@ runs:
${{ inputs.netcdf4 }}
${{ inputs.networkx }}
${{ inputs.openmm }}
${{ inputs.pooch }}
${{ inputs.pytng }}
${{ inputs.rdkit }}
${{ inputs.scikit-learn }}
Expand Down
3 changes: 3 additions & 0 deletions package/MDAnalysis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,9 @@

from .due import due, Doi, BibTeX

# Temporary top-level fetch_pdb import to check that things work
from .topology.PDBParser import fetch_pdb
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@IAlibay @BradyAJohnston are we sure that we want the import at the top level?

If we do more fetch_xxx() in the future then we may have to deprecate it again, e.g. in favor of a mda.fetch.pdb(...) or Universe.from_fetched.

I think it's ok to leave it here for now because we don't have anything else. If we get more before 3.0, we still have time to deprecate and remove in 3.0.

If it is left in then does it need to be documented at the top level, too?


Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if we want to keep it at the top level.

For right now, I'd remove it, and use it as MDAnalysis.topology.PDBparser.fetch_pdb()

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd be tempted to shorten the function name to fetch as PDBparser.fetch() already contains the information.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@IAlibay do you have comments on this, specifically the comment on keeping the function top level? I'm personally agnostic of whether it should stay top level or not.

due.cite(
Doi("10.25080/majora-629e541a-00e"),
description="Molecular simulation analysis library",
Expand Down
91 changes: 91 additions & 0 deletions package/MDAnalysis/topology/PDBParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,3 +515,94 @@ def _parse_conect(conect):
bond_atoms = (int(conect[11 + i * 5: 16 + i * 5]) for i in
range(n_bond_atoms))
return atom_id, bond_atoms

def fetch_pdb(PDB_IDS=None,
              cache_path=None,
              progressbar=False,
              file_format="pdb.gz",
              ):
    """Download structure files from the wwPDB and cache them locally.

    For every requested PDB ID the file ``<PDB_ID>.<file_format>`` is
    fetched from ``https://files.wwpdb.org/download/`` with :mod:`pooch` and
    stored in `cache_path`. Files that pooch already finds in the cache
    directory are not downloaded again.

    Parameters
    ----------
    PDB_IDS : str or iterable of str
        A single PDB ID as a string, or an iterable of PDB IDs to fetch.
    cache_path : str or pathlib.Path, optional
        Directory where downloaded file(s) will be cached. If ``None``
        (default), ``pooch.os_cache('MDAnalysis')`` is used.
    progressbar : bool, optional
        If ``True``, display a progress bar during file downloads.
        Default is ``False``.
    file_format : str, optional
        The file extension/format to download (e.g., ``"cif"``, ``"pdb"``).
        Default is ``"pdb.gz"``.

    Returns
    -------
    str or list of str
        The path(s) to the downloaded file(s) as reported by pooch. A single
        string is returned if exactly one PDB ID is given (either as a string
        or as a one-element iterable); otherwise a list with one path per
        requested ID, in request order.

    Raises
    ------
    TypeError
        If `PDB_IDS` is ``None``.
    ModuleNotFoundError
        If the optional dependency :mod:`pooch` cannot be imported.
    requests.exceptions.HTTPError
        If an invalid PDB code or file format is specified.

    Notes
    -----
    This function downloads using the API described at
    https://www.rcsb.org/docs/programmatic-access/file-download-services.

    Any format offered by the server can be downloaded, but not every format
    can be loaded into a :class:`~MDAnalysis.core.universe.Universe`; use a
    PDB format (e.g. the default ``"pdb.gz"``) when the file is meant to be
    read by MDAnalysis.

    Examples
    --------
    Download a single file in mmCIF format into a chosen directory:

    >>> path = mda.fetch_pdb("1AKE", cache_path="pdb_cache",
    ...                      file_format="cif")  # doctest: +SKIP

    Download multiple PDB files with a progress bar:

    >>> paths = mda.fetch_pdb(["1AKE", "4BWZ"],
    ...                       progressbar=True)  # doctest: +SKIP

    Download a single PDB file and convert it to a universe:

    >>> u = mda.Universe(
    ...     mda.fetch_pdb("1AKE", file_format="pdb.gz"))  # doctest: +SKIP

    Download multiple PDB files and convert each of them into a universe:

    >>> universes = [
    ...     mda.Universe(pdb)
    ...     for pdb in mda.fetch_pdb(["1AKE", "4BWZ"])
    ... ]  # doctest: +SKIP
    """
    # Validate before touching the optional dependency so that a missing
    # argument is reported as such, not as a confusing iteration error.
    if PDB_IDS is None:
        raise TypeError(
            "fetch_pdb() requires a PDB ID or an iterable of PDB IDs, "
            "got None"
        )

    try:
        import pooch
    except ImportError as err:
        raise ModuleNotFoundError(
            'pooch is needed as a dependency for fetch_pdb()'
        ) from err

    if isinstance(PDB_IDS, str):
        PDB_IDS = (PDB_IDS,)
    else:
        # Materialise once: generators would otherwise be exhausted before
        # their length can be inspected.
        PDB_IDS = tuple(PDB_IDS)

    if cache_path is None:
        cache_path = pooch.os_cache('MDAnalysis')

    file_names = [f'{PDB_ID}.{file_format}' for PDB_ID in PDB_IDS]

    # A registry with ``None`` hashes is used instead of pooch.retrieve() in
    # order to prevent the hardcoded known_hash warning from showing up.
    downloader = pooch.create(
        path=cache_path,
        base_url="https://files.wwpdb.org/download/",
        registry=dict.fromkeys(file_names),
    )

    # Iterate over the requested names (not the registry keys) so duplicated
    # IDs still yield one path per request.
    paths = [
        str(downloader.fetch(fname=file_name, progressbar=progressbar))
        for file_name in file_names
    ]

    if len(paths) == 1:
        return paths[0]
    return paths
2 changes: 2 additions & 0 deletions package/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ dependencies = [
'packaging',
'filelock',
'mda-xdrlib',
'requests',
]
keywords = [
"python", "science", "chemistry", "biophysics", "molecular-dynamics",
Expand Down Expand Up @@ -72,6 +73,7 @@ extra_formats = [
"h5py>=2.10",
"chemfiles>=0.10",
"parmed",
"pooch",
"pyedr>=0.7.0",
"pytng>=0.2.3",
"gsd>3.0.0",
Expand Down
2 changes: 2 additions & 0 deletions package/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ networkx
numpy>=1.23.2
packaging
parmed
pooch
pytest
requests
scikit-learn
scipy
seaborn>=0.7.0,<=0.9
Expand Down
84 changes: 84 additions & 0 deletions testsuite/MDAnalysisTests/topology/test_fetch_pdb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 fileencoding=utf-8
#
# MDAnalysis --- https://www.mdanalysis.org
# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors
# (see the file AUTHORS for the full list of names)
#
# Released under the Lesser GNU Public Licence, v2.1 or any higher version
#
# Please cite your use of MDAnalysis in published work:
#
# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler,
# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein.
# MDAnalysis: A Python package for the rapid analysis of molecular dynamics
# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th
# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy.
# doi: 10.25080/majora-629e541a-00e
#
# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein.
# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations.
# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787
#


# NOTE: the test classes below are skipped when pooch is not installed or
# when https://files.wwpdb.org/ cannot be reached.

from requests.exceptions import HTTPError
from urllib import request

import MDAnalysis as mda
import pytest



def has_pooch():
    """Report whether the optional ``pooch`` package can be imported.

    Returns
    -------
    bool
        ``True`` if ``import pooch`` succeeds, ``False`` if it raises
        :class:`ModuleNotFoundError`.
    """
    try:
        import pooch  # noqa: F401  (imported only to probe availability)
    except ModuleNotFoundError:
        available = False
    else:
        available = True
    return available


def check_internet(url="https://files.wwpdb.org/", timeout=2):
    """Return whether `url` can be opened within `timeout` seconds.

    Parameters
    ----------
    url : str, optional
        Address to probe. Defaults to the wwPDB file server.
    timeout : float, optional
        Timeout in seconds handed to :func:`urllib.request.urlopen`.
        Default is 2.

    Returns
    -------
    bool
        ``True`` if the URL could be opened, ``False`` if opening it raised
        an :class:`OSError` (which includes ``URLError`` and socket
        timeouts).
    """
    try:
        # Only reachability matters; the context manager closes the
        # response right away instead of leaking the connection.
        with request.urlopen(url, timeout=timeout):
            return True
    except OSError:
        # URLError derives from OSError; catching the base class also covers
        # timeouts and connection resets that are not wrapped in URLError.
        return False


@pytest.mark.skipif(
    has_pooch() is False or check_internet() is False,
    reason="pooch is not installed or https://files.wwpdb.org/ is unreachable",
)
class TestDocstringExamples:
    """Tests modelled on the usage examples of fetch_pdb().

    Every download is directed into pytest's ``tmp_path`` so that the tests
    neither depend on nor pollute the default cache directory.
    """

    def test_one_file_download(self, tmp_path):
        """A single ID with ``file_format="cif"`` yields one path string."""
        path = mda.fetch_pdb("1AKE", cache_path=tmp_path, file_format="cif")
        assert isinstance(path, str)

    def test_multiple_files_download(self, tmp_path):
        """Two IDs with a progress bar yield a list of two path strings."""
        paths = mda.fetch_pdb(
            ["1AKE", "4BWZ"], cache_path=tmp_path, progressbar=True
        )
        assert len(paths) == 2
        assert all(isinstance(path, str) for path in paths)

    def test_one_file_to_universe(self, tmp_path):
        """A downloaded ``pdb.gz`` file can be loaded into a Universe."""
        # The download options belong to fetch_pdb(), not to Universe().
        path = mda.fetch_pdb(
            "1AKE",
            file_format="pdb.gz",
            cache_path=tmp_path,
            progressbar=True,
        )
        assert isinstance(mda.Universe(path), mda.Universe)

    def test_multiple_files_to_universe(self, tmp_path):
        """Each of several downloaded files can be loaded into a Universe."""
        universes = [
            mda.Universe(
                mda.fetch_pdb(
                    PDB_ID, cache_path=tmp_path, file_format="pdb.gz"
                )
            )
            for PDB_ID in ("1AKE", "4BWZ")
        ]
        assert all(isinstance(u, mda.Universe) for u in universes)

@pytest.mark.skipif(
    not (has_pooch() and check_internet()),
    reason="Cannot connect to https://files.wwpdb.org/",
)
class TestExpectedErrors:
    """Requests that should fail are expected to raise ``HTTPError``."""

    def test_invalid_pdb(self, tmp_path):
        """A bogus PDB ID is expected to raise ``HTTPError``."""
        bad_request = dict(PDB_IDS='foobar', cache_path=tmp_path)
        with pytest.raises(HTTPError):
            mda.fetch_pdb(**bad_request)

    def test_invalid_file_format(self, tmp_path):
        """A bogus file format is expected to raise ``HTTPError``."""
        bad_request = dict(
            PDB_IDS='1AKE', cache_path=tmp_path, file_format='barfoo'
        )
        with pytest.raises(HTTPError):
            mda.fetch_pdb(**bad_request)



Loading