- 
                Notifications
    
You must be signed in to change notification settings  - Fork 734
 
Implementation of fetch_pdb() #4943
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from 80 commits
7899f3d
              44393be
              b1f6002
              9c6e87a
              aecefc9
              9510cc6
              8c1a196
              f0e30ed
              1c7d909
              eb23ed1
              b0c7f5a
              f2ec203
              a21fd94
              d58bed9
              1147b6d
              91feb16
              ddcef9e
              bf3e07f
              09cc409
              d78a954
              560e1c2
              c43c10d
              e6a0f05
              ada1b38
              043c006
              ea5c5b7
              5d6d3e8
              6590c42
              6e9b9f3
              252b23c
              440e3b8
              c3f74f9
              10f66be
              cda3559
              cecd570
              03638c8
              544de38
              fdaacf1
              215ee43
              5990939
              7f7387f
              f3456a5
              8b8492f
              3fea571
              f5d6a9f
              64ac4e5
              0f54e8e
              867614a
              c85fd75
              96dbf05
              b15d148
              2d10ad3
              ab7bc8a
              9289792
              96d7341
              c74a46e
              6b20e86
              0d793e9
              d964bc5
              124d06a
              b8f7a81
              d78bae6
              577ac9d
              608d991
              07d124c
              8a9ac84
              939d5f0
              7107aa4
              c869bbc
              557b1e9
              f3a4d7b
              2a97d9b
              9d0f53a
              595423a
              eed80ed
              802183f
              bf9292c
              b595f09
              e93c73a
              5f407ba
              f09115a
              e2141a8
              bf81128
              934eda3
              9b8da31
              0b80840
              a7519af
              72c24e0
              ffcc270
              b07a16d
              a2aff4c
              98fa75b
              1e635c4
              447de56
              c28110f
              02d81a3
              adbfabb
              31a8f7b
              e2a28ec
              546538a
              735586f
              b5da9be
              b0c808a
              ab0f635
              d2f7857
              09e7ef5
              6beffd8
              File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| 
          
            
          
           | 
    @@ -63,7 +63,7 @@ | |||||
| .. autoclass:: PDBParser | ||||||
| :members: | ||||||
| :inherited-members: | ||||||
| 
     | 
||||||
                
      
                  jauy123 marked this conversation as resolved.
               
              
                Outdated
          
            Show resolved
            Hide resolved
         | 
||||||
| """ | ||||||
| import numpy as np | ||||||
| import warnings | ||||||
| 
          
            
          
           | 
    @@ -95,6 +95,12 @@ | |||||
| # Set up a logger for the PDBParser | ||||||
| logger = logging.getLogger("MDAnalysis.topology.PDBParser") | ||||||
| 
     | 
||||||
| try: | ||||||
| import pooch | ||||||
| except ImportError: | ||||||
| HAS_POOCH = False | ||||||
| else: | ||||||
| HAS_POOCH = True | ||||||
| 
     | 
||||||
                
      
                  orbeckst marked this conversation as resolved.
               
          
            Show resolved
            Hide resolved
         | 
||||||
| def float_or_default(val, default): | ||||||
| try: | ||||||
| 
          
            
          
           | 
    @@ -515,3 +521,103 @@ def _parse_conect(conect): | |||||
| bond_atoms = (int(conect[11 + i * 5: 16 + i * 5]) for i in | ||||||
| range(n_bond_atoms)) | ||||||
| return atom_id, bond_atoms | ||||||
| 
     | 
||||||
| def fetch_pdb( | ||||||
| PDB_IDS=None, | ||||||
| cache_path=None, | ||||||
| progressbar=False, | ||||||
| file_format="pdb.gz", | ||||||
| ): | ||||||
| """ | ||||||
| Download one or more PDB files from the RCSB Protein Data Bank and cache them locally. | ||||||
| 
     | 
||||||
| Given one or multiple PDB IDs, downloads the corresponding structure files in the specified | ||||||
| format and stores them in a local cache directory. If files are cached on disk, fetch_pdb() will skip the download and use | ||||||
| the cached version instead. | ||||||
| 
     | 
||||||
| Returns the path(s) as a string to the downloaded files. | ||||||
| 
     | 
||||||
| Parameters | ||||||
| ---------- | ||||||
| PDB_IDS : str or sequence of str | ||||||
| A single PDB ID as a string, or a sequence of PDB IDs to fetch. | ||||||
| cache_path : str or pathlib.Path | ||||||
                
      
                  jauy123 marked this conversation as resolved.
               
          
            Show resolved
            Hide resolved
         | 
||||||
| Directory where downloaded file(s) will be cached. | ||||||
                
      
                  jauy123 marked this conversation as resolved.
               
          
            Show resolved
            Hide resolved
         | 
||||||
| file_format : str | ||||||
| The file extension/format to download (e.g., "cif", "pdb") | ||||||
                
      
                  p-j-smith marked this conversation as resolved.
               
              
                Outdated
          
            Show resolved
            Hide resolved
                
      
                  jauy123 marked this conversation as resolved.
               
              
                Outdated
          
            Show resolved
            Hide resolved
         | 
||||||
| progressbar : bool, optional | ||||||
| If True, display a progress bar during file downloads. Default is False. | ||||||
| 
     | 
||||||
| Returns | ||||||
| ------- | ||||||
| str or list of str | ||||||
| The path(s) to the downloaded file(s). Returns a single string if one PDB ID is given, | ||||||
                
      
                  jauy123 marked this conversation as resolved.
               
              
                Outdated
          
            Show resolved
            Hide resolved
         | 
||||||
| or a list of strings if multiple PDB IDs are provided. | ||||||
| 
     | 
||||||
| Raises | ||||||
| ------ | ||||||
| requests.exceptions.HTTPError | ||||||
| If an invalid PDB code or file format is specified. | ||||||
| 
     | 
||||||
| Notes | ||||||
| ----- | ||||||
| This function downloads using the API established here at https://www.rcsb.org/docs/programmatic-access/file-download-services. | ||||||
                
      
                  jauy123 marked this conversation as resolved.
               
              
                Outdated
          
            Show resolved
            Hide resolved
         | 
||||||
| 
     | 
||||||
| Examples | ||||||
| -------- | ||||||
| Download a single PDB file: | ||||||
| 
     | 
||||||
| >>> mda.fetch_pdb("1AKE", file_format="cif") | ||||||
| './pdb_cache/1AKE.cif' | ||||||
| 
     | 
||||||
| Download multiple PDB files with a progress bar: | ||||||
| 
     | 
||||||
| >>> mda.fetch_pdb(["1AKE", "4BWZ"], progressbar=True) | ||||||
| ['./pdb_cache/1AKE.pdb.gz', './pdb_cache/4BWZ.pdb.gz'] | ||||||
| 
     | 
||||||
| Download a single PDB file and converting it to a universe: | ||||||
                
       | 
||||||
| 
     | 
||||||
| >>> mda.Universe(mda.fetch_pdb("1AKE"), file_format="pdb.gz") | ||||||
| <Universe with 3816 atoms> | ||||||
| 
     | 
||||||
| Download multiple PDB files and converting them to a universe: | ||||||
                
       | 
||||||
| Download multiple PDB files and converting them to a universe: | |
| Download multiple PDB files and convert each of them into a universe: | 
        
          
              
                Outdated
          
        
      There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is passing the file format necessary for this to work? just curious, not blocking.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, since the the default paramter of the file_format argument to set already to pdb.gz. It is just to make it explicit that Universe at the moment can't read mmCIF files (#2367), and that you need to pass in a pdb or a gzip compressed version.
Currently, fetch_pdb is querying wwPDB API directly, so in principle you can it to download mmCIF files without loading in a universe.
        
          
              
                Outdated
          
        
      There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Probably not needed as this is "normal python".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is downloading one-by-one in a loop preferred over downloading all at once (shown above), i.e.,
universes = [mda.Universe(pdb) for pdb in mda.fetch_pdb(["1AKE", "4BWZ"], progressbar=True)]If you want an example with multiple files then I'd use the one that uses fetch_pdb 's capability to download multiple files at once.
        
          
              
                  orbeckst marked this conversation as resolved.
              
          
            Show resolved
            Hide resolved
        
              
          
              
                  p-j-smith marked this conversation as resolved.
              
          
            Show resolved
            Hide resolved
        
              
          
              
                  jauy123 marked this conversation as resolved.
              
          
            Show resolved
            Hide resolved
        
              
          
              
                  jauy123 marked this conversation as resolved.
              
              
                Outdated
          
            Show resolved
            Hide resolved
        
              
          
              
                Outdated
          
        
      There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
break comments at 79 chars
        
          
              
                  p-j-smith marked this conversation as resolved.
              
              
                Outdated
          
            Show resolved
            Hide resolved
        
              
          
              
                  jauy123 marked this conversation as resolved.
              
          
            Show resolved
            Hide resolved
        
              
          
              
                  jauy123 marked this conversation as resolved.
              
              
                Outdated
          
            Show resolved
            Hide resolved
        
              
          
              
                  p-j-smith marked this conversation as resolved.
              
              
                Outdated
          
            Show resolved
            Hide resolved
        
      | Original file line number | Diff line number | Diff line change | 
|---|---|---|
| 
          
            
          
           | 
    @@ -13,6 +13,7 @@ networkx | |
| numpy>=1.23.2 | ||
| packaging | ||
| parmed | ||
| pooch | ||
| pytest | ||
| scikit-learn | ||
| scipy | ||
| 
          
            
          
           | 
    ||
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,98 @@ | ||
| # -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*- | ||
| # vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 fileencoding=utf-8 | ||
| # | ||
| # MDAnalysis --- https://www.mdanalysis.org | ||
| # Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors | ||
| # (see the file AUTHORS for the full list of names) | ||
| # | ||
| # Released under the Lesser GNU Public Licence, v2.1 or any higher version | ||
| # | ||
| # Please cite your use of MDAnalysis in published work: | ||
| # | ||
| # R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler, | ||
| # D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein. | ||
| # MDAnalysis: A Python package for the rapid analysis of molecular dynamics | ||
| # simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th | ||
| # Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy. | ||
| # doi: 10.25080/majora-629e541a-00e | ||
| # | ||
| # N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein. | ||
| # MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations. | ||
| # J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787 | ||
| # | ||
| 
     | 
||
| import pytest | ||
| 
     | 
||
| import MDAnalysis as mda | ||
| from MDAnalysis.topology.PDBParser import HAS_POOCH | ||
| 
     | 
||
| from urllib import request | ||
| 
     | 
||
| if HAS_POOCH: | ||
| from requests.exceptions import HTTPError | ||
| 
     | 
||
| try: | ||
| request.urlopen("https://files.wwpdb.org/", timeout=2) | ||
| HAS_ACCESS_TO_WWPDB = True | ||
| except request.URLError: | ||
| HAS_ACCESS_TO_WWPDB = False | ||
| 
     | 
||
| 
     | 
||
| @pytest.mark.skipif( | ||
| not (HAS_POOCH and HAS_ACCESS_TO_WWPDB), | ||
| reason="Pooch is not installed or can not connect to https://files.wwpdb.org/", | ||
| ) | ||
| class TestDocstringExamples: | ||
| """This class tests all the examples found in fetch_pdb's docstring""" | ||
| 
     | 
||
| @pytest.mark.parametrize("pdb_id", [("1AKE"), ("4BWZ")]) | ||
                
      
                  jauy123 marked this conversation as resolved.
               
              
                Outdated
          
            Show resolved
            Hide resolved
                
      
                  jauy123 marked this conversation as resolved.
               
              
                Outdated
          
            Show resolved
            Hide resolved
         | 
||
| def test_one_file_download(self, tmp_path, pdb_id): | ||
| assert isinstance( | ||
| mda.fetch_pdb(pdb_id, cache_path=tmp_path, file_format="cif"), str | ||
| ) | ||
| 
     | 
||
| def test_multiple_files_download(self, tmp_path): | ||
| list_of_path_strings = mda.fetch_pdb( | ||
| ["1AKE", "4BWZ"], cache_path=tmp_path, progressbar=True | ||
| ) | ||
| assert all(isinstance(PDB_ID, str) for PDB_ID in list_of_path_strings) | ||
                
      
                  jauy123 marked this conversation as resolved.
               
              
                Outdated
          
            Show resolved
            Hide resolved
         | 
||
| 
     | 
||
| @pytest.mark.parametrize( | ||
| "pdb_id, n_atoms", [("1AKE", 3816), ("4BWZ", 2824)] | ||
| ) | ||
| def test_files_to_universe(self, tmp_path, pdb_id, n_atoms): | ||
| u = mda.Universe( | ||
| mda.fetch_pdb( | ||
| pdb_id, | ||
| file_format="pdb.gz", | ||
| cache_path=tmp_path, | ||
| progressbar=True, | ||
| ) | ||
| ) | ||
| assert isinstance(u, mda.Universe) and (len(u.atoms) == n_atoms) | ||
| 
     | 
||
| 
     | 
||
| @pytest.mark.skipif( | ||
| not (HAS_POOCH and HAS_ACCESS_TO_WWPDB), | ||
| reason="Pooch is not installed or can not connect to https://files.wwpdb.org/", | ||
| ) | ||
| class TestExpectedErrors: | ||
                
      
                  p-j-smith marked this conversation as resolved.
               
              
                Outdated
          
            Show resolved
            Hide resolved
         | 
||
| 
     | 
||
| def test_invalid_pdb(self, tmp_path): | ||
| with pytest.raises(HTTPError): | ||
                
      
                  orbeckst marked this conversation as resolved.
               
          
            Show resolved
            Hide resolved
         | 
||
| mda.fetch_pdb(PDB_IDS="foobar", cache_path=tmp_path) | ||
| 
     | 
||
| def test_invalid_file_format(self, tmp_path): | ||
| with pytest.raises(HTTPError): | ||
| mda.fetch_pdb( | ||
| PDB_IDS="1AKE", cache_path=tmp_path, file_format="barfoo" | ||
| ) | ||
| 
     | 
||
| 
     | 
||
| @pytest.mark.skipif( | ||
| HAS_POOCH, | ||
| reason="Pooch is installed.", | ||
| ) | ||
| def test_pooch_installation(tmp_path): | ||
| with pytest.raises(ModuleNotFoundError): | ||
                
      
                  jauy123 marked this conversation as resolved.
               
              
                Outdated
          
            Show resolved
            Hide resolved
         | 
||
| mda.fetch_pdb("1AKE", cache_path=tmp_path, file_format="cif") | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@IAlibay @BradyAJohnston are we sure that we want the import at the top level?
If we do more
fetch_xxx()in the future then we may have to deprecate it again, e.g. in favor of amda.fetch.pdb(...)orUniverse.from_fetched.I think it's ok to leave it here for now because we don't have anything else. If we get more before 3.0, we still have time to deprecate and remove in 3.0.
If it is left in then does it need to be documented at the top level, too?