- 
                Notifications
    
You must be signed in to change notification settings  - Fork 734
 
fix inclusion of PRO in secondary structure #5065
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from 4 commits
005100f
              df31c86
              87810eb
              cf1f6aa
              e9df025
              File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| 
          
            
          
           | 
    @@ -74,7 +74,7 @@ def _unfold(a: np.ndarray, window: int, axis: int): | |
| 
     | 
||
| 
     | 
||
| def _get_hydrogen_atom_position(coord: np.ndarray) -> np.ndarray: | ||
| """Fills in hydrogen atoms positions if they are abscent, under the | ||
| """Fills in hydrogen atoms positions if they are absent, under the | ||
        
    
 | 
||
| assumption that C-N-H and H-N-CA angles are perfect 120 degrees, | ||
| and N-H bond length is 1.01 A. | ||
| 
     | 
||
| 
          
            
          
           | 
    @@ -118,6 +118,7 @@ def _get_hydrogen_atom_position(coord: np.ndarray) -> np.ndarray: | |
| 
     | 
||
| def get_hbond_map( | ||
| coord: np.ndarray, | ||
| donor_mask: np.ndarray = None, | ||
| cutoff: float = DEFAULT_CUTOFF, | ||
| margin: float = DEFAULT_MARGIN, | ||
| return_e: bool = False, | ||
| 
        
          
        
         | 
    @@ -130,6 +131,12 @@ def get_hbond_map( | |
| input coordinates in either (n, 4, 3) or (n, 5, 3) shape | ||
| (without or with hydrogens). If hydrogens are not present, then | ||
| ideal positions (see :func:_get_hydrogen_atom_positions) are used. | ||
| donor_mask : np.array | ||
| Mask out any hydrogens that should not be considered (in particular HN | ||
| in PRO). If ``None`` then all H will be used (behavior up to 2.9.0). | ||
| 
         
      Comment on lines
    
      +135
     to 
      +136
    
   
  There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These docs should be more specific and state the shape. I just quickly guessed the shape from https://github.com/ShintaroMinami/PyDSSP/blob/e251a43ff8622fe0a555313b1567edce45e789e8/scripts/pydssp#L30 donor_mask = sequence != 'PRO' if args.ignore_proline_donor else None | 
||
| 
     | 
||
| .. versionadded:: 2.10.0 | ||
| 
     | 
||
| cutoff : float, optional | ||
| cutoff, by default DEFAULT_CUTOFF | ||
| margin : float, optional | ||
| 
        
          
        
         | 
    @@ -144,6 +151,10 @@ def get_hbond_map( | |
| 
     | 
||
| 
     | 
||
| .. versionadded:: 2.8.0 | ||
| 
     | 
||
| .. versionchanged:: 2.10.0 | ||
| Support masking of hydrogen donors via `donor_mask` (especially needed | ||
| for ignoring HN on proline residues). Backport of PRO fix from pydssp 0.9.1. | ||
| """ | ||
| n_atoms, n_atom_types, _ = coord.shape | ||
| assert n_atom_types in ( | ||
| 
          
            
          
           | 
    @@ -194,15 +205,26 @@ def get_hbond_map( | |
| local_mask = ~np.eye(n_atoms, dtype=bool) | ||
| local_mask *= ~np.diag(np.ones(n_atoms - 1, dtype=bool), k=-1) | ||
| local_mask *= ~np.diag(np.ones(n_atoms - 2, dtype=bool), k=-2) | ||
| # mask for donor H absence (Proline) | ||
| donor_mask = ( | ||
| np.array(donor_mask).astype(float) | ||
| if donor_mask is not None | ||
| else np.ones(n_atoms, dtype=float) | ||
| ) | ||
| donor_mask = np.tile(donor_mask[:, np.newaxis], (1, n_atoms)) | ||
| 
         There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is the donor_mask (one element for each residue) really correct for this tiling????  | 
||
| # hydrogen bond map (continuous value extension of original definition) | ||
| hbond_map = np.clip(cutoff - margin - e, a_min=-margin, a_max=margin) | ||
| hbond_map = (np.sin(hbond_map / margin * np.pi / 2) + 1.0) / 2 | ||
| hbond_map = hbond_map * local_mask | ||
| hbond_map *= local_mask | ||
| hbond_map *= donor_mask | ||
| 
         There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this correct? The original code uses https://github.com/ShintaroMinami/PyDSSP/blob/e251a43ff8622fe0a555313b1567edce45e789e8/pydssp/pydssp_numpy.py#L72 hbond_map = hbond_map * repeat(donor_mask, 'l1 l2 -> b l1 l2', b=b)(with   | 
||
| 
     | 
||
| return hbond_map | ||
| 
     | 
||
| 
     | 
||
| def assign(coord: np.ndarray) -> np.ndarray: | ||
| def assign( | ||
| coord: np.ndarray, | ||
| donor_mask: np.ndarray = None, | ||
| ) -> np.ndarray: | ||
| """Assigns secondary structure for a given coordinate array, | ||
| either with or without assigned hydrogens | ||
| 
     | 
||
| 
        
          
        
         | 
    @@ -214,6 +236,12 @@ def assign(coord: np.ndarray) -> np.ndarray: | |
| (N, CA, C, O) atoms coordinates (if k=4), or (N, CA, C, O, H) coordinates | ||
| (when k=5). | ||
| 
     | 
||
| donor_mask : np.array | ||
| Mask out any hydrogens that should not be considered (in particular HN | ||
| in PRO). If ``None`` then all H will be used (behavior up to 2.9.0). | ||
| 
     | 
||
| .. versionadded:: 2.10.0 | ||
| 
     | 
||
| Returns | ||
| ------- | ||
| np.ndarray | ||
| 
        
          
        
         | 
    @@ -222,9 +250,13 @@ def assign(coord: np.ndarray) -> np.ndarray: | |
| 
     | 
||
| 
     | 
||
| .. versionadded:: 2.8.0 | ||
| 
     | 
||
| .. versionchanged:: 2.10.0 | ||
| Support masking of donors. | ||
| 
     | 
||
| """ | ||
| # get hydrogen bond map | ||
| hbmap = get_hbond_map(coord) | ||
| hbmap = get_hbond_map(coord, donor_mask=donor_mask) | ||
| hbmap = np.swapaxes(hbmap, -1, -2) # convert into "i:C=O, j:N-H" form | ||
| 
     | 
||
| # identify turn 3, 4, 5 | ||
| 
          
            
          
           | 
    ||
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 
          
            
          
           | 
    @@ -13,15 +13,52 @@ | |||||||||||
| "pdb_filename", glob.glob(f"{DSSP_FOLDER}/?????.pdb.gz") | ||||||||||||
| ) | ||||||||||||
| def test_file_guess_hydrogens(pdb_filename, client_DSSP): | ||||||||||||
| # run 2.9.0 tests (which include PRO) | ||||||||||||
| # ignore_proline_donor=False | ||||||||||||
| # TODO: update reference data for ignore_proline_donor=True | ||||||||||||
| 
         There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should really have correct reference data. About half of the files do not show a difference between   | 
||||||||||||
| u = mda.Universe(pdb_filename) | ||||||||||||
| with open(f"{pdb_filename.rstrip('.gz')}.dssp", "r") as fin: | ||||||||||||
| correct_answ = fin.read().strip().split()[0] | ||||||||||||
| 
     | 
||||||||||||
| run = DSSP(u, guess_hydrogens=True).run(**client_DSSP) | ||||||||||||
| run = DSSP(u, guess_hydrogens=True, ignore_proline_donor=False).run( | ||||||||||||
| **client_DSSP | ||||||||||||
| ) | ||||||||||||
| answ = "".join(run.results.dssp[0]) | ||||||||||||
| assert answ == correct_answ | ||||||||||||
| 
     | 
||||||||||||
| 
     | 
||||||||||||
| @pytest.mark.parametrize( | ||||||||||||
| "pdb_filename", | ||||||||||||
| [ | ||||||||||||
| f"{DSSP_FOLDER}/{PDBID}.pdb.gz" | ||||||||||||
| for PDBID in ( | ||||||||||||
| "1eteA", | ||||||||||||
| "3aqgA", | ||||||||||||
| "3gknA", | ||||||||||||
| "3nzmA", | ||||||||||||
| "3a4rA", | ||||||||||||
| "3l4rA", | ||||||||||||
| "2j49A", | ||||||||||||
| "3gfsA", | ||||||||||||
| ) | ||||||||||||
| ], | ||||||||||||
| ) | ||||||||||||
| def test_file_ignore_proline_donor(pdb_filename, client_DSSP): | ||||||||||||
| # weak test: just check that for some structures the two methods give different results | ||||||||||||
| u = mda.Universe(pdb_filename) | ||||||||||||
| 
     | 
||||||||||||
| run_with_pro = DSSP( | ||||||||||||
| u, guess_hydrogens=True, ignore_proline_donor=False | ||||||||||||
| ).run(**client_DSSP) | ||||||||||||
| answ_with_pro = "".join(run_with_pro.results.dssp[0]) | ||||||||||||
| 
     | 
||||||||||||
| # ignore_proline_donor=True is the default: | ||||||||||||
| run_ignore_pro = DSSP(u, guess_hydrogens=True).run(**client_DSSP) | ||||||||||||
| answ_ignore_pro = "".join(run_ignore_pro.results.dssp[0]) | ||||||||||||
| 
     | 
||||||||||||
| assert answ_ignore_pro != answ_with_pro | ||||||||||||
| 
     | 
||||||||||||
| 
     | 
||||||||||||
| def test_trajectory(client_DSSP): | ||||||||||||
| u = mda.Universe(TPR, XTC).select_atoms("protein").universe | ||||||||||||
| run = DSSP(u).run(**client_DSSP, stop=10) | ||||||||||||
| 
        
          
        
         | 
    @@ -32,8 +69,6 @@ def test_trajectory(client_DSSP): | |||||||||||
| assert ( | ||||||||||||
| first_frame[:10] != last_frame[:10] == avg_frame[:10] == "-EEEEEE---" | ||||||||||||
| ) | ||||||||||||
| 
         
      Comment on lines
    
      69
     to 
      71
    
   
  
    
 | 
||||||||||||
| assert ( | |
| first_frame[:10] != last_frame[:10] == avg_frame[:10] == "-EEEEEE---" | |
| ) | |
| assert first_frame[:10] != last_frame[:10] | |
| assert last_frame[:10] == avg_frame[:10] == "-EEEEEE---" | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These lines seemed superfluous as the atomgroup approach is tested separately.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This may not be correct. The code runs ... but I am not sure if I should be masking corresponding atoms.