@@ -600,6 +600,7 @@ def postprocess_pruned_pairs(raw_pdb_dir: str, external_feats_dir: str, pair_fil
600
600
601
601
def postprocess_pruned_pair (raw_pdb_filenames : List [str ], external_feats_dir : str , original_pair , source_type : str ):
602
602
"""Construct a new Pair consisting of residues of structures with DSSP-derivable features and append DSSP secondary structure (SS) features to each protein structure dataframe as well."""
603
+ chains_selected = [original_pair .df0 ['chain' ][0 ], original_pair .df1 ['chain' ][0 ]]
603
604
df0_ss_values , df0_rsa_values , df0_rd_values , df0_protrusion_indices , \
604
605
df0_hsaacs , df0_cn_values , df0_sequence_feats , df0_amide_norm_vecs , \
605
606
df1_ss_values , df1_rsa_values , df1_rd_values , df1_protrusion_indices , \
@@ -612,7 +613,7 @@ def postprocess_pruned_pair(raw_pdb_filenames: List[str], external_feats_dir: st
612
613
dssp_dicts , rd_dicts , psaia_dfs , coordinate_numbers_list , hsaac_matrices , sequence_feats_dfs = [], [], [], \
613
614
[], [], []
614
615
for struct_idx , raw_pdb_filename in enumerate (raw_pdb_filenames ):
615
- is_rcsb_complex = source_type .lower () == 'rcsb'
616
+ is_rcsb_complex = source_type .lower () in [ 'rcsb' , 'evcoupling' , 'casp_capri' ]
616
617
617
618
# Extract the FASTA sequence(s) for a given PDB file
618
619
sequences = find_fasta_sequences_for_pdb_file (sequences ,
@@ -627,7 +628,8 @@ def postprocess_pruned_pair(raw_pdb_filenames: List[str], external_feats_dir: st
627
628
# Derive BioPython structure and residues for the given PDB file
628
629
structure = PDB_PARSER .get_structure (original_pair .complex , raw_pdb_filename ) # PDB structure
629
630
# Filter out all hetero residues including waters to leave only amino and nucleic acids
630
- residues = [residue for residue in Selection .unfold_entities (structure , 'R' ) if residue .get_id ()[0 ] == ' ' ]
631
+ residues = [residue for residue in Selection .unfold_entities (structure , 'R' )
632
+ if residue .get_id ()[0 ] == ' ' and residue .get_parent ().id in chains_selected ]
631
633
632
634
# Extract DSSP secondary structure (SS) and relative solvent accessibility (RSA) values for the 1st model
633
635
dssp_dict = get_dssp_dict_for_pdb_model (structure [0 ], raw_pdb_filename ) # Only for 1st model
@@ -1055,7 +1057,7 @@ def get_raw_pdb_filename_from_interim_filename(interim_filename: str, raw_pdb_di
1055
1057
slash_tokens = pdb_name .split (os .path .sep )
1056
1058
slash_dot_tokens = slash_tokens [- 1 ].split ("." )
1057
1059
raw_pdb_filename = os .path .join (raw_pdb_dir , slash_tokens [- 2 ], slash_dot_tokens [0 ]) + '.' + slash_dot_tokens [1 ] if \
1058
- source_type == 'rcsb' else \
1060
+ source_type . lower () in [ 'rcsb' , 'evcoupling' , 'casp_capri' ] else \
1059
1061
os .path .join (raw_pdb_dir , slash_dot_tokens [0 ].split ('_' )[0 ], slash_dot_tokens [0 ]) + '.' + slash_dot_tokens [1 ]
1060
1062
return raw_pdb_filename
1061
1063
0 commit comments