Skip to content

Commit ba789d7

Browse files
committed
Add CASP-CAPRI initial support
1 parent 5077f12 commit ba789d7

File tree

3 files changed

+9
-7
lines changed

3 files changed

+9
-7
lines changed

project/datasets/builder/partition_dataset_filenames.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@
1515

1616
@click.command()
1717
@click.argument('output_dir', default='../DIPS/final/raw', type=click.Path())
18-
@click.option('--source_type', default='rcsb', type=click.Choice(['rcsb', 'db5']))
18+
@click.option('--source_type', default='rcsb', type=click.Choice(['rcsb', 'db5', 'evcoupling', 'casp_capri']))
1919
@click.option('--filter_by_atom_count', '-f', default=False)
2020
@click.option('--max_atom_count', '-l', default=ATOM_COUNT_LIMIT)
2121
@click.option('--rank', '-r', default=0)

project/datasets/builder/postprocess_pruned_pairs.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919
@click.option('--num_cpus', '-c', default=1)
2020
@click.option('--rank', '-r', default=0)
2121
@click.option('--size', '-s', default=1)
22-
@click.option('--source_type', default='rcsb', type=click.Choice(['rcsb', 'db5']))
22+
@click.option('--source_type', default='rcsb', type=click.Choice(['rcsb', 'db5', 'evcoupling', 'casp_capri']))
2323
def main(raw_pdb_dir: str, pruned_pairs_dir: str, external_feats_dir: str, output_dir: str,
2424
num_cpus: int, rank: int, size: int, source_type: str):
2525
"""Run postprocess_pruned_pairs on all provided complexes."""
@@ -41,7 +41,7 @@ def main(raw_pdb_dir: str, pruned_pairs_dir: str, external_feats_dir: str, outpu
4141
produced_filenames = get_structures_filenames(output_dir, extension='.dill')
4242
produced_keys = [get_pdb_name(x) for x in produced_filenames]
4343
work_keys = [key for key in requested_keys if key not in produced_keys]
44-
rscb_pruned_pair_ext = '.dill' if source_type.lower() == 'rcsb' else ''
44+
rscb_pruned_pair_ext = '.dill' if source_type.lower() in ['rcsb', 'evcoupling', 'casp_capri'] else ''
4545
work_filenames = [os.path.join(pruned_pairs_dir, get_pdb_code(work_key)[1:3], work_key + rscb_pruned_pair_ext)
4646
for work_key in work_keys]
4747
logger.info(f'Found {len(work_keys)} work pair(s) in {pruned_pairs_dir}')
@@ -58,7 +58,7 @@ def main(raw_pdb_dir: str, pruned_pairs_dir: str, external_feats_dir: str, outpu
5858
if not os.path.exists(sub_dir):
5959
os.mkdir(sub_dir)
6060
new_output_filename = sub_dir + '/' + get_pdb_name(pdb_filename) + ".dill" if \
61-
source_type == 'rcsb' else \
61+
source_type in ['rcsb', 'evcoupling', 'casp_capri'] else \
6262
sub_dir + '/' + get_pdb_name(pdb_filename)
6363
output_filenames.append(new_output_filename)
6464

project/utils/utils.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -600,6 +600,7 @@ def postprocess_pruned_pairs(raw_pdb_dir: str, external_feats_dir: str, pair_fil
600600

601601
def postprocess_pruned_pair(raw_pdb_filenames: List[str], external_feats_dir: str, original_pair, source_type: str):
602602
"""Construct a new Pair consisting of residues of structures with DSSP-derivable features and append DSSP secondary structure (SS) features to each protein structure dataframe as well."""
603+
chains_selected = [original_pair.df0['chain'][0], original_pair.df1['chain'][0]]
603604
df0_ss_values, df0_rsa_values, df0_rd_values, df0_protrusion_indices, \
604605
df0_hsaacs, df0_cn_values, df0_sequence_feats, df0_amide_norm_vecs, \
605606
df1_ss_values, df1_rsa_values, df1_rd_values, df1_protrusion_indices, \
@@ -612,7 +613,7 @@ def postprocess_pruned_pair(raw_pdb_filenames: List[str], external_feats_dir: st
612613
dssp_dicts, rd_dicts, psaia_dfs, coordinate_numbers_list, hsaac_matrices, sequence_feats_dfs = [], [], [], \
613614
[], [], []
614615
for struct_idx, raw_pdb_filename in enumerate(raw_pdb_filenames):
615-
is_rcsb_complex = source_type.lower() == 'rcsb'
616+
is_rcsb_complex = source_type.lower() in ['rcsb', 'evcoupling', 'casp_capri']
616617

617618
# Extract the FASTA sequence(s) for a given PDB file
618619
sequences = find_fasta_sequences_for_pdb_file(sequences,
@@ -627,7 +628,8 @@ def postprocess_pruned_pair(raw_pdb_filenames: List[str], external_feats_dir: st
627628
# Derive BioPython structure and residues for the given PDB file
628629
structure = PDB_PARSER.get_structure(original_pair.complex, raw_pdb_filename) # PDB structure
629630
# Filter out all hetero residues including waters to leave only amino and nucleic acids
630-
residues = [residue for residue in Selection.unfold_entities(structure, 'R') if residue.get_id()[0] == ' ']
631+
residues = [residue for residue in Selection.unfold_entities(structure, 'R')
632+
if residue.get_id()[0] == ' ' and residue.get_parent().id in chains_selected]
631633

632634
# Extract DSSP secondary structure (SS) and relative solvent accessibility (RSA) values for the 1st model
633635
dssp_dict = get_dssp_dict_for_pdb_model(structure[0], raw_pdb_filename) # Only for 1st model
@@ -1055,7 +1057,7 @@ def get_raw_pdb_filename_from_interim_filename(interim_filename: str, raw_pdb_di
10551057
slash_tokens = pdb_name.split(os.path.sep)
10561058
slash_dot_tokens = slash_tokens[-1].split(".")
10571059
raw_pdb_filename = os.path.join(raw_pdb_dir, slash_tokens[-2], slash_dot_tokens[0]) + '.' + slash_dot_tokens[1] if \
1058-
source_type == 'rcsb' else \
1060+
source_type.lower() in ['rcsb', 'evcoupling', 'casp_capri'] else \
10591061
os.path.join(raw_pdb_dir, slash_dot_tokens[0].split('_')[0], slash_dot_tokens[0]) + '.' + slash_dot_tokens[1]
10601062
return raw_pdb_filename
10611063

0 commit comments

Comments
 (0)