Add CASP-CAPRI initial support

amorehead · amorehead · commit ba789d72126c · 2021-09-17T14:57:12.000-05:00
diff --git a/project/datasets/builder/partition_dataset_filenames.py b/project/datasets/builder/partition_dataset_filenames.py
@@ -15,7 +15,7 @@
 
 @click.command()
 @click.argument('output_dir', default='../DIPS/final/raw', type=click.Path())
-@click.option('--source_type', default='rcsb', type=click.Choice(['rcsb', 'db5']))
+@click.option('--source_type', default='rcsb', type=click.Choice(['rcsb', 'db5', 'evcoupling', 'casp_capri']))
 @click.option('--filter_by_atom_count', '-f', default=False)
 @click.option('--max_atom_count', '-l', default=ATOM_COUNT_LIMIT)
 @click.option('--rank', '-r', default=0)
diff --git a/project/datasets/builder/postprocess_pruned_pairs.py b/project/datasets/builder/postprocess_pruned_pairs.py
@@ -19,7 +19,7 @@
 @click.option('--num_cpus', '-c', default=1)
 @click.option('--rank', '-r', default=0)
 @click.option('--size', '-s', default=1)
-@click.option('--source_type', default='rcsb', type=click.Choice(['rcsb', 'db5']))
+@click.option('--source_type', default='rcsb', type=click.Choice(['rcsb', 'db5', 'evcoupling', 'casp_capri']))
 def main(raw_pdb_dir: str, pruned_pairs_dir: str, external_feats_dir: str, output_dir: str,
          num_cpus: int, rank: int, size: int, source_type: str):
     """Run postprocess_pruned_pairs on all provided complexes."""
@@ -41,7 +41,7 @@ def main(raw_pdb_dir: str, pruned_pairs_dir: str, external_feats_dir: str, outpu
     produced_filenames = get_structures_filenames(output_dir, extension='.dill')
     produced_keys = [get_pdb_name(x) for x in produced_filenames]
     work_keys = [key for key in requested_keys if key not in produced_keys]
-    rscb_pruned_pair_ext = '.dill' if source_type.lower() == 'rcsb' else ''
+    rscb_pruned_pair_ext = '.dill' if source_type.lower() in ['rcsb', 'evcoupling', 'casp_capri'] else ''
     work_filenames = [os.path.join(pruned_pairs_dir, get_pdb_code(work_key)[1:3], work_key + rscb_pruned_pair_ext)
                       for work_key in work_keys]
     logger.info(f'Found {len(work_keys)} work pair(s) in {pruned_pairs_dir}')
@@ -58,7 +58,7 @@ def main(raw_pdb_dir: str, pruned_pairs_dir: str, external_feats_dir: str, outpu
         if not os.path.exists(sub_dir):
             os.mkdir(sub_dir)
         new_output_filename = sub_dir + '/' + get_pdb_name(pdb_filename) + ".dill" if \
-            source_type == 'rcsb' else \
+            source_type.lower() in ['rcsb', 'evcoupling', 'casp_capri'] else \
             sub_dir + '/' + get_pdb_name(pdb_filename)
         output_filenames.append(new_output_filename)
 
diff --git a/project/utils/utils.py b/project/utils/utils.py
@@ -600,6 +600,7 @@ def postprocess_pruned_pairs(raw_pdb_dir: str, external_feats_dir: str, pair_fil
 
 def postprocess_pruned_pair(raw_pdb_filenames: List[str], external_feats_dir: str, original_pair, source_type: str):
     """Construct a new Pair consisting of residues of structures with DSSP-derivable features and append DSSP secondary structure (SS) features to each protein structure dataframe as well."""
+    chains_selected = [original_pair.df0['chain'].iloc[0], original_pair.df1['chain'].iloc[0]]  # by position, not label
     df0_ss_values, df0_rsa_values, df0_rd_values, df0_protrusion_indices, \
     df0_hsaacs, df0_cn_values, df0_sequence_feats, df0_amide_norm_vecs, \
     df1_ss_values, df1_rsa_values, df1_rd_values, df1_protrusion_indices, \
@@ -612,7 +613,7 @@ def postprocess_pruned_pair(raw_pdb_filenames: List[str], external_feats_dir: st
     dssp_dicts, rd_dicts, psaia_dfs, coordinate_numbers_list, hsaac_matrices, sequence_feats_dfs = [], [], [], \
                                                                                                    [], [], []
     for struct_idx, raw_pdb_filename in enumerate(raw_pdb_filenames):
-        is_rcsb_complex = source_type.lower() == 'rcsb'
+        is_rcsb_complex = source_type.lower() in ['rcsb', 'evcoupling', 'casp_capri']
 
         # Extract the FASTA sequence(s) for a given PDB file
         sequences = find_fasta_sequences_for_pdb_file(sequences,
@@ -627,7 +628,8 @@ def postprocess_pruned_pair(raw_pdb_filenames: List[str], external_feats_dir: st
             # Derive BioPython structure and residues for the given PDB file
             structure = PDB_PARSER.get_structure(original_pair.complex, raw_pdb_filename)  # PDB structure
             # Filter out all hetero residues including waters to leave only amino and nucleic acids
-            residues = [residue for residue in Selection.unfold_entities(structure, 'R') if residue.get_id()[0] == ' ']
+            residues = [residue for residue in Selection.unfold_entities(structure, 'R')
+                        if residue.get_id()[0] == ' ' and residue.get_parent().id in chains_selected]
 
             # Extract DSSP secondary structure (SS) and relative solvent accessibility (RSA) values for the 1st model
             dssp_dict = get_dssp_dict_for_pdb_model(structure[0], raw_pdb_filename)  # Only for 1st model
@@ -1055,7 +1057,7 @@ def get_raw_pdb_filename_from_interim_filename(interim_filename: str, raw_pdb_di
     slash_tokens = pdb_name.split(os.path.sep)
     slash_dot_tokens = slash_tokens[-1].split(".")
     raw_pdb_filename = os.path.join(raw_pdb_dir, slash_tokens[-2], slash_dot_tokens[0]) + '.' + slash_dot_tokens[1] if \
-        source_type == 'rcsb' else \
+        source_type.lower() in ['rcsb', 'evcoupling', 'casp_capri'] else \
         os.path.join(raw_pdb_dir, slash_dot_tokens[0].split('_')[0], slash_dot_tokens[0]) + '.' + slash_dot_tokens[1]
     return raw_pdb_filename