Use unstitched_cascade.csv for proviral pipeline

Donaim · Donaim · commit e3efdea2eb2a · 2024-10-31T15:46:18.000-07:00
diff --git a/Singularity b/Singularity
@@ -157,7 +157,7 @@ From: python:3.8
         conseq_all_csv concordance_csv concordance_seed_csv failed_align_csv \
         coverage_scores_csv coverage_maps_tar aligned_csv g2p_aligned_csv \
         genome_coverage_csv genome_coverage_svg genome_concordance_svg \
-        unstitched_conseq_csv unstitched_contigs_csv contigs_csv \
+        unstitched_cascade_csv unstitched_conseq_csv unstitched_contigs_csv contigs_csv \
         read_entropy_csv conseq_region_csv conseq_stitched_csv
     KIVE_THREADS 2
     KIVE_MEMORY 6000
diff --git a/docs/steps.md b/docs/steps.md
@@ -353,6 +353,13 @@ Individual files are described after the list of steps.
 * unstitched_conseq.csv
   * region - the region mapped to
   * sequence - the consensus sequence used
+* unstitched_cascade.csv - number of read pairs that flow through the pipeline steps
+  * demultiplexed - count from the raw FASTQ
+  * v3loop - aligned with V3LOOP
+  * g2p - valid reads to count in G2P
+  * prelim_map - mapped to other references on first pass
+  * remap - mapped to other references after remapping
+  * aligned - aligned with a reference and merged with mate
 * resistance.csv
   * region - the region code, like PR or RT
   * drug_class - the drug class code from the HIVdb rules, like NRTI
diff --git a/micall/monitor/kive_watcher.py b/micall/monitor/kive_watcher.py
@@ -59,6 +59,7 @@
                       'resistance_consensus_csv',
                       'wg_fasta',
                       'mid_fasta',
+                      'unstitched_cascade_csv',
                       'unstitched_conseq_csv',
                       'unstitched_contigs_csv',
                       'contigs_csv',
@@ -939,12 +940,13 @@ def run_proviral_pipeline(self, sample_watcher, folder_watcher, description):
             run_dataset['argument_name']: run_dataset['dataset']
             for run_dataset in main_run['datasets']
             if run_dataset['argument_name'] in ('sample_info_csv',
+                                                'unstitched_cascade_csv',
                                                 'unstitched_conseq_csv',
-                                                'unstitched_contigs_csv',
-                                                'cascade_csv')}
+                                                'unstitched_contigs_csv')}
         input_datasets = {
             argument_name: self.kive_retry(lambda: self.session.get(url).json())
             for argument_name, url in input_dataset_urls.items()}
+        input_datasets['cascade_csv'] = input_datasets.pop('unstitched_cascade_csv')
         input_datasets['conseqs_csv'] = input_datasets.pop('unstitched_conseq_csv')
         input_datasets['contigs_csv'] = input_datasets.pop('unstitched_contigs_csv')
         run = self.find_or_launch_run(
diff --git a/micall/tests/test_kive_watcher.py b/micall/tests/test_kive_watcher.py
@@ -1765,7 +1765,7 @@ def test_launch_proviral_run(raw_data_with_two_samples, mock_open_kive):
               argument_name='unstitched_conseq_csv'),
          dict(dataset='/datasets/113/',
               argument_type='O',
-              argument_name='cascade_csv')]]  # run datasets
+              argument_name='unstitched_cascade_csv')]]  # run datasets
     mock_session.get.return_value.json.side_effect = [
         dict(url='/datasets/110/', id=110),
         dict(url='/datasets/111/', id=111),
diff --git a/micall_docker.py b/micall_docker.py
@@ -1099,7 +1099,8 @@ def collate_samples(run_info: RunInfo):
                  'concordance.csv',
                  'concordance_seed.csv']
     if run_info.is_denovo:
-        filenames += ['conseq_stitched.csv', 'conseq_region.csv', 'unstitched_conseq.csv']
+        filenames += ['conseq_stitched.csv', 'conseq_region.csv',
+                      'unstitched_cascade.csv', 'unstitched_conseq.csv', 'unstitched_contigs.csv']
     for filename in filenames:
         out_path = run_info.output_path
         with open(os.path.join(out_path, filename), 'w') as fout:
diff --git a/micall_kive.py b/micall_kive.py
@@ -80,6 +80,9 @@ def parse_args():
                         action='store_true',
                         help='Use de novo assembly instead of mapping to '
                              'reference sequences.')
+    parser.add_argument('unstitched_cascade_csv',
+                        nargs='?',
+                        help='count of reads at each step')
     parser.add_argument('unstitched_conseq_csv',
                         nargs='?',
                         help='CSV containing mapping unstitched consensus sequences')
diff --git a/release_test_microtest.py b/release_test_microtest.py
@@ -465,6 +465,7 @@ def process_sample(self, fastq_file: Path):
                 'genome_coverage.csv',
                 'genome_coverage.svg',
                 'genome_concordance.svg',
+                'unstitched_cascade.csv',
                 'unstitched_conseq.csv',
                 'unstitched_contigs.csv',
                 'contigs.csv',