WIP: create proviral landscape output, for #16

CBeelen · CBeelen · commit e1da72bb57b2 · 2023-02-15T12:36:46.000-08:00
diff --git a/Singularity b/Singularity
@@ -13,7 +13,7 @@ From: centos:7
     MAINTAINER BC CfE in HIV/AIDS https://github.com/cfe-lab/
     KIVE_INPUTS sample_info_csv contigs_csv conseqs_csv cascade_csv
     KIVE_OUTPUTS outcome_summary_csv conseqs_primers_csv contigs_primers_csv \
-        table_precursor_csv hivseqinr_results_tar
+        table_precursor_csv proviral_landscape_csv hivseqinr_results_tar
     KIVE_THREADS 1
     KIVE_MEMORY 6000
 
diff --git a/gene_splicer/sample.py b/gene_splicer/sample.py
@@ -40,6 +40,9 @@ def parse_args():
     parser.add_argument('table_precursor_csv',
                         help='Sequence data ready to upload',
                         type=FileType('w'))
+    parser.add_argument('proviral_landscape_csv',
+                        help='Data for proviral landscape plot',
+                        type=FileType('w'))
     parser.add_argument('hivseqinr_results_tar',
                         help="Archive file with HIVSeqinR's final results folder.",
                         type=FileType('wb'))
@@ -100,12 +103,14 @@ def main():
     for file in fasta_files:
         gene_splicer.run(file, outdir=outpath)
     utils.generate_table_precursor(name=run_name, outpath=outpath)
+    utils.generate_proviral_landscape_csv(name=run_name, outpath=outpath)
     copy_output(outpath / 'outcome_summary.csv', args.outcome_summary_csv)
     copy_output(outpath / (run_name + '_conseqs_primer_analysis.csv'),
                 args.conseqs_primers_csv)
     copy_output(outpath / (run_name + '_contigs_primer_analysis.csv'),
                 args.contigs_primers_csv)
     copy_output(outpath / 'table_precursor.csv', args.table_precursor_csv)
+    copy_output(outpath / 'proviral_landscape.csv', args.proviral_landscape_csv)
 
 
 if __name__ == '__main__':
diff --git a/gene_splicer/utils.py b/gene_splicer/utils.py
@@ -1,3 +1,4 @@
+import csv
 import logging
 import os
 import re
@@ -493,6 +494,65 @@ def generate_table_precursor_2(hivseqinr_resultsfile, filtered_file,
     return table_precursorfile
 
 
+def generate_proviral_landscape_csv(outpath, name=None):
+    """Write proviral_landscape.csv from blastn hits and sample verdicts.
+
+    Reads HIVSeqinR's tab-delimited blastn output and table_precursor.csv
+    under outpath, then writes one row per blastn hit of a sample.
+
+    :param outpath: folder holding table_precursor.csv and hivseqinr*/
+    :param name: run name, accepted because main() passes it; unused
+    :return: path of the file that was written
+    :raises IndexError: if no blastn output file is found under outpath
+    """
+    proviral_landscape_csv = os.path.join(outpath, 'proviral_landscape.csv')
+    table_precursor_csv = os.path.join(outpath, 'table_precursor.csv')
+    blastn_csv = glob.glob(
+        os.path.join(
+            outpath,
+            'hivseqinr*',
+            'Results_Intermediate',
+            'Output_Blastn_HXB2MEGA28_tabdelim.txt'
+        )
+    )[0]
+
+    # Column names are supplied here, so the first line is read as data.
+    blastn_columns = ['qseqid', 'qlen', 'sseqid', 'sgi', 'slen', 'qstart',
+                      'qend', 'sstart', 'send', 'evalue', 'bitscore',
+                      'length', 'pident', 'nident', 'btop', 'stitle',
+                      'sstrand']
+
+    # Map each sample to its verdict; a later row replaces an earlier one.
+    with open(table_precursor_csv, 'r') as tab_prec:
+        verdicts = {row['sample']: row['MyVerdict']
+                    for row in csv.DictReader(tab_prec)}
+
+    landscape_rows = []
+    with open(blastn_csv, 'r') as blastn_file:
+        blastn_reader = csv.DictReader(blastn_file, fieldnames=blastn_columns,
+                                       delimiter='\t')
+        for row in blastn_reader:
+            if row['qseqid'] in ('8E5LAV', 'HXB2'):
+                # skip the positive control rows
+                continue
+            name_parts = row['qseqid'].split('::')
+            if len(name_parts) != 4:
+                # TODO: confirm this skips the weird entries at start and end
+                continue
+            run_name, sample_name, _reference, _seqtype = name_parts
+            landscape_rows.append({'samp_id': f"{run_name}::{sample_name}",
+                                   'ref_start': row['sstart'],
+                                   'ref_end': row['send'],
+                                   'defect': verdicts.get(sample_name, '')})
+
+    landscape_columns = ['samp_id', 'ref_start', 'ref_end', 'defect']
+    with open(proviral_landscape_csv, 'w', newline='') as landscape_file:
+        writer = csv.DictWriter(landscape_file, fieldnames=landscape_columns)
+        writer.writeheader()
+        writer.writerows(landscape_rows)
+    return proviral_landscape_csv
+
+
 def get_softclipped_region(query, alignment, alignment_path):
     try:
         size, op = alignment.iloc[0]['cigar'][0]