|
| 1 | +import csv |
1 | 2 | import logging
|
2 | 3 | import os
|
3 | 4 | import re
|
@@ -493,6 +494,65 @@ def generate_table_precursor_2(hivseqinr_resultsfile, filtered_file,
|
493 | 494 | return table_precursorfile
|
494 | 495 |
|
495 | 496 |
|
def generate_proviral_landscape_csv(outpath):
    """Write ``proviral_landscape.csv`` summarising BLAST hits per sample.

    Reads the HIVSeqinR BLAST table found under
    ``<outpath>/hivseqinr*/Results_Intermediate/`` and
    ``<outpath>/table_precursor.csv``, then writes one row per BLAST hit
    with the columns ``samp_id``, ``ref_start``, ``ref_end`` and ``defect``.

    Args:
        outpath: Directory holding the HIVSeqinR results and
            ``table_precursor.csv``; the landscape CSV is written here too.

    Returns:
        The path of the CSV file that was written.

    Raises:
        IndexError: If no BLAST table matches the expected glob pattern.
        FileNotFoundError: If ``table_precursor.csv`` does not exist.
        KeyError: If ``table_precursor.csv`` lacks the ``sample`` or
            ``MyVerdict`` column.
    """
    # Imported locally so the function works even if the module itself
    # does not import ``glob`` at the top of the file.
    import glob

    proviral_landscape_csv = os.path.join(outpath, 'proviral_landscape.csv')
    table_precursor_csv = os.path.join(outpath, 'table_precursor.csv')
    # Only the first matching HIVSeqinR directory is used.
    blastn_csv = glob.glob(
        os.path.join(
            outpath,
            'hivseqinr*',
            'Results_Intermediate',
            'Output_Blastn_HXB2MEGA28_tabdelim.txt'
        )
    )[0]

    # The BLAST table has no header row, so the column names are supplied.
    blastn_columns = ['qseqid',
                      'qlen',
                      'sseqid',
                      'sgi',
                      'slen',
                      'qstart',
                      'qend',
                      'sstart',
                      'send',
                      'evalue',
                      'bitscore',
                      'length',
                      'pident',
                      'nident',
                      'btop',
                      'stitle',
                      'sstrand']
    positive_controls = ('8E5LAV', 'HXB2')

    landscape_rows = []
    with open(blastn_csv, 'r', newline='') as blastn_file:
        # The file name says "tabdelim", so split on tabs rather than the
        # csv module's default comma.
        blastn_reader = csv.DictReader(blastn_file,
                                       fieldnames=blastn_columns,
                                       delimiter='\t')
        for row in blastn_reader:
            query_id = row['qseqid']
            if query_id in positive_controls:
                # skip the positive control rows
                continue
            name_parts = query_id.split('::')
            if len(name_parts) != 4:
                # Skip entries whose id is not of the form
                # run::sample::reference::seqtype (the "weird entries"
                # mentioned in the original TODO).
                # NOTE(review): confirm this is the intended filter.
                continue
            run_name, sample_name, _reference, _seqtype = name_parts
            landscape_rows.append({'ref_start': row['sstart'],
                                   'ref_end': row['send'],
                                   'samp_name': sample_name,
                                   'samp_id': f"{run_name}::{sample_name}"})

    # Map sample name -> verdict once; if a sample appears several times
    # the last verdict wins, as it did with the original nested loop.
    with open(table_precursor_csv, 'r', newline='') as tab_prec:
        verdicts = {row['sample']: row['MyVerdict']
                    for row in csv.DictReader(tab_prec)}
    for entry in landscape_rows:
        if entry['samp_name'] in verdicts:
            entry['defect'] = verdicts[entry['samp_name']]

    landscape_columns = ['samp_id', 'ref_start', 'ref_end', 'defect']
    with open(proviral_landscape_csv, 'w', newline='') as landscape_file:
        # 'samp_name' is only a join key, so it is dropped on output;
        # hits without a verdict get an empty 'defect' cell.
        landscape_writer = csv.DictWriter(landscape_file,
                                          fieldnames=landscape_columns,
                                          extrasaction='ignore')
        landscape_writer.writeheader()
        landscape_writer.writerows(landscape_rows)

    return proviral_landscape_csv
| 555 | + |
496 | 556 | def get_softclipped_region(query, alignment, alignment_path):
|
497 | 557 | try:
|
498 | 558 | size, op = alignment.iloc[0]['cigar'][0]
|
|
0 commit comments