-from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+import argparse
 import logging
 import shutil
 import sys
 import os
 import tempfile
 import gzip
+import csv
+from typing import TextIO, List, Optional, Dict, IO, Set, Tuple

 from micall.core.trim_fastqs import TrimSteps, trim


 logging.basicConfig(level=logging.INFO,
                     format='%(asctime)s[%(levelname)s]%(name)s.%(funcName)s(): %(message)s')
-logger = logging.getLogger('micall')
+logger = logging.getLogger(__name__)


-def concatenate_files(input_file1, input_file2, output_file):
-    with open(input_file1, 'rb') as src1, \
-            open(input_file2, 'rb') as src2, \
-            open(output_file, 'wb') as dst:
+class AmbiguousBadCycles(ValueError): pass
+class BadTrimplanFieldNames(ValueError): pass

-        shutil.copyfileobj(src1, dst)
-        shutil.copyfileobj(src2, dst)

+def concatenate_files(input_files: List[str], output_file: str) -> None:
+    with open(output_file, 'wb') as dst:
+        for input_file in input_files:
+            with open(input_file, 'rb') as src:
+                shutil.copyfileobj(src, dst)

-def merge_fastqs(args):
-    with tempfile.NamedTemporaryFile() as trimmed_fastq1_a, \
-            tempfile.NamedTemporaryFile() as trimmed_fastq2_a, \
-            tempfile.NamedTemporaryFile() as trimmed_fastq1_b, \
-            tempfile.NamedTemporaryFile() as trimmed_fastq2_b:

-        logger.info('Processing reads of Sample A.')
+def parse_inputs_and_merge_fastqs(trim_file: TextIO, mergeplan_file: TextIO, zip_file: Optional[TextIO]) -> None:
+    mergeplan: Dict[str, List[str]] = {}
+    trimplan: Set[Tuple[str, ...]] = set()
+    zipped: Set[str] = set()
+    bad_cycles: Dict[str, str] = {}

-        trim((args.fastq1_a, args.fastq2_a),
-             args.bad_cycles_a_csv,
-             (trimmed_fastq1_a.name, trimmed_fastq2_a.name),
-             use_gzip=not args.unzipped)
+    mergeplan_reader = csv.DictReader(mergeplan_file)
+    trim_reader = csv.DictReader(trim_file)
+    zip_reader = csv.DictReader(zip_file) if zip_file is not None else None

-        logger.info('Processing reads of Sample B.')
+    for row in mergeplan_reader:
+        input_path = row["input"]
+        output_path = row["output"]

-        trim((args.fastq1_b, args.fastq2_b),
-             args.bad_cycles_b_csv,
-             (trimmed_fastq1_b.name, trimmed_fastq2_b.name),
-             use_gzip=not args.unzipped)
+        if output_path not in mergeplan:
+            mergeplan[output_path] = []
+        mergeplan[output_path].append(input_path)

-        logger.info('Merging resulting reads files.')
+    no_badcycles_fields = list(sorted(
+        (field for field in trim_reader.fieldnames or [] if field != "bad_cycles"),
+        key=lambda field: field.lower()))
+    expected_no_badcycles_fields = [f"r{i + 1}" for i in range(len(no_badcycles_fields))]

-        os.makedirs(os.path.dirname(args.fastq1_result), exist_ok=True)
-        os.makedirs(os.path.dirname(args.fastq2_result), exist_ok=True)
+    if [field.lower() for field in no_badcycles_fields] \
+            != expected_no_badcycles_fields:
+        raise BadTrimplanFieldNames(f"Bad field names: {no_badcycles_fields}, expected {expected_no_badcycles_fields}")

-        if args.unzipped:
-            concatenate_files(trimmed_fastq1_a.name, trimmed_fastq1_b.name,
-                              args.fastq1_result)
-            concatenate_files(trimmed_fastq2_a.name, trimmed_fastq2_b.name,
-                              args.fastq2_result)
+    for row in trim_reader:
+        input_paths = tuple(row[field] for field in no_badcycles_fields)
+        trimplan.add(input_paths)
+        bad_cycles_path = row.get("bad_cycles", "")
+        if bad_cycles_path:
+            for input_path in input_paths:
+                bad_cycles[input_path] = bad_cycles_path

-        else:
-            with tempfile.NamedTemporaryFile() as temp_fastq1, \
-                    tempfile.NamedTemporaryFile() as temp_fastq2:
+    if zip_reader is not None:
+        for row in zip_reader:
+            zipped.add(row["file"])

-                temp_fastq1.close()
-                temp_fastq2.close()
+    return merge_fastqs(trimplan, mergeplan, zipped, bad_cycles)

-                concatenate_files(trimmed_fastq1_a.name, trimmed_fastq1_b.name,
-                                  temp_fastq1.name)
-                concatenate_files(trimmed_fastq2_a.name, trimmed_fastq2_b.name,
-                                  temp_fastq2.name)

-                logger.info('Compressing final outputs.')
+def compress_file(input_path: str, output_path: str) -> None:
+    with open(input_path, 'rb') as input_file, \
+         open(output_path, 'wb') as output_file:
+        with gzip.GzipFile(fileobj=output_file, mode='wb') as gzip_file:
+            shutil.copyfileobj(input_file, gzip_file)

-                with open(temp_fastq1.name, 'rb') as f_in, gzip.open(args.fastq1_result, 'wb') as f_out:
-                    shutil.copyfileobj(f_in, f_out)

-                with open(temp_fastq2.name, 'rb') as f_in, gzip.open(args.fastq2_result, 'wb') as f_out:
-                    shutil.copyfileobj(f_in, f_out)
+def uncompress_file(input_path: str, output_path: str) -> None:
+    with open(input_path, 'rb') as compressed_file, \
+         open(output_path, 'w+b') as ret:
+        with gzip.GzipFile(fileobj=compressed_file, mode='rb') as gzip_file:
+            shutil.copyfileobj(gzip_file, ret)

-        logger.info('Done.')

+def get_transitive(graph: Dict[str, str], key: str) -> str:
+    if key in graph:
+        return get_transitive(graph, graph[key])
+    else:
+        return key
+
+
+def merge_fastqs(trimplan: Set[Tuple[str, ...]],
+                 mergeplan: Dict[str, List[str]],
+                 zipped: Set[str] = set(),
+                 bad_cycles: Dict[str, str] = {},
+                 ) -> None:
+
+    inputs = [value for values in mergeplan.values() for value in values]
+    outputs = list(mergeplan.keys())
+    name_mapping: Dict[str, str] = {}
+    temporary: List[IO] = []
+
+    for output_path in outputs:
+        os.makedirs(os.path.dirname(output_path), exist_ok=True)
+
+    for input_path in inputs:
+        if input_path in zipped:
+            logger.info('Uncompressing %s.', input_path)
+            uncompressed = tempfile.NamedTemporaryFile(mode="w+b")
+            uncompressed_path = uncompressed.name
+            uncompress_file(input_path, uncompressed_path)
+            temporary.append(uncompressed)
+            name_mapping[input_path] = uncompressed_path
+
+    for to_trim in trimplan:
+        assert len(to_trim) > 0

-def main(argv) -> int:
-    parser = ArgumentParser(
-        description="Combine and filter the FASTQ files from two samples into a single output file.",
-        formatter_class=ArgumentDefaultsHelpFormatter)
-
-    parser.add_argument(
-        "fastq1_a",
-        help="FASTQ file containing forward reads of sample A",
-    )
-    parser.add_argument(
-        "fastq2_a",
-        help="FASTQ file containing reverse reads of sample A",
-    )
-    parser.add_argument(
-        "fastq1_b",
-        help="FASTQ file containing forward reads of sample B",
-    )
-    parser.add_argument(
-        "fastq2_b",
-        help="FASTQ file containing reverse reads of sample B",
-    )
-    parser.add_argument(
-        "fastq1_result",
-        help="Resulting combined FASTQ file containing forward reads",
-    )
-    parser.add_argument(
-        "fastq2_result",
-        help="Resulting combined FASTQ file containing reverse reads",
-    )
-    parser.add_argument(
-        "--bad_cycles_a_csv",
-        help="list of tiles and cycles rejected for poor quality in sample A",
-    )
-    parser.add_argument(
-        "--bad_cycles_b_csv",
-        help="list of tiles and cycles rejected for poor quality in sample B",
-    )
-    parser.add_argument(
-        '--unzipped', '-u',
-        action='store_true',
-        help='Set if the original FASTQ files are not compressed',
-    )
+        trim_outputs: List[str] = []
+        trim_inputs: List[str] = []

+        all_bad_cycles_paths = set(bad_cycles[path] for path in to_trim if path in bad_cycles)
+        if len(all_bad_cycles_paths) == 0:
+            bad_cycles_path = None
+        elif len(all_bad_cycles_paths) == 1:
+            bad_cycles_path = next(iter(all_bad_cycles_paths))
+        else:
+            raise AmbiguousBadCycles(f"Ambiguous bad_cycles for {to_trim}: {all_bad_cycles_paths}.")
+
+        for path in to_trim:
+            path = get_transitive(name_mapping, path)
+            tmp = tempfile.NamedTemporaryFile()
+            trim_inputs.append(path)
+            trim_outputs.append(tmp.name)
+            temporary.append(tmp)
+            name_mapping[path] = tmp.name
+
+        logger.info('Trimming samples %s.', ','.join(map(repr, to_trim)))
+        trim(trim_inputs, bad_cycles_path, trim_outputs, use_gzip=False)
+
+    for output_path in mergeplan:
+        input_files = mergeplan[output_path]
+        logger.info('Merging results %s to %s.', ','.join(map(repr, input_files)), output_path)
+        input_files = [get_transitive(name_mapping, path) for path in input_files]
+        tmp = tempfile.NamedTemporaryFile()
+        temporary.append(tmp)
+        name_mapping[output_path] = tmp.name
+        output_path = tmp.name
+        concatenate_files(input_files, output_path)
+
+    for output_path in mergeplan:
+        concatenated = name_mapping[output_path]
+        if output_path in zipped:
+            logger.info('Compressing output %s.', output_path)
+            compress_file(concatenated, output_path)
+        else:
+            os.rename(concatenated, output_path)
+
+    for toclose in temporary:
+        try: toclose.close()
+        except: pass
+
+    logger.info('Done.')
+
+
+def main(argv: List[str]) -> int:
+    parser = argparse.ArgumentParser(description="Combine and filter the FASTQ files from multiple samples into single output files.")
+    parser.add_argument("trimplan", type=argparse.FileType('rt'), help="A CSV file containing the lists of files to be trimmed.")
+    parser.add_argument("mergeplan", type=argparse.FileType('rt'), help="A CSV file containing merge plan.")
+    parser.add_argument("--zipfile", type=argparse.FileType('rt'), help="A CSV file containing a list of files that are compressed or need to be compressed.")
     args = parser.parse_args(argv)
-    merge_fastqs(args)
+
+    parse_inputs_and_merge_fastqs(args.trimplan, args.mergeplan, args.zipfile)
     return 0

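
For reference, here is a minimal sketch of how the new plan-file interface could be driven end to end. The column names come from the readers above (input/output for the merge plan, r1/r2 plus an optional bad_cycles column for the trim plan, and file for the --zipfile list); the sample paths and the micall.core.merge_fastqs module path are illustrative assumptions.

    # Hypothetical end-to-end driver; all paths below are made-up examples.
    import csv

    from micall.core.merge_fastqs import main  # assumed module path

    # Trim plan: r1/r2 name the paired FASTQs that are trimmed together;
    # bad_cycles may be left empty when no cycles were rejected.
    with open('trimplan.csv', 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['r1', 'r2', 'bad_cycles'])
        writer.writeheader()
        writer.writerow({'r1': 'a_R1.fastq', 'r2': 'a_R2.fastq',
                         'bad_cycles': 'bad_cycles_a.csv'})
        writer.writerow({'r1': 'b_R1.fastq', 'r2': 'b_R2.fastq',
                         'bad_cycles': ''})

    # Merge plan: each row maps one input FASTQ to the output it merges into.
    with open('mergeplan.csv', 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['input', 'output'])
        writer.writeheader()
        for sample in ('a', 'b'):
            writer.writerow({'input': f'{sample}_R1.fastq', 'output': 'out/merged_R1.fastq'})
            writer.writerow({'input': f'{sample}_R2.fastq', 'output': 'out/merged_R2.fastq'})

    main(['trimplan.csv', 'mergeplan.csv'])

Inputs listed in the optional --zipfile CSV are uncompressed before trimming, and outputs listed there are compressed after merging, matching the two zipped checks in merge_fastqs.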
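The pipeline's bookkeeping hinges on name_mapping and get_transitive: each stage (uncompress, trim, concatenate) records an original-to-temporary entry, and later stages chase the chain to find the newest version of a file. A small self-contained sketch of that chasing, with made-up temporary names:

    from typing import Dict

    def get_transitive(graph: Dict[str, str], key: str) -> str:
        # Same helper as in the diff above: follow the mapping until the chain ends.
        if key in graph:
            return get_transitive(graph, graph[key])
        return key

    name_mapping = {
        'a_R1.fastq': '/tmp/uncompressed0',     # recorded by the uncompress stage
        '/tmp/uncompressed0': '/tmp/trimmed0',  # recorded by the trim stage
    }

    assert get_transitive(name_mapping, 'a_R1.fastq') == '/tmp/trimmed0'
    assert get_transitive(name_mapping, 'unmapped.fastq') == 'unmapped.fastq'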
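Callers that already hold the plans in memory can skip the CSV layer and call merge_fastqs directly; its arguments mirror what parse_inputs_and_merge_fastqs builds. A sketch with hypothetical paths (module path assumed again):

    from micall.core.merge_fastqs import merge_fastqs  # assumed module path

    # Pairs that are trimmed together, and where each input ends up.
    trimplan = {('a_R1.fastq', 'a_R2.fastq'),
                ('b_R1.fastq', 'b_R2.fastq')}
    mergeplan = {'out/merged_R1.fastq': ['a_R1.fastq', 'b_R1.fastq'],
                 'out/merged_R2.fastq': ['a_R2.fastq', 'b_R2.fastq']}

    merge_fastqs(trimplan,
                 mergeplan,
                 zipped={'out/merged_R1.fastq', 'out/merged_R2.fastq'},  # gzip the outputs
                 bad_cycles={'a_R1.fastq': 'bad_cycles_a.csv',
                             'a_R2.fastq': 'bad_cycles_a.csv'})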