-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathfastq_num_reads.py
executable file
·77 lines (66 loc) · 2.3 KB
/
fastq_num_reads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env python
"""Determine number of reads in (gzipped) fastq files
"""
import sys
import os
import gzip
import io
def warn(msg):
"""prints warning message to stderr (appends newline)"""
sys.stderr.write("WARN: {}\n".format(msg))
def fastqdata_from_fastq(fastq):
"""Check whether we have a data file produced with fastqc for this
fastq file
"""
fastqc_dir = fastq
for ext in [".gz", ".fastq", ".fq"]:
if fastqc_dir.endswith(ext):
fastqc_dir = fastqc_dir[:-len(ext)]
fastqc_dir += "_fastqc"
fastqc_data = os.path.join(fastqc_dir, "fastqc_data.txt")
if os.path.exists(fastqc_data):
if os.path.getctime(fastqc_data) < os.path.getctime(fastq):
warn("Using fastqc file that's older than fastq file: {}.".format(
fastqc_data))
return fastqc_data
else:
return None
def fastq_num_reads(fastq):
"""Determine number of reads in fastq file"""
fastqc_data = fastqdata_from_fastq(fastq)
if fastqc_data:
#sys.stderr.write("DEBUG: is fastqc\n")
with open(fastqc_data) as fh:
for line in fh:
if line.startswith("Total Sequences"):
return int(line.split()[-1])
raise ValueError(fastqc_data)
else:
#sys.stderr.write("DEBUG: count fastq.gz\n")
if fastq.endswith(".gz"):
# gzip speedup with io.BufferedReader
# see http://aripollak.com/pythongzipbenchmarks/
# and http://www.reddit.com/r/Python/comments/2olhrf/fast_gzip_in_python/
fh = io.BufferedReader(gzip.open(fastq))
else:
#sys.stderr.write("DEBUG: count fastq\n")
fh = open(fastq)
num_lines = 0
for line in fh:
num_lines += 1
if num_lines % 4 == 1:
# bytes or string?
if isinstance(line, bytes):
c = line.decode()[0]
else:
c = line[0]
assert c == '@'
fh.close()
assert num_lines % 4 == 0
return num_lines/4
if __name__ == "__main__":
for f in sys.argv[1:]:
if not os.path.exists(f):
warn("non-existant file {}".format(f))
continue
print("{}\t{}".format(f, fastq_num_reads(f)))