Skip to content

Commit 6023c2e

Browse files
committed
add bcl2fq subcommand
1 parent aa0383d commit 6023c2e

File tree

6 files changed

+90
-74
lines changed

6 files changed

+90
-74
lines changed

README.md

+19-33
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
$ cat README.md
2+
13
# fsplit
24

35
`fsplit`是用于根据`barcode`信息从`BCL`或`fastq`混合数据中拆分样本数据的软件。
@@ -74,40 +76,18 @@ fsplit index -i test.fastq.gz
7476

7577
#### 参数说明
7678

77-
使用`fsplit split`命令,拆分`bcl`数据,相关参数如下:
78-
79-
| 参数 | 描述 |
80-
| ------------- | ------------------------------------------------------------ |
81-
| -i/--input | 输入的BCL数据flowcell目录 |
82-
| -b/--barcode | barcode信息文件,两列或三列,第一列为样本名,第二列和第三列为barcode信息 |
83-
| -m/--mismatch | barcode拆分时运行的错配碱基数,默认0,不允许错配, |
84-
| -t/--threads | 运行使用的cpu核数 |
85-
| -o/--output | 结果输出目录,不存在会自动创建 |
86-
| --bcl2fq | 指定bcl2fastq软件路径,不指定会自动从$PATH或sys.prefix中查找 |
87-
88-
89-
90-
#### single-end index barcode文件实例
91-
92-
```
93-
S1 AAAAA
94-
S2 TTTTT
95-
S3 GGGGG
96-
S4 CCCCC
97-
```
98-
99-
100-
101-
#### paired-end index barcode文件实例
102-
103-
```
104-
S1 AAAAA GAAAAAA
105-
S2 TTTTT TGGGGGG
106-
S3 GGGGG CTTTTTT
107-
S4 CCCCC ACCCCCC
108-
```
109-
79+
使用`fsplit bcl2fq`命令,拆分`bcl`数据,相关参数如下:
11080

81+
| 参数 | 描述 |
82+
| ---------------- | ------------------------------------------------------------ |
83+
| -i/--input | 输入的BCL数据flowcell目录 |
84+
| -s/--sample | sample sheet信息文件,两列或三列,空白隔开,第一列为样本名,第二列为index1(i7)序列,第三列为index2(i5)序列 |
85+
| -m/--mismatch | barcode拆分时允许的错配碱基数,默认1 |
86+
| -t/--threads | 运行使用的cpu核数 |
87+
| -o/--output | 结果输出目录,不存在会自动创建 |
88+
| -rc1/--rc-index1 | 将index1(i7)序列反向互补 |
89+
| -rc2/--rc-index2 | 将index2(i5)序列反向互补 |
90+
| --bcl2fq | 指定bcl2fastq软件路径,不指定会自动从$PATH或sys.prefix中查找 |
11191

11292

11393

@@ -148,3 +128,9 @@ S4 CCCCC ACCCCCC
148128

149129
+ 单线程读取,子进程解压,处理后序列直接写入文件,取消建立索引步骤,取消多进程处理,取消文件互斥锁
150130
+ `split`步骤同时添加`golang`实现[gsplit](src/gsplit.go).
131+
132+
133+
134+
#### version 1.0.5
135+
136+
+ 新增bcl2fq子命令封装bcl2fastq软件,用于bcl数据拆分

src/bcl.py

+20-19
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,20 @@
44
import logging
55
import subprocess
66

7-
from .utils import which
7+
from .utils import *
88

99

1010
class BCL(object):
11-
def __init__(self, fcdir, outdir, bcfile, cpu=60, bcl2fastq="", mis=1):
11+
def __init__(self, fcdir, outdir, bcfile, cpu=60, bcl2fastq="", mis=1, rc_i7=False, rc_i5=False, print_cmd=False):
1212
self.fcdir = fcdir
1313
self.outdir = outdir
1414
self.bcfile = bcfile
1515
self.mis = str(mis)
1616
self.index = 1
1717
self.nproc = str(cpu)
18+
self.rc_i7 = rc_i7
19+
self.rc_i5 = rc_i5
20+
self.print_cmd = print_cmd
1821
self.bcl2fastq = bcl2fastq or which("bcl2fastq")
1922
self.samplesheet = os.path.join(self.outdir, "sample-sheet.csv")
2023

@@ -24,24 +27,28 @@ def create_samplesheet(self):
2427
for line in fi:
2528
if not line.strip() or line.startswith("#"):
2629
continue
27-
line = line.split()
30+
line = line.split()[:3]
2831
if len(line) == 2:
2932
self.index = 1
3033
idx.append((line))
3134
elif len(line) == 3:
3235
self.index = 2
3336
idx.append((line))
34-
else:
35-
raise IOError("illegal barcode file %s" % self.bcfile)
37+
# else:
38+
# raise IOError("illegal barcode file %s" % self.bcfile)
3639
if not os.path.isdir(self.outdir):
3740
os.makedirs(self.outdir)
3841
with open(self.samplesheet, "w") as fo:
3942
if self.index == 1:
40-
fo.write("[Data]\nSample_ID,Sample_Name,index\n")
43+
fo.write("[Data]\nSample_ID,index\n")
4144
else:
42-
fo.write("[Data]\nSample_ID,Sample_Name,index,index2\n")
45+
fo.write("[Data]\nSample_ID,index,index2\n")
4346
for line in idx:
44-
fo.write(",".join([line[0]] + line) + "\n")
47+
if self.rc_i7:
48+
line[1] = rc_seq(line[1])
49+
if self.index == 2 and self.rc_i5:
50+
line[2] = rc_seq(line[2])
51+
fo.write(",".join(line) + "\n")
4552

4653
def make_bcl_cmd(self):
4754
cmd = [self.bcl2fastq, "-o", self.outdir, "-R", self.fcdir,
@@ -52,19 +59,13 @@ def make_bcl_cmd(self):
5259
"--sample-sheet", self.samplesheet]
5360
return " ".join(cmd)
5461

55-
@staticmethod
56-
def call(cmd, run=True, verbose=False):
62+
def call(self, cmd, run=True):
63+
if self.print_cmd:
64+
self.logs.info(cmd)
5765
if not run:
58-
if verbose:
59-
print(cmd)
6066
return
61-
if verbose:
62-
print(cmd)
63-
subprocess.check_call(cmd, shell=True, stdout=sys.stdout,
64-
stderr=sys.stderr)
65-
else:
66-
with open(os.devnull, "w") as fo:
67-
subprocess.check_call(cmd, shell=True, stdout=fo, stderr=fo)
67+
with open(os.devnull, "w") as fo:
68+
subprocess.check_call(cmd, shell=True, stdout=fo, stderr=fo)
6869

6970
def stats(self, j):
7071
with open(j) as fi:

src/main.py

+14-8
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import os
55
import multiprocessing as mp
66

7-
87
from collections import Counter
98

109
from .src import *
@@ -110,14 +109,21 @@ def main():
110109
FastqIndex.createindex(args.input)
111110
return
112111
outdir = os.path.abspath(args.output)
113-
if os.path.isdir(infq):
112+
if args.mode == "bcl2fq":
114113
if os.path.isfile(os.path.join(infq, "RTAComplete.txt")):
115-
bcl2fastq = args.bcl2fq or which("bcl2fastq") or os.path.join(
116-
sys.prefix, "bin", "bcl2fastq")
117-
if not os.path.isfile(bcl2fastq):
118-
sys.exit("bcl2fastq not found, exists")
119-
bcl = BCL(infq, outdir, args.barcode, args.threads,
120-
bcl2fastq=bcl2fastq, mis=args.mismatch)
114+
bcl2fastq = args.bcl2fq or which("bcl2fastq")
115+
if not (bcl2fastq and os.path.isfile(bcl2fastq)):
116+
sys.exit("bcl2fastq not found, exit")
117+
kw = {
118+
"cpu": args.threads,
119+
"bcl2fastq": bcl2fastq,
120+
"mis": args.mismatch,
121+
"bcl2fastq": bcl2fastq,
122+
"rc_i7": args.rc_index1,
123+
"rc_i5": args.rc_index2,
124+
"print_cmd": False,
125+
}
126+
bcl = BCL(infq, outdir, args.sample, **kw)
121127
bcl.run()
122128
logs.info("Success")
123129
js = os.path.join(outdir, "Stats/Stats.json")

src/src.py

+19-7
Original file line numberDiff line numberDiff line change
@@ -115,33 +115,45 @@ def logs(self):
115115

116116
def parseArg():
117117
parser = argparse.ArgumentParser(
118-
description="split a mix fastq by barcode index.",)
118+
description="split a mix fastq or BCL by barcode index.",)
119119
parser.add_argument("-v", '--version',
120120
action='version', version="v" + __version__)
121121
parent_parser = argparse.ArgumentParser(add_help=False)
122122
general_parser = parent_parser.add_argument_group("common options")
123123
general_parser.add_argument("-i", "--input", type=str, help="input fastq file or BCL flowcell directory, required",
124124
required=True, metavar="<str>")
125125
subparsers = parser.add_subparsers(
126-
title="commands", dest="mode", help="sub-command help")
126+
metavar="command", dest="mode")
127127
parser_index = subparsers.add_parser(
128128
'index', parents=[parent_parser], help="index fastq file for reading in multi processing, can be instead by `samtools fqidx <fqfile>`.")
129129
parser_split = subparsers.add_parser(
130130
'split', parents=[parent_parser], help="split sequence data by barcode.")
131131
parser_split.add_argument("-b", "--barcode", type=str,
132-
help='barcode and sample file, required', required=True, metavar="<file>")
132+
help='sample and barcode sequence info, two columns like "sampleName barcodeSeq", required', required=True, metavar="<file>")
133133
parser_split.add_argument('-m', "--mismatch", help="mismatch allowed for barcode search, 0 by default",
134134
type=int, default=0, metavar="<int>")
135-
# parser_split.add_argument('-t', "--threads", help="threads core, 10 by default",
136-
# type=int, default=10, metavar="<int>")
137135
parser_split.add_argument('-o', "--output", help="output directory, required",
138136
type=str, required=True, metavar="<str>")
139137
parser_split.add_argument("-d", '--drup', action='store_true',
140138
help="drup barcode sequence in output if set", default=False)
141-
parser_split.add_argument('--bcl2fq', metavar="<str>",
142-
help="bcl2fastq path if necessary, if not set, auto detected")
143139
parser_split.add_argument("--output-gzip", action='store_true',
144140
help="gzip output fastq file, this will make your process slower", default=False)
141+
parser_bcl2fq = subparsers.add_parser(
142+
'bcl2fq', parents=[parent_parser], help="split flowcell bcl data to fastq.")
143+
parser_bcl2fq.add_argument('-t', "--threads", help="threads core, 10 by default",
144+
type=int, default=10, metavar="<int>")
145+
parser_bcl2fq.add_argument("-s", "--sample", type=str,
146+
help='sample index file, two or three columns like "sample index1(i7) index2(i5)", required', required=True, metavar="<file>")
147+
parser_bcl2fq.add_argument('-m', "--mismatch", help="mismatch allowed for barcode search, 1 by default",
148+
type=int, default=1, metavar="<int>")
149+
parser_bcl2fq.add_argument('-o', "--output", help="output directory, required",
150+
type=str, required=True, metavar="<str>")
151+
parser_bcl2fq.add_argument("-rc1", "--rc-index1", action="store_true", default=False,
152+
help='reverse complement index1(i7)')
153+
parser_bcl2fq.add_argument("-rc2", "--rc-index2", action="store_true", default=False,
154+
help='reverse complement index2(i5)')
155+
parser_bcl2fq.add_argument('--bcl2fq', metavar="<str>",
156+
help="bcl2fastq path if necessary, if not set, auto detected")
145157
return parser.parse_args()
146158

147159

src/utils.py

+17-6
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
import logging
99
import subprocess
1010

11+
PY3 = sys.version[0] == "3"
12+
1113

1214
class MultiZipHandle(object):
1315
def __init__(self, mode="rb", **infiles):
@@ -79,30 +81,39 @@ def clean(*path):
7981
shutil.rmtree(p)
8082

8183

84+
if PY3:
85+
TRANS = str.maketrans("ATGCRYMKSWHBVDN", "TACGYRKMSWDVBHN")
86+
else:
87+
import string
88+
TRANS = string.maketrans("ATGCRYMKSWHBVDN", "TACGYRKMSWDVBHN")
89+
90+
91+
def rc_seq(seq):
92+
return seq.upper().translate(TRANS)[::-1]
93+
94+
8295
def which(program, paths=None):
96+
ex = os.path.dirname(sys.executable)
8397
found_path = None
8498
fpath, fname = os.path.split(program)
85-
8699
if fpath:
87100
program = canonicalize(program)
88101
if is_exe(program):
89102
found_path = program
90-
91103
else:
104+
if is_exe(os.path.join(ex, program)):
105+
return os.path.join(ex, program)
92106
paths_to_search = []
93-
94107
if isinstance(paths, (tuple, list)):
95108
paths_to_search.extend(paths)
96109
else:
97110
env_paths = os.environ.get("PATH", "").split(os.pathsep)
98111
paths_to_search.extend(env_paths)
99-
100112
for path in paths_to_search:
101113
exe_file = os.path.join(canonicalize(path), program)
102114
if is_exe(exe_file):
103115
found_path = exe_file
104116
break
105-
106117
return found_path
107118

108119

@@ -131,6 +142,6 @@ def timeRecord(func):
131142
def wrapper(*args, **kwargs):
132143
s = time.time()
133144
value = func(*args, **kwargs)
134-
sys.stdout.write("\nTime elapse: %d sec.\n" % int(time.time() - s))
145+
sys.stdout.write("Time elapse: %d sec.\n" % int(time.time() - s))
135146
return value
136147
return wrapper

src/version.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.0.4"
1+
__version__ = "1.0.5"

0 commit comments

Comments
 (0)