-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathQuantCirc.nf
261 lines (194 loc) · 9.24 KB
/
QuantCirc.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
#!/usr/bin/env nextflow
/*
========================================================================================
QuantCirc
========================================================================================
* QuantCirc was developed by Dr. Qi Zhao from Sun Yat-sen University Cancer Center.
* it implemented a workform circRNA remapping step to quantification with a pre-build count function
* in circletools.jar, which uses a bsj remapping bamfile and genome mapped bamfile to filter the
* false positive circRNA reads.
* Homepage / Documentation
https://github.com/likelet/circpipe
*/
/*
*
*
* Authors:
* Qi Zhao <[email protected]>: design and implement the tools.
* requirement
* hisat2
* bedtools
*/
def helpMessage() {
log.info"""
=========================================
QuantCirc v${workflow.manifest.version}
=========================================
Usage:
The typical command for running the QuantCirc is as follows:
nextflow path/to/circPipe/QuantCirc.nf --reads "path/to/*{1,2}.fq.gz" --bedfile circleRNA.bed --genomefile genome.fa --hisat2_index hisat2_index
Mandatory arguments:
--reads Path to input data (must be surrounded with quotes)
--hisat2_index Hisat2 index of whole genome sequence file name with `*.ht2` suffixed. like "GRCh38" when your files are "GRCh38.*.ht2"
--genomefile genome reference file with fasta format, such as genome.fa .
Options:
If not set, the pipeline will create the index itself.
--singleEnd Specify that the reads are single ended
""".stripIndent()
}
// Show help emssage
if (params.help){
helpMessage()
exit 0
}
/*
========================================================================================
Input file parsing
========================================================================================
*/
// set outdir
outdir =params.outdir? params.outdir : "./"
// input data set
genomefile = file(params.genomefile) //the genomefile
if( !genomefile.exists() ) exit 1, LikeletUtils.print_red("Missing genome file: ${genomefile}")
hisat2_index = Channel.fromPath("${params.hisat2_index}*.ht2")
.ifEmpty { exit 1, "HISAT2 index not found: ${params.hisat2_index}" }
// path for placing the bedfile
bedfile= file(params.bedfile) //the genomefile
if( !bedfile.exists() ) exit 1, LikeletUtils.print_red("Missing circRNA bedfile ${params.bedfile}")
// fastqfile
(Fastpfiles_recount,Fastpfiles_hisat)=Channel.fromFilePairs( params.reads, size: params.singleEnd ? 1 : 2 )
.ifEmpty { error "Cannot find any reads matching: ${params.reads}" }
.into(2)
/*
========================================================================================
showing the process and files
========================================================================================
*/
log.info "INFO "+LikeletUtils.print_cyan("""
========================================================================
# ____ _ _____ _
# / __ \\ | | / ____|(_)
# | | | | _ _ __ _ _ __ | |_ | | _ _ __ ___
# | | | || | | | / _` || '_ \\ | __|| | | || '__|/ __
# | |__| || |_| || (_| || | | || |_ | |____ | || | | (__
# \\___\\_\\ \\__,_| \\_,_ ||_| |_| \\__| \\_____||_||_|
#
========================================================================
""")
.stripIndent()
log.info "INFO "+LikeletUtils.print_purple("========You are running QuantCirc with the following parameters===============")
log.info "INFO "+LikeletUtils.print_purple("Checking parameters ...")
log.info "\n"
log.info "INFO "+LikeletUtils.print_yellow("==================================Input files selected==========================")
log.info "INFO "+LikeletUtils.print_yellow("Reads : ") + LikeletUtils.print_green(params.reads)
log.info "INFO "+LikeletUtils.print_yellow("hisatIndex file : ") + LikeletUtils.print_green(params.hisat2_index)
log.info "INFO "+LikeletUtils.print_yellow("Genome file : ") + LikeletUtils.print_green(params.genomefile)
log.info "\n"
log.info "INFO "+LikeletUtils.print_yellow("=====================================Reads types================================")
log.info "INFO "+LikeletUtils.print_yellow("SingleEnd : ") + LikeletUtils.print_green(params.singleEnd)
log.info "\n"
log.info "INFO "+LikeletUtils.print_yellow("==================================Output files directory========================")
log.info "INFO "+LikeletUtils.print_yellow("Output directory : ") + LikeletUtils.print_green(outdir)
log.info "\n"
/*
========================================================================================
after running the tools
Recount for merge
========================================================================================
*/
process getPsudoCircSequenceAndBuildHisatIndex {
storeDir "${params.outdir}/BSJ_Index"
input:
file bedfile
file genomefile
output:
file "*.ht2" into Candidate_circRNA_index
script:
"""
# extract bed file for obtaining seqeuence
sh ${baseDir}/bin/ProcessBedforGettingSequence.sh ${bedfile} temp.sort.bed temp.start.bed temp.end.bed
bedtools getfasta -name -fi ${genomefile} -s -bed temp.start.bed > temp.start.fa
bedtools getfasta -name -fi ${genomefile} -s -bed temp.end.bed > temp.end.fa
# circRNA <= 400 bp
bedtools getfasta -name -fi ${genomefile} -s -bed temp.sort.bed > temp.sort.fa
# merge and get combined fasta formatted psudoCirc sequences
sh ${baseDir}/bin/MergeBSJsequence.sh temp.sort.fa temp.start.fa temp.end.fa tmp_candidate.circular_BSJ_flank.fa
hisat2-build -p ${task.cpus} tmp_candidate.circular_BSJ_flank.fa candidate_circRNA_BSJ_flank
rm temp*
rm tmp*
"""
}
process Recount_generate_BSJ_Bamfile {
tag "$tag_id"
maxForks 3
input:
file index from Candidate_circRNA_index.collect()
tuple val(sampleID),file(query_file) from Fastpfiles_recount
output:
tuple val(sampleID),file("${sampleID}_denovo.bam") into BSJ_mapping_bamfile
file "fileforwaiting.txt" into Wait_for_hisat2
script:
tag_id=sampleID
if(params.singleEnd){
"""
hisat2 -p 8 -t -k 1 -x candidate_circRNA_BSJ_flank -U ${query_file} | samtools view -bS -q 10 - > ${sampleID}_denovo.bam
touch fileforwaiting.txt
"""
}else{
"""
hisat2 -p 8 -t -k 1 -x candidate_circRNA_BSJ_flank -1 ${query_file[0]} -2 ${query_file[1]} | samtools view -bS -q 10 - > ${sampleID}_denovo.bam
touch fileforwaiting.txt
"""
}
}
process Recount_generate_genome_Bamfile {
tag "$sampleID"
maxForks 3
input:
file index from hisat2_index.collect()
tuple val(sampleID), file(query_file) from Fastpfiles_hisat
file filewait from Wait_for_hisat2
output:
tuple val(sampleID),file("${sampleID}.bam") into Genome_remapping_bamfile
script:
index_base = index[0].toString() - ~/.\d.ht2/
if(params.singleEnd){
"""
hisat2 -p 8 -t -k 1 -x ${index_base} -U ${query_file} | samtools view -bS -q 10 - > ${sampleID}.bam
"""
}else{
"""
hisat2 -p 8 -t -k 1 -x ${index_base} -1 ${query_file[0]} -2 ${query_file[1]} | samtools view -bS -q 10 - > ${sampleID}.bam
"""
}
}
BSJ_mapping_bamfile.combine(Genome_remapping_bamfile, by : 0 ).set{RecountBamfiles}
process Recount_estimate_step{
publishDir "${params.outdir}/QuantCirc", pattern: "*.count", mode: 'link', overwrite: true
input:
tuple val(sampleID), file(bsjBamfile),file(genomeBamfile) from RecountBamfiles
output:
tuple val(sampleID),file("${sampleID}.count") into Single_sample_recount
script:
"""
java -jar ${baseDir}/bin/circpipetools.jar -recount -bsjbam ${bsjBamfile} -allBam ${genomeBamfile} -out ${sampleID}.count
"""
}
// test
process Recount_results_combine{
publishDir "${outdir}/Combination_Matrix", mode: 'copy', pattern: "*.matrix", overwrite: true
input:
file (query_file) from Single_sample_recount.collect()
output:
file ("multitools.exp.matrix") into (Matrix_for_circos, Plot_merge, PlotMergeCor)
when:
run_multi_tools
script:
"""
java -jar ${baseDir}/bin/circpipetools.jar -MM -dir ./ -suffix .count -out Quant_Circle.matrix
"""
}
workflow.onComplete {
log.info "INFO "+LikeletUtils.print_purple("Analysis completed ! Your result are stored in ") + LikeletUtils.print_green("${outdir}/Combination_Matrix")
}