Skip to content

Commit

Permalink
Update to match hello-modules
Browse files Browse the repository at this point in the history
This should make the transition to a deployed pipeline more obvious to an observer
  • Loading branch information
adamrtalbot committed Nov 5, 2024
1 parent 3f6792e commit cd9fbd7
Show file tree
Hide file tree
Showing 7 changed files with 215 additions and 158 deletions.
167 changes: 11 additions & 156 deletions main.nf
Original file line number Diff line number Diff line change
@@ -1,156 +1,11 @@
#!/usr/bin/env nextflow

/*
* Pipeline parameters
*/

// Primary input (file of input files, one per line)
params.reads_bam = "${projectDir}/data/sample_bams.txt"

// Accessory files
params.reference = "${projectDir}/data/ref/ref.fasta"
params.reference_index = "${projectDir}/data/ref/ref.fasta.fai"
params.reference_dict = "${projectDir}/data/ref/ref.dict"
params.intervals = "${projectDir}/data/ref/intervals.bed"

// Base name for final output file
params.cohort_name = "family_trio"

params.outdir = "results"

/*
* Generate BAM index file
*/
process SAMTOOLS_INDEX {

container 'community.wave.seqera.io/library/samtools:1.20--b5dfbd93de237464'

publishDir "$params.outdir", mode: 'symlink'

input:
path input_bam

output:
tuple path(input_bam), path("${input_bam}.bai")

"""
samtools index '$input_bam'
"""
}

/*
* Call variants with GATK HaplotypeCaller
*/
process GATK_HAPLOTYPECALLER {

container "community.wave.seqera.io/library/gatk4:4.5.0.0--730ee8817e436867"

publishDir "$params.outdir", mode: 'symlink'

input:
tuple path(input_bam), path(input_bam_index)
path ref_fasta
path ref_index
path ref_dict
path interval_list

output:
path "${input_bam}.g.vcf"
path "${input_bam}.g.vcf.idx"

"""
gatk HaplotypeCaller \
-R ${ref_fasta} \
-I ${input_bam} \
-O ${input_bam}.g.vcf \
-L ${interval_list} \
-ERC GVCF
"""
}

/*
* Combine GVCFs into GenomicsDB datastore and run joint genotyping to produce cohort-level calls
*/
process GATK_JOINTGENOTYPING {

container "community.wave.seqera.io/library/gatk4:4.5.0.0--730ee8817e436867"

publishDir "$params.outdir", mode: 'symlink'

input:
path all_gvcfs
path all_idxs
path interval_list
val cohort_name
path ref_fasta
path ref_index
path ref_dict

output:
path "${cohort_name}.joint.vcf"
path "${cohort_name}.joint.vcf.idx"

script:
def gvcfs_line = all_gvcfs.collect { gvcf -> "-V ${gvcf}" }.join(' ')
"""
gatk GenomicsDBImport \
${gvcfs_line} \
-L ${interval_list} \
--genomicsdb-workspace-path ${cohort_name}_gdb
gatk GenotypeGVCFs \
-R ${ref_fasta} \
-V gendb://${cohort_name}_gdb \
-L ${interval_list} \
-O ${cohort_name}.joint.vcf
"""
}

/*
* Generate statistics with bcftools stats
*/
process BCFTOOLS_STATS {

container 'community.wave.seqera.io/library/bcftools:1.20--a7f1d9cdda56cc93'
conda "bioconda::bcftools=1.20"

input:
path vcf_file

output:
path "${vcf_file}.stats"

"""
bcftools stats ${vcf_file} > ${vcf_file}.stats
"""
}

/*
* Generate MultiQC report
*/
process MULTIQC {

container 'community.wave.seqera.io/library/multiqc:1.24.1--789bc3917c8666da'
conda "bioconda::multiqc=1.24.1"

publishDir "$params.outdir", mode: 'copy'

input:
path input_files
val cohort_name

output:
path "${params.cohort_name}_multiqc_report.html"

"""
multiqc \\
--force \\
-o . \\
-n ${cohort_name}_multiqc_report.html \\
--clean-up \\
${input_files}
"""
}
// Include modules
include { SAMTOOLS_INDEX } from './modules/local/samtools/index/main.nf'
include { GATK_HAPLOTYPECALLER } from './modules/local/gatk/haplotypecaller/main.nf'
include { GATK_JOINTGENOTYPING } from './modules/local/gatk/jointgenotyping/main.nf'
include { BCFTOOLS_STATS } from './modules/local/bcftools/stats/main.nf'
include { MULTIQC } from './modules/local/multiqc/multiqc/main.nf'

workflow {

Expand All @@ -168,16 +23,16 @@ workflow {

// Call variants from the indexed BAM file
GATK_HAPLOTYPECALLER(
SAMTOOLS_INDEX.out,
SAMTOOLS_INDEX.out.bam_bai,
ref_file,
ref_index_file,
ref_dict_file,
intervals_file
)

// Collect variant calling outputs across samples
all_gvcfs_ch = GATK_HAPLOTYPECALLER.out[0].collect()
all_idxs_ch = GATK_HAPLOTYPECALLER.out[1].collect()
all_gvcfs_ch = GATK_HAPLOTYPECALLER.out.vcf.collect()
all_idxs_ch = GATK_HAPLOTYPECALLER.out.idx.collect()

// Combine GVCFs into a GenomicsDB data store and apply joint genotyping
GATK_JOINTGENOTYPING(
Expand All @@ -191,11 +46,11 @@ workflow {
)

BCFTOOLS_STATS(
GATK_JOINTGENOTYPING.out[0]
GATK_JOINTGENOTYPING.out.vcf
)

MULTIQC(
BCFTOOLS_STATS.out.collect(),
BCFTOOLS_STATS.out.stats.collect(),
params.cohort_name
)
}
19 changes: 19 additions & 0 deletions modules/local/bcftools/stats/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
/*
* Generate statistics with bcftools stats
*/
process BCFTOOLS_STATS {

container 'community.wave.seqera.io/library/bcftools:1.20--a7f1d9cdda56cc93'
conda "bioconda::bcftools=1.20"

input:
path vcf_file

output:
path "${vcf_file}.stats", emit: stats

script:
"""
bcftools stats ${vcf_file} > ${vcf_file}.stats
"""
}
33 changes: 33 additions & 0 deletions modules/local/gatk/haplotypecaller/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/usr/bin/env nextflow

/*
* Call variants with GATK HaplotypeCaller
*/
process GATK_HAPLOTYPECALLER {

container "community.wave.seqera.io/library/gatk4:4.5.0.0--730ee8817e436867"
conda "bioconda::gatk4=4.5.0.0"

publishDir '${params.outdir}', mode: 'symlink'

input:
tuple path(input_bam), path(input_bam_index)
path ref_fasta
path ref_index
path ref_dict
path interval_list

output:
path "${input_bam}.g.vcf" , emit: vcf
path "${input_bam}.g.vcf.idx" , emit: idx

script:
"""
gatk HaplotypeCaller \
-R ${ref_fasta} \
-I ${input_bam} \
-O ${input_bam}.g.vcf \
-L ${interval_list} \
-ERC GVCF
"""
}
38 changes: 38 additions & 0 deletions modules/local/gatk/jointgenotyping/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/*
* Combine GVCFs into GenomicsDB datastore and run joint genotyping to produce cohort-level calls
*/
process GATK_JOINTGENOTYPING {

container "community.wave.seqera.io/library/gatk4:4.5.0.0--730ee8817e436867"
conda "bioconda::gatk4=4.5.0.0"

publishDir '${params.outdir}', mode: 'symlink'

input:
path all_gvcfs
path all_idxs
path interval_list
val cohort_name
path ref_fasta
path ref_index
path ref_dict

output:
path "${cohort_name}.joint.vcf" , emit: vcf
path "${cohort_name}.joint.vcf.idx", emit: idx

script:
def gvcfs_line = all_gvcfs.collect { gvcf -> "-V ${gvcf}" }.join(' ')
"""
gatk GenomicsDBImport \
${gvcfs_line} \
-L ${interval_list} \
--genomicsdb-workspace-path ${cohort_name}_gdb
gatk GenotypeGVCFs \
-R ${ref_fasta} \
-V gendb://${cohort_name}_gdb \
-L ${interval_list} \
-O ${cohort_name}.joint.vcf
"""
}
27 changes: 27 additions & 0 deletions modules/local/multiqc/multiqc/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
* Generate MultiQC report
*/
process MULTIQC {

container 'community.wave.seqera.io/library/multiqc:1.24.1--789bc3917c8666da'
conda "bioconda::multiqc=1.24.1"

publishDir "$params.outdir", mode: 'copy'

input:
path input_files
val cohort_name

output:
path "${cohort_name}_multiqc_report.html", emit: report

script:
"""
multiqc \\
--force \\
-o . \\
-n ${cohort_name}_multiqc_report.html \\
--clean-up \\
${input_files}
"""
}
21 changes: 21 additions & 0 deletions modules/local/samtools/index/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/*
* Generate BAM index file
*/
process SAMTOOLS_INDEX {

container 'community.wave.seqera.io/library/samtools:1.20--b5dfbd93de237464'
conda "bioconda::samtools=1.20"

publishDir '${params.outdir}', mode: 'symlink'

input:
path input_bam

output:
tuple path(input_bam), path("${input_bam}.bai"), emit: bam_bai

script:
"""
samtools index '$input_bam'
"""
}
Loading

0 comments on commit cd9fbd7

Please sign in to comment.