From 9e6226381ea075d2b4065d7f8600721de91bd11b Mon Sep 17 00:00:00 2001 From: whalleyt Date: Wed, 6 Dec 2023 17:47:34 +0000 Subject: [PATCH 01/44] parse profiler params --- main.nf | 11 +++++++++-- nextflow.config | 2 ++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 837d3ef..140be73 100644 --- a/main.nf +++ b/main.nf @@ -49,8 +49,8 @@ Mandatory and conditional parameters: --bowtie2_index Directory containing Bowtie2 index (obtain from ftp://ftp.ccb.jhu.edu/pub/data/bowtie2_indexes/hg19_1kgmaj_bt2.zip This is the Langmead lab pre-built major-allele-SNP reference; see https://github.com/BenLangmead/bowtie-majref) --bowtie_index_name Name of the bowtie index, e.g. hg19_1kgmaj ---vcfmix Run VFCMIX "yes" or "no". Should be set to "no" for synthetic samples ---gnomonicus Run gnomon "yes" or "no" +--vcfmix Run VFCMIX "yes" or "no". Should be set to "no" for synthetic samples +--resistance_profiler Tool to profile resistance with. At the moment options are "tb-profiler" or "none" --amr_cat Path to the AMR catalogue (https://github.com/oxfordmmm/tuberculosis_amr_catalogues is at /tuberculosis_amr_catalogues in the vcfpredict container) --afanc_myco_db Path to the Afanc database used for speciation. Obtain from https://s3.climb.ac.uk/microbial-bioin-sp3/Mycobacteriaciae_DB_3.0.tar.gz @@ -86,6 +86,13 @@ nextflow run main.nf -profile docker --filetype bam --input_dir bam_dir --unmix_ } +resistance_profilers = ["tb-profiler", "none"] + + if(!resistance_profilers.contains(params.resistance_profiler)){ + exit 1, 'Invalid resistance profiler. Must be one of "tb-profiler" or "none" to skip.' + } + + // confirm that mandatory parameters have been set and that the conditional parameter, --pattern, has been used appropriately if ( params.input_dir == "" ) { exit 1, "error: --input_dir is mandatory (run with --help to see parameters)" diff --git a/nextflow.config b/nextflow.config index 21122da..27a0fcd 100644 --- a/nextflow.config +++ b/nextflow.config @@ -46,6 +46,8 @@ params { // run gnomonicus 'yes' or 'no' gnomonicus = 'yes' + + resistance_profiler = "tb-profiler" // path to AMR catalogue for gnomon // https://github.com/oxfordmmm/tuberculosis_amr_catalogues available at path /tuberculosis_amr_catalogues in container From 3948b1270f889746a869479218e7172b48aa00fb Mon Sep 17 00:00:00 2001 From: whalleyt Date: Thu, 7 Dec 2023 14:03:31 +0000 Subject: [PATCH 02/44] update tb-profiler in docker --- .github/workflows/build-push-quay.yml | 4 +- docker/Dockerfile.vcfpredict-0.9.8r1 | 78 +++++++++++++++++++++++++++ nextflow.config | 30 +++++------ 3 files changed, 96 insertions(+), 16 deletions(-) create mode 100644 docker/Dockerfile.vcfpredict-0.9.8r1 diff --git a/.github/workflows/build-push-quay.yml b/.github/workflows/build-push-quay.yml index 9043ae6..cef945d 100644 --- a/.github/workflows/build-push-quay.yml +++ b/.github/workflows/build-push-quay.yml @@ -4,10 +4,11 @@ on: branches: - v0.9.6 - 0.9.7-dev - - climb + - tbprofiler paths: - '**/Dockerfile*' - "bin/" + - "resources/" workflow_dispatch: @@ -46,6 +47,7 @@ jobs: - name: Copy folders to docker run: | cp -r bin docker/bin + cp -r resources docker/resources - name: Get image name id: image_name diff --git a/docker/Dockerfile.vcfpredict-0.9.8r1 b/docker/Dockerfile.vcfpredict-0.9.8r1 new file mode 100644 index 0000000..72d8ad4 --- /dev/null +++ b/docker/Dockerfile.vcfpredict-0.9.8r1 @@ -0,0 +1,78 @@ +FROM mambaorg/micromamba:jammy + +LABEL maintainer="pricea35@cardiff.ac.uk" \ +about.summary="container for the vcf 
predict workflow" + +COPY bin/ /opt/bin/ +COPY resources/tuberculosis.fa ~/tuberculosis.fa + +ENV PATH=/opt/bin:$PATH + +ARG TBPROFILER_VER="5.0.1" + +# this version is the shortened commit hash on the `master` branch here https://github.com/jodyphelan/tbdb/ +# commits are found on https://github.com/jodyphelan/tbdb/commits/master +# this was the latest commit as of 2023-10-26 +ARG TBDB_VER="e25540b" + +# install tb-profiler via bioconda; install into 'base' conda env +RUN micromamba install --yes --name base --channel conda-forge --channel bioconda \ + tb-profiler=${TBPROFILER_VER} && \ + micromamba clean --all --yes + +# hardcode 'base' env bin into PATH, so conda env does not have to be "activated" at run time +ENV PATH="/opt/conda/bin:${PATH}" + +# Version of database can be confirmed at /opt/conda/share/tbprofiler/tbdb.version.json +# can also run 'tb-profiler list_db' to find the same version info +# In 5.0.1 updating_tbdb does not work with tb-profiler update_tbdb --commit ${TBDB_VER} +RUN tb-profiler update_tbdb --commit ${TBDB_VER} + +ENV PACKAGES="procps curl wget git build-essential libhdf5-dev libffi-dev r-base-core jq" \ +PYTHON="python3 python3-pip python3-dev" + +ENV vcfmix_version=d4693344bf612780723e39ce27c8ae3868f95417 + + +RUN apt-get update \ +&& DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata \ +&& apt-get install -y $PACKAGES $PYTHON \ +&& apt-get install -y python3-packaging \ +&& git clone https://github.com/JeremyWesthead/VCFMIX.git \ +&& cd VCFMIX \ +&& git checkout ${vcfmix_version} \ +&& pip3 install recursive_diff \ +&& pip3 install . \ +&& cp -r data /usr/local/lib/python3.8/dist-packages \ +&& cd .. + +#taken and adapted from staphb/tbprofiler +ARG TBPROFILER_VER="5.0.1" + +# this version is the shortened commit hash on the `master` branch here https://github.com/jodyphelan/tbdb/ +# commits are found on https://github.com/jodyphelan/tbdb/commits/master +# this was the latest commit as of 2023-10-26 +ARG TBDB_VER="e25540b" + +# Install dependencies via apt-get; cleanup apt garbage +RUN apt-get update && apt-get install -y --no-install-recommends \ + wget \ + ca-certificates \ + procps && \ + apt-get autoclean && rm -rf /var/lib/apt/lists/* + +# install tb-profiler via bioconda; install into 'base' conda env +RUN micromamba install --yes --name base --channel conda-forge --channel bioconda \ + tb-profiler=${TBPROFILER_VER} && \ + micromamba clean --all --yes + +# hardcode 'base' env bin into PATH, so conda env does not have to be "activated" at run time +ENV PATH="/opt/conda/bin:${PATH}" + +# Version of database can be confirmed at /opt/conda/share/tbprofiler/tbdb.version.json +# can also run 'tb-profiler list_db' to find the same version info +# In 5.0.1 updating_tbdb does not work with tb-profiler update_tbdb --commit ${TBDB_VER} +RUN tb-profiler update_tbdb --commit ${TBDB_VER} + +#pre-add our TB reference +RUN tb-profiler update_tbdb --match_ref ~/tuberculosis.fa \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 27a0fcd..9ed781c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -98,7 +98,7 @@ profiles { withLabel:high_memory { memory = '18GB' } withLabel:preprocessing { - container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.7r9" + container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.8" } withLabel:getversion{ @@ -121,11 +121,11 @@ profiles { } withLabel:clockwork { - container = "quay.io/pathogen-genomics-cymru/clockwork:0.9.7r3" + container = 
"quay.io/pathogen-genomics-cymru/clockwork:0.9.8" } withLabel:vcfpredict { - container = "quay.io/pathogen-genomics-cymru/vcfpredict:0.9.7r3" + container = "quay.io/pathogen-genomics-cymru/vcfpredict:0.9.8r1" } } params{ @@ -161,11 +161,11 @@ profiles { withLabel:high_memory { memory = '18GB' } withLabel:getversion { - container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.7" + container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.8" } withLabel:preprocessing { - container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.7" + container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.8" } withName:downloadContamGenomes { @@ -182,11 +182,11 @@ profiles { } withLabel:clockwork { - container = "quay.io/pathogen-genomics-cymru/clockwork:0.9.7" + container = "quay.io/pathogen-genomics-cymru/clockwork:0.9.8r1" } withLabel:vcfpredict { - container = "quay.io/pathogen-genomics-cymru/vcfpredict:0.9.7" + container = "quay.io/pathogen-genomics-cymru/vcfpredict:0.9.8r1" } } @@ -215,11 +215,11 @@ profiles { withLabel:high_memory { memory = '18GB' } withLabel:getversion { - container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.7" + container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.8" } withLabel:preprocessing { - container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.7" + container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.8" } withName:downloadContamGenomes { @@ -235,11 +235,11 @@ profiles { } withLabel:clockwork { - container = "quay.io/pathogen-genomics-cymru/clockwork:0.9.7" + container = "quay.io/pathogen-genomics-cymru/clockwork:0.9.8" } withLabel:vcfpredict { - container = "quay.io/pathogen-genomics-cymru/vcfpredict:0.9.7" + container = "quay.io/pathogen-genomics-cymru/vcfpredict:0.9.8r1" } } } @@ -264,11 +264,11 @@ profiles { withLabel:high_memory { memory = '18GB' } withLabel:getversion { - container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.7" + container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.8" } withLabel:preprocessing { - container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.7" + container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.8" } withName:downloadContamGenomes { @@ -284,11 +284,11 @@ profiles { } withLabel:clockwork { - container = "quay.io/pathogen-genomics-cymru/clockwork:0.9.7" + container = "quay.io/pathogen-genomics-cymru/clockwork:0.9.8" } withLabel:vcfpredict { - container = "quay.io/pathogen-genomics-cymru/vcfpredict:0.9.7" + container = "quay.io/pathogen-genomics-cymru/vcfpredict:0.9.8" } } } From bef7e8c61fb83d7d3a2ce9af2cfc0c5b60568e16 Mon Sep 17 00:00:00 2001 From: whalleyt Date: Thu, 7 Dec 2023 14:03:56 +0000 Subject: [PATCH 03/44] rm Dockerfile --- docker/Dockerfile.vcfpredict-0.9.8 | 51 ------------------------------ 1 file changed, 51 deletions(-) delete mode 100644 docker/Dockerfile.vcfpredict-0.9.8 diff --git a/docker/Dockerfile.vcfpredict-0.9.8 b/docker/Dockerfile.vcfpredict-0.9.8 deleted file mode 100644 index 68d928e..0000000 --- a/docker/Dockerfile.vcfpredict-0.9.8 +++ /dev/null @@ -1,51 +0,0 @@ -FROM ubuntu:20.04 - -LABEL maintainer="pricea35@cardiff.ac.uk" \ -about.summary="container for the vcf predict workflow" - -ENV PACKAGES="procps curl wget git build-essential libhdf5-dev libffi-dev r-base-core jq" \ -PYTHON="python3 python3-pip python3-dev" - -ENV vcfmix_version=d4693344bf612780723e39ce27c8ae3868f95417 \ -gumpy_version=1.0.15 \ -piezo_version=0.3 \ -gnomonicus_version=1.1.2 \ -tuberculosis_amr_catalogues=12d38733ad2e238729a3de9f725081e1d4872968 
- -COPY bin/ /opt/bin/ -ENV PATH=/opt/bin:$PATH - - -RUN apt-get update \ -&& DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata \ -&& apt-get install -y $PACKAGES $PYTHON \ -&& apt-get install -y python3-packaging \ -&& git clone https://github.com/JeremyWesthead/VCFMIX.git \ -&& cd VCFMIX \ -&& git checkout ${vcfmix_version} \ -&& pip3 install recursive_diff \ -&& pip3 install awscli \ -&& pip3 install . \ -&& cp -r data /usr/local/lib/python3.8/dist-packages \ -&& cd .. - -RUN curl -fsSL https://github.com/oxfordmmm/gumpy/archive/refs/tags/v${gumpy_version}.tar.gz | tar -xz \ -&& cd gumpy-${gumpy_version} \ -&& pip3 install . \ -&& cd .. - -RUN curl -fsSL https://github.com/oxfordmmm/piezo/archive/refs/tags/v${piezo_version}.tar.gz | tar -xz \ -&& cd piezo-${piezo_version} \ -&& pip3 install . \ -&& cd .. - -RUN curl -fsSL https://github.com/oxfordmmm/gnomonicus/archive/refs/tags/v${gnomonicus_version}.tar.gz | tar -xz \ -&& cd gnomonicus-${gnomonicus_version} \ -&& pip3 install . \ -&& cd .. - -RUN git clone https://github.com/oxfordmmm/tuberculosis_amr_catalogues.git \ -&& cd tuberculosis_amr_catalogues \ -&& git checkout ${tuberculosis_amr_catalogues} \ -&& cd .. - From 1b5187104328522e345e1fddc67918608648b285 Mon Sep 17 00:00:00 2001 From: whalleyt Date: Thu, 7 Dec 2023 14:16:19 +0000 Subject: [PATCH 04/44] docker update --- docker/Dockerfile.vcfpredict-0.9.8r1 | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/Dockerfile.vcfpredict-0.9.8r1 b/docker/Dockerfile.vcfpredict-0.9.8r1 index 72d8ad4..72e7dee 100644 --- a/docker/Dockerfile.vcfpredict-0.9.8r1 +++ b/docker/Dockerfile.vcfpredict-0.9.8r1 @@ -15,6 +15,7 @@ ARG TBPROFILER_VER="5.0.1" # this was the latest commit as of 2023-10-26 ARG TBDB_VER="e25540b" + # install tb-profiler via bioconda; install into 'base' conda env RUN micromamba install --yes --name base --channel conda-forge --channel bioconda \ tb-profiler=${TBPROFILER_VER} && \ From a72c6d1d925e0be7cc525df666f1c8a4b1555bd7 Mon Sep 17 00:00:00 2001 From: whalleyt Date: Thu, 7 Dec 2023 14:22:38 +0000 Subject: [PATCH 05/44] fa to fasta --- docker/Dockerfile.vcfpredict-0.9.8r1 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.vcfpredict-0.9.8r1 b/docker/Dockerfile.vcfpredict-0.9.8r1 index 72e7dee..39874f3 100644 --- a/docker/Dockerfile.vcfpredict-0.9.8r1 +++ b/docker/Dockerfile.vcfpredict-0.9.8r1 @@ -4,7 +4,7 @@ LABEL maintainer="pricea35@cardiff.ac.uk" \ about.summary="container for the vcf predict workflow" COPY bin/ /opt/bin/ -COPY resources/tuberculosis.fa ~/tuberculosis.fa +COPY resources/tuberculosis.fasta ~/tuberculosis.fasta ENV PATH=/opt/bin:$PATH @@ -76,4 +76,4 @@ ENV PATH="/opt/conda/bin:${PATH}" RUN tb-profiler update_tbdb --commit ${TBDB_VER} #pre-add our TB reference -RUN tb-profiler update_tbdb --match_ref ~/tuberculosis.fa \ No newline at end of file +RUN tb-profiler update_tbdb --match_ref ~/tuberculosis.fasta \ No newline at end of file From 2a809fb9603013bc5f0f0c2cc3f35d76075debac Mon Sep 17 00:00:00 2001 From: whalleyt Date: Fri, 8 Dec 2023 10:25:54 +0000 Subject: [PATCH 06/44] tb-profiler in docker --- docker/Dockerfile.vcfpredict-0.9.8r1 | 1 - 1 file changed, 1 deletion(-) diff --git a/docker/Dockerfile.vcfpredict-0.9.8r1 b/docker/Dockerfile.vcfpredict-0.9.8r1 index 39874f3..f781423 100644 --- a/docker/Dockerfile.vcfpredict-0.9.8r1 +++ b/docker/Dockerfile.vcfpredict-0.9.8r1 @@ -34,7 +34,6 @@ PYTHON="python3 python3-pip python3-dev" ENV vcfmix_version=d4693344bf612780723e39ce27c8ae3868f95417 - 
RUN apt-get update \ && DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata \ && apt-get install -y $PACKAGES $PYTHON \ From bf1a6c89886c697b22a1003f0e0b476d697ed9a3 Mon Sep 17 00:00:00 2001 From: whalleyt Date: Fri, 8 Dec 2023 12:02:00 +0000 Subject: [PATCH 07/44] new container for tbprofiler --- docker/Dockerfile.tbprofiler-0.9.8 | 46 ++++++++++++++++++++++++++++++ docker/Dockerfile.vcfpredict-0.9.8 | 22 ++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 docker/Dockerfile.tbprofiler-0.9.8 create mode 100644 docker/Dockerfile.vcfpredict-0.9.8 diff --git a/docker/Dockerfile.tbprofiler-0.9.8 b/docker/Dockerfile.tbprofiler-0.9.8 new file mode 100644 index 0000000..a6d1671 --- /dev/null +++ b/docker/Dockerfile.tbprofiler-0.9.8 @@ -0,0 +1,46 @@ +FROM mambaorg/micromamba:jammy + +LABEL maintainer="whalleyt@cardiff.ac.uk" \ +about.summary="container for the tb-profiler" + +COPY bin/ /opt/bin/ +COPY resources/tuberculosis.fasta ~/tuberculosis.fasta + +ENV PATH=/opt/bin:$PATH + +ARG TBPROFILER_VER="5.0.1" + +# this version is the shortened commit hash on the `master` branch here https://github.com/jodyphelan/tbdb/ +# commits are found on https://github.com/jodyphelan/tbdb/commits/master +# this was the latest commit as of 2023-10-26 +ARG TBDB_VER="e25540b" + +# Install dependencies via apt-get; cleanup apt garbage +RUN apt-get update && apt-get install -y --no-install-recommends \ + wget \ + ca-certificates \ + procps && \ + apt-get autoclean && rm -rf /var/lib/apt/lists/* + +# install tb-profiler via bioconda; install into 'base' conda env +RUN micromamba install --yes --name base --channel conda-forge --channel bioconda \ + tb-profiler=${TBPROFILER_VER} && \ + micromamba clean --all --yes +# hardcode 'base' env bin into PATH, so conda env does not have to be "activated" at run time +ENV PATH="/opt/conda/bin:${PATH}" + +# Version of database can be confirmed at /opt/conda/share/tbprofiler/tbdb.version.json +# can also run 'tb-profiler list_db' to find the same version info +# In 5.0.1 updating_tbdb does not work with tb-profiler update_tbdb --commit ${TBDB_VER} +RUN tb-profiler update_tbdb --commit ${TBDB_VER} + +# hardcode 'base' env bin into PATH, so conda env does not have to be "activated" at run time +ENV PATH="/opt/conda/bin:${PATH}" + +# Version of database can be confirmed at /opt/conda/share/tbprofiler/tbdb.version.json +# can also run 'tb-profiler list_db' to find the same version info +# In 5.0.1 updating_tbdb does not work with tb-profiler update_tbdb --commit ${TBDB_VER} +RUN tb-profiler update_tbdb --commit ${TBDB_VER} + +#pre-add our TB reference +RUN tb-profiler update_tbdb --match_ref ~/tuberculosis.fasta \ No newline at end of file diff --git a/docker/Dockerfile.vcfpredict-0.9.8 b/docker/Dockerfile.vcfpredict-0.9.8 new file mode 100644 index 0000000..068303f --- /dev/null +++ b/docker/Dockerfile.vcfpredict-0.9.8 @@ -0,0 +1,22 @@ +FROM ubuntu:20.04 + +LABEL maintainer="pricea35@cardiff.ac.uk" \ +about.summary="container for the vcf predict workflow" + +ENV PACKAGES="procps curl wget git build-essential libhdf5-dev libffi-dev r-base-core jq" \ +PYTHON="python3 python3-pip python3-dev" + +ENV vcfmix_version=d4693344bf612780723e39ce27c8ae3868f95417 \ + +RUN apt-get update \ +&& DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata \ +&& apt-get install -y $PACKAGES $PYTHON \ +&& apt-get install -y python3-packaging \ +&& git clone https://github.com/JeremyWesthead/VCFMIX.git \ +&& cd VCFMIX \ +&& git checkout ${vcfmix_version} \ +&& 
pip3 install recursive_diff \ +&& pip3 install awscli \ +&& pip3 install . \ +&& cp -r data /usr/local/lib/python3.8/dist-packages \ +&& cd .. \ No newline at end of file From 7ac2f3bca1a916f9e45c1c09c239dd333e9f0af1 Mon Sep 17 00:00:00 2001 From: whalleyt Date: Fri, 8 Dec 2023 12:20:36 +0000 Subject: [PATCH 08/44] change base image of tbprofiler docker --- docker/Dockerfile.tbprofiler-0.9.8 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.tbprofiler-0.9.8 b/docker/Dockerfile.tbprofiler-0.9.8 index a6d1671..873a407 100644 --- a/docker/Dockerfile.tbprofiler-0.9.8 +++ b/docker/Dockerfile.tbprofiler-0.9.8 @@ -1,4 +1,4 @@ -FROM mambaorg/micromamba:jammy +FROM mambaorg/micromamba:1.3.0 LABEL maintainer="whalleyt@cardiff.ac.uk" \ about.summary="container for the tb-profiler" @@ -43,4 +43,4 @@ ENV PATH="/opt/conda/bin:${PATH}" RUN tb-profiler update_tbdb --commit ${TBDB_VER} #pre-add our TB reference -RUN tb-profiler update_tbdb --match_ref ~/tuberculosis.fasta \ No newline at end of file +RUN tb-profiler update_tbdb --match_ref ~/tuberculosis.fasta From df55e2ca94a03f05fe72e2e88f5613428ed669a4 Mon Sep 17 00:00:00 2001 From: whalleyt Date: Fri, 8 Dec 2023 13:34:26 +0000 Subject: [PATCH 09/44] tb-profiler container --- docker/Dockerfile.tbprofiler-0.9.8 | 36 +++++++------ docker/Dockerfile.vcfpredict-0.9.8r1 | 78 ---------------------------- 2 files changed, 20 insertions(+), 94 deletions(-) delete mode 100644 docker/Dockerfile.vcfpredict-0.9.8r1 diff --git a/docker/Dockerfile.tbprofiler-0.9.8 b/docker/Dockerfile.tbprofiler-0.9.8 index 873a407..468d706 100644 --- a/docker/Dockerfile.tbprofiler-0.9.8 +++ b/docker/Dockerfile.tbprofiler-0.9.8 @@ -1,12 +1,10 @@ -FROM mambaorg/micromamba:1.3.0 +FROM mambaorg/micromamba:1.3.0 as app -LABEL maintainer="whalleyt@cardiff.ac.uk" \ -about.summary="container for the tb-profiler" +#copy the reference genome to pre-compute our index +COPY resources/tuberculosis.fasta /data/tuberculosis.fasta -COPY bin/ /opt/bin/ -COPY resources/tuberculosis.fasta ~/tuberculosis.fasta - -ENV PATH=/opt/bin:$PATH +USER root +WORKDIR / ARG TBPROFILER_VER="5.0.1" @@ -15,6 +13,19 @@ ARG TBPROFILER_VER="5.0.1" # this was the latest commit as of 2023-10-26 ARG TBDB_VER="e25540b" +# LABEL instructions tag the image with metadata that might be important to the user +LABEL base.image="micromamba:1.3.0" +LABEL dockerfile.version="1" +LABEL software="tbprofiler" +LABEL software.version="${TBPROFILER_VER}" +LABEL description="The pipeline aligns reads to the H37Rv reference using bowtie2, BWA or minimap2 and then calls variants using bcftools. These variants are then compared to a drug-resistance database." 
+LABEL website="https://github.com/jodyphelan/TBProfiler/" +LABEL license="https://github.com/jodyphelan/TBProfiler/blob/master/LICENSE" +LABEL maintainer="John Arnn" +LABEL maintainer.email="jarnn@utah.gov" +LABEL maintainer2="Curtis Kapsak" +LABEL maintainer2.email="kapsakcj@gmail.com" + # Install dependencies via apt-get; cleanup apt garbage RUN apt-get update && apt-get install -y --no-install-recommends \ wget \ @@ -26,13 +37,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ RUN micromamba install --yes --name base --channel conda-forge --channel bioconda \ tb-profiler=${TBPROFILER_VER} && \ micromamba clean --all --yes -# hardcode 'base' env bin into PATH, so conda env does not have to be "activated" at run time -ENV PATH="/opt/conda/bin:${PATH}" - -# Version of database can be confirmed at /opt/conda/share/tbprofiler/tbdb.version.json -# can also run 'tb-profiler list_db' to find the same version info -# In 5.0.1 updating_tbdb does not work with tb-profiler update_tbdb --commit ${TBDB_VER} -RUN tb-profiler update_tbdb --commit ${TBDB_VER} # hardcode 'base' env bin into PATH, so conda env does not have to be "activated" at run time ENV PATH="/opt/conda/bin:${PATH}" @@ -42,5 +46,5 @@ ENV PATH="/opt/conda/bin:${PATH}" # In 5.0.1 updating_tbdb does not work with tb-profiler update_tbdb --commit ${TBDB_VER} RUN tb-profiler update_tbdb --commit ${TBDB_VER} -#pre-add our TB reference -RUN tb-profiler update_tbdb --match_ref ~/tuberculosis.fasta +WORKDIR /data +RUN tb-profiler update_tbdb --match_ref tuberculosis.fasta diff --git a/docker/Dockerfile.vcfpredict-0.9.8r1 b/docker/Dockerfile.vcfpredict-0.9.8r1 deleted file mode 100644 index f781423..0000000 --- a/docker/Dockerfile.vcfpredict-0.9.8r1 +++ /dev/null @@ -1,78 +0,0 @@ -FROM mambaorg/micromamba:jammy - -LABEL maintainer="pricea35@cardiff.ac.uk" \ -about.summary="container for the vcf predict workflow" - -COPY bin/ /opt/bin/ -COPY resources/tuberculosis.fasta ~/tuberculosis.fasta - -ENV PATH=/opt/bin:$PATH - -ARG TBPROFILER_VER="5.0.1" - -# this version is the shortened commit hash on the `master` branch here https://github.com/jodyphelan/tbdb/ -# commits are found on https://github.com/jodyphelan/tbdb/commits/master -# this was the latest commit as of 2023-10-26 -ARG TBDB_VER="e25540b" - - -# install tb-profiler via bioconda; install into 'base' conda env -RUN micromamba install --yes --name base --channel conda-forge --channel bioconda \ - tb-profiler=${TBPROFILER_VER} && \ - micromamba clean --all --yes - -# hardcode 'base' env bin into PATH, so conda env does not have to be "activated" at run time -ENV PATH="/opt/conda/bin:${PATH}" - -# Version of database can be confirmed at /opt/conda/share/tbprofiler/tbdb.version.json -# can also run 'tb-profiler list_db' to find the same version info -# In 5.0.1 updating_tbdb does not work with tb-profiler update_tbdb --commit ${TBDB_VER} -RUN tb-profiler update_tbdb --commit ${TBDB_VER} - -ENV PACKAGES="procps curl wget git build-essential libhdf5-dev libffi-dev r-base-core jq" \ -PYTHON="python3 python3-pip python3-dev" - -ENV vcfmix_version=d4693344bf612780723e39ce27c8ae3868f95417 - -RUN apt-get update \ -&& DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata \ -&& apt-get install -y $PACKAGES $PYTHON \ -&& apt-get install -y python3-packaging \ -&& git clone https://github.com/JeremyWesthead/VCFMIX.git \ -&& cd VCFMIX \ -&& git checkout ${vcfmix_version} \ -&& pip3 install recursive_diff \ -&& pip3 install . 
\ -&& cp -r data /usr/local/lib/python3.8/dist-packages \ -&& cd .. - -#taken and adapted from staphb/tbprofiler -ARG TBPROFILER_VER="5.0.1" - -# this version is the shortened commit hash on the `master` branch here https://github.com/jodyphelan/tbdb/ -# commits are found on https://github.com/jodyphelan/tbdb/commits/master -# this was the latest commit as of 2023-10-26 -ARG TBDB_VER="e25540b" - -# Install dependencies via apt-get; cleanup apt garbage -RUN apt-get update && apt-get install -y --no-install-recommends \ - wget \ - ca-certificates \ - procps && \ - apt-get autoclean && rm -rf /var/lib/apt/lists/* - -# install tb-profiler via bioconda; install into 'base' conda env -RUN micromamba install --yes --name base --channel conda-forge --channel bioconda \ - tb-profiler=${TBPROFILER_VER} && \ - micromamba clean --all --yes - -# hardcode 'base' env bin into PATH, so conda env does not have to be "activated" at run time -ENV PATH="/opt/conda/bin:${PATH}" - -# Version of database can be confirmed at /opt/conda/share/tbprofiler/tbdb.version.json -# can also run 'tb-profiler list_db' to find the same version info -# In 5.0.1 updating_tbdb does not work with tb-profiler update_tbdb --commit ${TBDB_VER} -RUN tb-profiler update_tbdb --commit ${TBDB_VER} - -#pre-add our TB reference -RUN tb-profiler update_tbdb --match_ref ~/tuberculosis.fasta \ No newline at end of file From 2a8d5e82d8d46ad6dbe2b0e22158192b9b542459 Mon Sep 17 00:00:00 2001 From: whalleyt Date: Tue, 12 Dec 2023 13:30:50 +0000 Subject: [PATCH 10/44] try another docker push --- docker/Dockerfile.tbprofiler-0.9.8 | 1 + docker/Dockerfile.vcfpredict-0.9.8 | 1 + 2 files changed, 2 insertions(+) diff --git a/docker/Dockerfile.tbprofiler-0.9.8 b/docker/Dockerfile.tbprofiler-0.9.8 index 468d706..bcbffad 100644 --- a/docker/Dockerfile.tbprofiler-0.9.8 +++ b/docker/Dockerfile.tbprofiler-0.9.8 @@ -8,6 +8,7 @@ WORKDIR / ARG TBPROFILER_VER="5.0.1" + # this version is the shortened commit hash on the `master` branch here https://github.com/jodyphelan/tbdb/ # commits are found on https://github.com/jodyphelan/tbdb/commits/master # this was the latest commit as of 2023-10-26 diff --git a/docker/Dockerfile.vcfpredict-0.9.8 b/docker/Dockerfile.vcfpredict-0.9.8 index 068303f..ca94910 100644 --- a/docker/Dockerfile.vcfpredict-0.9.8 +++ b/docker/Dockerfile.vcfpredict-0.9.8 @@ -8,6 +8,7 @@ PYTHON="python3 python3-pip python3-dev" ENV vcfmix_version=d4693344bf612780723e39ce27c8ae3868f95417 \ +#apt updates RUN apt-get update \ && DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata \ && apt-get install -y $PACKAGES $PYTHON \ From 76fc0c8874d35d0fd4c06eaf68caf415c56b026f Mon Sep 17 00:00:00 2001 From: whalleyt Date: Wed, 13 Dec 2023 13:46:46 +0000 Subject: [PATCH 11/44] tb-profiler db and run on vcf --- main.nf | 4 ++-- modules/vcfpredictModules.nf | 36 ++++++++++++++++++++++++++++++++++++ workflows/clockwork.nf | 1 + workflows/vcfpredict.nf | 29 ++++++++++++----------------- 4 files changed, 51 insertions(+), 19 deletions(-) diff --git a/main.nf b/main.nf index 140be73..9b46430 100644 --- a/main.nf +++ b/main.nf @@ -205,9 +205,9 @@ workflow { mpileup_vcf = clockwork.out.mpileup_vcf minos_vcf = clockwork.out.minos_vcf - genbank = channel.fromPath(params.gnomonicus_genbank) + reference = clockwork.out.reference - vcfpredict(mpileup_vcf, minos_vcf, genbank) + vcfpredict(mpileup_vcf, minos_vcf, reference) } diff --git a/modules/vcfpredictModules.nf b/modules/vcfpredictModules.nf index cee38b6..7a1f382 100644 --- 
a/modules/vcfpredictModules.nf +++ b/modules/vcfpredictModules.nf @@ -48,6 +48,42 @@ process vcfmix { """ } +process tbprofiler_update_db { + label 'low_memory' + label 'low_cpu' + label 'tbprofiler' + + input: + path(reference) + + script: + """ + tb-profiler update_tbdb --match_ref $reference + """ +} + +process tbprofiler { + label 'medium_memory' + label 'medium_cpu' + label 'tbprofiler' + + input: + val(sample_name) + path(minos_vcf) + + output: + path("results/tbprofiler.results.json") + + when: + isSampleTB =~ /CREATE\_ANTIBIOGRAM\_${sample_name}/ + + script: + """ + bgzip ${minos_vcf} + tb-profiler profile --vcf ${minos_vcf}.gz --threads ${task.cpus} + """ +} + process gnomonicus { tag {sample_name} diff --git a/workflows/clockwork.nf b/workflows/clockwork.nf index 3ffbaa0..bf6f5e1 100644 --- a/workflows/clockwork.nf +++ b/workflows/clockwork.nf @@ -39,5 +39,6 @@ workflow clockwork { mpileup_vcf = callVarsMpileup.out.mpileup_vcf.join(minos.out.minos_report, by: 0) minos_vcf = minos.out.minos_vcf.join(alignToRef.out.alignToRef_report, by: 0) + reference = getRefFromJSON.out } diff --git a/workflows/vcfpredict.nf b/workflows/vcfpredict.nf index 9efc651..2006e73 100644 --- a/workflows/vcfpredict.nf +++ b/workflows/vcfpredict.nf @@ -3,36 +3,31 @@ nextflow.enable.dsl = 2 // import modules include {vcfmix} from '../modules/vcfpredictModules.nf' params(params) -include {gnomonicus} from '../modules/vcfpredictModules.nf' params(params) -include {finalJson} from '../modules/vcfpredictModules.nf' params(params) +include {tbprofiler} from '../modules/vcfpredictModules.nf' params(params) +include {tbprofiler_update_db} from '../modules/vcfpredictModules.nf' params(params) // define workflow component workflow vcfpredict { take: - - clockwork_bcftools - clockwork_minos - genbank + clockwork_bcftools_tuple + minos_vcf_tuple + reference_fasta main: if ( params.vcfmix == "yes" ) { - vcfmix(clockwork_bcftools) + vcfmix(clockwork_bcftools_tuple) } - if ( params.gnomonicus == "yes" ) { - - gnomonicus(clockwork_minos, genbank) + if ( params.resistance_profiler == "tb-profiler"){ + //get just the vcf + minos_vcf = minos_vcf_tuple.map{it[1]} + sample_name = minos_vcf_tuple.map{it[0]} + tbprofiler_update_db(reference_fasta) + tbprofiler(sample_name, minos_vcf) } - - if ( (params.vcfmix == "yes") && (params.gnomonicus == "yes") ) { - - finalJson(vcfmix.out.vcfmix_json.join(gnomonicus.out.gnomon_json, by: 0)) - - } - } From ee5ab97bdfa62c092456728a6bd948a754442e42 Mon Sep 17 00:00:00 2001 From: whalleyt Date: Thu, 14 Dec 2023 11:26:54 +0000 Subject: [PATCH 12/44] update config to remove gnomonicus params --- nextflow.config | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/nextflow.config b/nextflow.config index 9ed781c..5a4e0f5 100644 --- a/nextflow.config +++ b/nextflow.config @@ -43,16 +43,9 @@ params { // run VCFMIX 'yes' or 'no' (set to no for synthetic samples) vcfmix = 'yes' - - // run gnomonicus 'yes' or 'no' - gnomonicus = 'yes' resistance_profiler = "tb-profiler" - // path to AMR catalogue for gnomon - // https://github.com/oxfordmmm/tuberculosis_amr_catalogues available at path /tuberculosis_amr_catalogues in container - amr_cat = "/tuberculosis_amr_catalogues/catalogues/NC_000962.3/NC_000962.3_WHO-UCN-GTB-PCI-2021.7_v1.0_GARC1_RUS.csv" - // path to singularity recipes directory (needed to strip software versions in getversion) sing_dir = "${baseDir}/singularity" @@ -65,8 +58,6 @@ params { //path to resources directory resource_dir = "${baseDir}/resources" refseq = 
"${resource_dir}/assembly_summary_refseq.txt" - gnomonicus_genbank = "${resource_dir}/H37rV_v3.gbk" - } profiles { @@ -135,9 +126,7 @@ profiles { afanc_myco_db = "s3://microbial-bioin-sp3/Mycobacteriaciae_DB_7.0/" resource_dir = "s3://microbial-bioin-sp3/lodestone_resources" - refseq = "${resource_dir}/assembly_summary_refseq.txt" - gnomonicus_genbank = "${resource_dir}/H37rV_v3.gbk" - + refseq = "${resource_dir}/assembly_summary_refseq.txt" } } singularity { From cc39928e26c0c117127984f16f5f0c2db057ff78 Mon Sep 17 00:00:00 2001 From: whalleyt Date: Mon, 8 Jan 2024 19:24:10 +0000 Subject: [PATCH 13/44] change logic of end condition --- modules/vcfpredictModules.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/vcfpredictModules.nf b/modules/vcfpredictModules.nf index 7a1f382..776ff0a 100644 --- a/modules/vcfpredictModules.nf +++ b/modules/vcfpredictModules.nf @@ -33,7 +33,7 @@ process vcfmix { jq -s ".[0] * .[1]" ${sample_name}_report_previous.json ${sample_name}_f-stats.json > ${report_json} - if [ ${params.gnomonicus} == "no" ]; then echo '{"complete":"workflow complete without error"}' | jq '.' > ${error_log} && jq -s ".[0] * .[1] * .[2]" ${error_log} ${sample_name}_report_previous.json ${sample_name}_f-stats.json > ${report_json}; fi + if [ ${params.resistance_profiler} == "none" ]; then echo '{"complete":"workflow complete without error"}' | jq '.' > ${error_log} && jq -s ".[0] * .[1] * .[2]" ${error_log} ${sample_name}_report_previous.json ${sample_name}_f-stats.json > ${report_json}; fi """ stub: From d6af590e2b0bf368e242d1eeb668f4aac96cb4de Mon Sep 17 00:00:00 2001 From: whalleyt Date: Mon, 8 Jan 2024 19:50:44 +0000 Subject: [PATCH 14/44] python paths in vcfmix docker --- docker/Dockerfile.vcfpredict-0.9.8 | 4 ++++ nextflow.config | 24 +++++++++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile.vcfpredict-0.9.8 b/docker/Dockerfile.vcfpredict-0.9.8 index ca94910..3139c59 100644 --- a/docker/Dockerfile.vcfpredict-0.9.8 +++ b/docker/Dockerfile.vcfpredict-0.9.8 @@ -3,6 +3,10 @@ FROM ubuntu:20.04 LABEL maintainer="pricea35@cardiff.ac.uk" \ about.summary="container for the vcf predict workflow" +#add run-vcf to container +COPY bin/ /opt/bin/ +ENV PATH=/opt/bin:$PATH + ENV PACKAGES="procps curl wget git build-essential libhdf5-dev libffi-dev r-base-core jq" \ PYTHON="python3 python3-pip python3-dev" diff --git a/nextflow.config b/nextflow.config index 5a4e0f5..9b76d20 100644 --- a/nextflow.config +++ b/nextflow.config @@ -58,6 +58,7 @@ params { //path to resources directory resource_dir = "${baseDir}/resources" refseq = "${resource_dir}/assembly_summary_refseq.txt" + container_enabled = "false" } profiles { @@ -92,6 +93,10 @@ profiles { container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.8" } + withLabel:tbprofiler { + container = "twhalley93/tb-profiler:latest" + } + withLabel:getversion{ executor = "local" } @@ -116,10 +121,12 @@ profiles { } withLabel:vcfpredict { - container = "quay.io/pathogen-genomics-cymru/vcfpredict:0.9.8r1" + container = "quay.io/pathogen-genomics-cymru/vcfpredict:0.9.8" } } params{ + container_enabled = "true" + bowtie2_index = "s3://microbial-bioin-sp3/bowtie_hg19" bowtie_index_name = "hg19_1kgmaj" kraken_db = "s3://microbial-bioin-sp3/kraken_pluspf_16gb/" @@ -133,6 +140,7 @@ profiles { params{ resource_dir = "/resources" + container_enabled = "true" } singularity.enabled = 'true' @@ -156,6 +164,10 @@ profiles { withLabel:preprocessing { container = 
"quay.io/pathogen-genomics-cymru/preprocessing:0.9.8" } + + withLabel:tbprofiler { + container = "twhalley93/tb-profiler:latest" + } withName:downloadContamGenomes { shell = ['/bin/bash','-u'] @@ -191,6 +203,7 @@ profiles { params{ resource_dir = "/resources" + container_enabled = "true" } process { @@ -210,6 +223,10 @@ profiles { withLabel:preprocessing { container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.8" } + + withLabel:tbprofiler { + container = "twhalley93/tb-profiler:latest" + } withName:downloadContamGenomes { shell = ['/bin/bash','-u'] @@ -241,6 +258,7 @@ profiles { runOptions = "-u \$(id -u):\$(id -g)" params{ + container_enabled = "true" resource_dir = "/resources" } @@ -279,6 +297,10 @@ profiles { withLabel:vcfpredict { container = "quay.io/pathogen-genomics-cymru/vcfpredict:0.9.8" } + + withLabel:tbprofiler { + container = "twhalley93/tb-profiler:latest" + } } } } From eea6e25ef61c0959c2c96839b618c2857ac645ec Mon Sep 17 00:00:00 2001 From: whalleyt Date: Tue, 9 Jan 2024 16:21:36 +0000 Subject: [PATCH 15/44] linted indents --- main.nf | 51 +++++++++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/main.nf b/main.nf index 9b46430..0a9cbdf 100644 --- a/main.nf +++ b/main.nf @@ -36,24 +36,24 @@ Produces as output one directory per sample, containing the relevant reports & a Mandatory and conditional parameters: ------------------------------------------------------------------------ --input_dir Directory containing fastq OR bam files. Workflow will process one or the other, so don't mix ---filetype File type in input_dir. One of either "fastq" or "bam". fastq files can be gzipped and do not +--filetype File type in input_dir. One of either "fastq" or "bam". fastq files can be gzipped and do not have to literally take the form "*.fastq"; see --pattern --pattern Regex to match files in input_dir, e.g. "*_R{1,2}.fq.gz". Only mandatory if --filetype is "fastq" --output_dir Output directory, in which will be created subdirectories matching base name of fastq/bam files ---unmix_myco Do you want to disambiguate mixed-mycobacterial samples by read alignment? One of "yes" or "no" - If "yes" workflow will remove reads mapping to any minority mycobacterial genomes but in doing so +--unmix_myco Do you want to disambiguate mixed-mycobacterial samples by read alignment? One of "yes" or "no" + If "yes" workflow will remove reads mapping to any minority mycobacterial genomes but in doing so WILL ALMOST CERTAINLY ALSO reduce coverage of the principal species - If "no" then mixed-mycobacterial samples will be left alone. Mixtures of mycobacteria + non-mycobacteria + If "no" then mixed-mycobacterial samples will be left alone. Mixtures of mycobacteria + non-mycobacteria will still be disambiguated --kraken_db Directory containing Kraken2 database files (obtain from https://benlangmead.github.io/aws-indexes/k2) --bowtie2_index Directory containing Bowtie2 index (obtain from ftp://ftp.ccb.jhu.edu/pub/data/bowtie2_indexes/hg19_1kgmaj_bt2.zip This is the Langmead lab pre-built major-allele-SNP reference; see https://github.com/BenLangmead/bowtie-majref) --bowtie_index_name Name of the bowtie index, e.g. hg19_1kgmaj ---vcfmix Run VFCMIX "yes" or "no". Should be set to "no" for synthetic samples +--vcfmix Run VFCMIX "yes" or "no". Should be set to "no" for synthetic samples --resistance_profiler Tool to profile resistance with. 
At the moment options are "tb-profiler" or "none" --amr_cat Path to the AMR catalogue (https://github.com/oxfordmmm/tuberculosis_amr_catalogues is at /tuberculosis_amr_catalogues in the vcfpredict container) ---afanc_myco_db Path to the Afanc database used for speciation. Obtain from https://s3.climb.ac.uk/microbial-bioin-sp3/Mycobacteriaciae_DB_3.0.tar.gz +--afanc_myco_db Path to the Afanc database used for speciation. Obtain from https://s3.climb.ac.uk/microbial-bioin-sp3/Mycobacteriaciae_DB_3.0.tar.gz Optional parameters: ------------------------------------------------------------------------ @@ -63,17 +63,17 @@ Optional parameters: default: null using this parameter will apply an additional sanity test to your sample - if you DO NOT use this parameter (default option), pipeline will determine principal species from + if you DO NOT use this parameter (default option), pipeline will determine principal species from the reads and consider any other species a contaminant - if you DO use this parameter, pipeline will expect this to be the principal species. It will fail - the sample if reads from this species are not actually the majority + If you DO use this parameter, pipeline will expect this to be the principal species. It will fail + the sample if reads from this species are not actually the majority Profiles: ------------------------------------------------------------------------ singularity to run with singularity -docker to run with docker +docker to run with docker Examples: @@ -88,10 +88,18 @@ nextflow run main.nf -profile docker --filetype bam --input_dir bam_dir --unmix_ resistance_profilers = ["tb-profiler", "none"] - if(!resistance_profilers.contains(params.resistance_profiler)){ +if(!resistance_profilers.contains(params.resistance_profiler)){ exit 1, 'Invalid resistance profiler. Must be one of "tb-profiler" or "none" to skip.' 
} +//tbprofiler container already has the reference genome in the DB, so skip if using docker +if((params.resistance_profiler == "tb-profiler") && (params.container_enabled == true)) { + update_tbprofiler = true +} else { + update_tbprofiler = false +} + +resistance_profiler = params.resistance_profiler // confirm that mandatory parameters have been set and that the conditional parameter, --pattern, has been used appropriately if ( params.input_dir == "" ) { @@ -125,18 +133,17 @@ M Y C O B A C T E R I A L P I P E L I N E Parameters used: ------------------------------------------------------------------------ ---input_dir ${params.input_dir} ---filetype ${params.filetype} ---pattern ${params.pattern} ---output_dir ${params.output_dir} ---unmix_myco ${params.unmix_myco} ---kraken_db ${params.kraken_db} +--input_dir ${params.input_dir} +--filetype ${params.filetype} +--pattern ${params.pattern} +--output_dir ${params.output_dir} +--unmix_myco ${params.unmix_myco} +--kraken_db ${params.kraken_db} --bowtie2_index ${params.bowtie2_index} --bowtie_index_name ${params.bowtie_index_name} ---species ${params.species} ---vcfmix ${params.vcfmix} ---gnomonicus ${params.gnomonicus} ---amr_cat ${params.amr_cat} +--resistance_profiler ${params.resistance_profiler} +--species ${params.species} +--vcfmix ${params.vcfmix} --afanc_myco_db ${params.afanc_myco_db} Runtime data: @@ -207,7 +214,7 @@ workflow { minos_vcf = clockwork.out.minos_vcf reference = clockwork.out.reference - vcfpredict(mpileup_vcf, minos_vcf, reference) + vcfpredict(mpileup_vcf, minos_vcf, reference, resistance_profiler, update_tbprofiler) } From f6783dba1ca08a10ce9af9cc8807374accbf74f2 Mon Sep 17 00:00:00 2001 From: whalleyt Date: Wed, 10 Jan 2024 10:12:02 +0000 Subject: [PATCH 16/44] update readme, tidy up and remove gnomonicus references --- README.md | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 5c44791..4fac53e 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Pipeline cleans and QCs reads with fastp and FastQC, classifies with Kraken2 & A Note that while Mykrobe is included within this pipeline, it runs as an independent process and is not used for any downstream reporting. -**WARNING**: There are currently known errors with vcfmix and gnomonicus, as such `errorStrategy 'ignore'` has been added to the processes vcfpredict:vcfmix and vcfpredict:gnomonicus to stop the pipeline from crashing. Please check the stdout from nextflow to see whether these processes have ran successfully. +**WARNING**: There are currently known errors with vcfmix, as such `errorStrategy 'ignore'` has been added to the processes vcfpredict:vcfmix to stop the pipeline from crashing. Please check the stdout from nextflow to see whether these processes have ran successfully. ## Quick Start ## This is a Nextflow DSL2 pipeline, it requires a version of Nextflow that supports DSL2 and the stub-run feature. It is recommended to run the pipeline with `NXF_VER=20.11.0-edge`, as the pipeline has been tested using this version. E.g. to download @@ -29,6 +29,8 @@ NXF_VER=20.11.0-edge nextflow run main.nf -profile docker --filetype bam --input --output_dir . --kraken_db /path/to/database --bowtie2_index /path/to/index --bowtie_index_name hg19_1kgmaj ``` +There is also a pre-configured climb profile to run Lodestone on a CLIMB Jupyter Notebook Server. Add ```-profile climb``` to your command invocation. The input directory can point to an S3 bucket natively (e.g. ```--input_dir s3://my-team/bucket```). 
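For example, a minimal sketch of an invocation on CLIMB (the bucket name and output directory here are placeholders, and the Kraken2, Bowtie2 and Afanc database parameters are assumed to come from the profile defaults described below):

```
NXF_VER=20.11.0-edge nextflow run main.nf -profile climb --filetype fastq --pattern "*_R{1,2}.fq.gz" \
--input_dir s3://my-team/bucket --unmix_myco no --output_dir results
```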
By default this will run the workflow in Docker containers and take advantage of kubernetes pods. The Kraken2, Bowtie2 and Afanc databases point by default to the ```pluspf16```, ```hg19_1kgmaj_bt2``` and ```Mycobacteriaciae_DB_7.0``` directories. These are mounted on a public S3 bucket hosted on CLIMB.
+
### Executors ###
By default, the pipeline will just run on the local machine. To run on a cluster, modifications will have to be made to the `nextflow.config` to add in the executor. E.g. for a SLURM cluster add `process.executor = 'slurm'`. For more information on executor options see the Nextflow docs: https://www.nextflow.io/docs/latest/executor.html
@@ -63,10 +65,8 @@
Directory containing Bowtie2 index (obtain from ftp://ftp.ccb.jhu.edu/pub/data/bowtie2_indexes/hg19_1kgmaj_bt2.zip
This is the Langmead lab pre-built major-allele-SNP reference; see https://github.com/BenLangmead/bowtie-majref)<br />
* **bowtie_index_name**<br />
Name of the bowtie index, e.g. hg19_1kgmaj<br />
* **vcfmix**
Run [vcfmix](https://github.com/AlexOrlek/VCFMIX), yes or no. Set to no for synthetic samples
-* **gnomonicus**
-Run [gnomonicus](https://github.com/oxfordmmm/gnomonicus), yes or no
-* **amr_cat**
-Path to AMR catalogue for gnomonicus
+* **resistance_profiler**
Run resistance profiling for Mycobacterium tuberculosis. Either ["tb-profiler"](https://tbdr.lshtm.ac.uk/) or "none".
Path to the [afanc](https://github.com/ArthurVM/Afanc) database used for speciation. Obtain from https://s3.climb.ac.uk/microbial-bioin-sp3/Mycobacteriaciae_DB_7.0.tar.gz
@@ -125,10 +125,7 @@ process clockwork:alignToRef\ 25. (Fail) If < 50% of the reference genome was covered at 10-fold depth process clockwork:minos\ -26. (Warn) If sample is not TB, then it is not passed to gnomonicus - -## Running on CLIMB Jupyter Hub -There is a pre-configured climb profile to run Lodestone on a CLIMB Jupyter Notebook Server. Add ```profile climb``` to your command invocation. The input directory can point to an S3 bucket natively (e.g. ```--input_dir s3://my-team/bucket```). By default this will run the workflow in Docker containers and take advantage of kubernetes pods. The Kraken2, Bowtie2 and Afanc databases will by default point to the ```pluspf16```, ```hg19_1kgmaj_bt2``` and ```Mycobacteriaciae_DB_7.0``` respectively. These are mounted on a public shared volume. +26. (Warn) If sample is not TB, then it is not passed to a resistance profiler ## Acknowledgements ## For a list of direct authors of this pipeline, please see the contributors list. All of the software dependencies of this pipeline are recorded in the version.json From 7851f5979c37286edc6293af0853f54d3f00528c Mon Sep 17 00:00:00 2001 From: whalleyt Date: Wed, 10 Jan 2024 11:18:17 +0000 Subject: [PATCH 17/44] remove reference to gnomonicus --- modules/clockworkModules.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/clockworkModules.nf b/modules/clockworkModules.nf index 4a2675d..d793a34 100644 --- a/modules/clockworkModules.nf +++ b/modules/clockworkModules.nf @@ -206,7 +206,7 @@ process callVarsCortex { process minos { /** - * @QCcheckpoint check if top species is TB, if yes pass vcf to gnomonicus + * @QCcheckpoint check if top species is TB, if yes pass vcf to resistance profiling */ tag { sample_name } @@ -241,7 +241,7 @@ process minos { cp ${sample_name}_report.json ${sample_name}_report_previous.json - if [[ \$top_hit =~ ^"Mycobacterium tuberculosis" ]]; then printf "CREATE_ANTIBIOGRAM_${sample_name}"; else echo '{"gnomonicus-warning":"sample is not TB so cannot produce antibiogram using gnomonicus"}' | jq '.' > ${error_log} && printf "no" && jq -s ".[0] * .[1]" ${error_log} ${sample_name}_report_previous.json > ${report_json}; fi + if [[ \$top_hit =~ ^"Mycobacterium tuberculosis" ]]; then printf "CREATE_ANTIBIOGRAM_${sample_name}"; else echo '{"resistance-profiling-warning":"sample is not TB so cannot produce antibiogram using resistance profiling tools"}' | jq '.' > ${error_log} && printf "no" && jq -s ".[0] * .[1]" ${error_log} ${sample_name}_report_previous.json > ${report_json}; fi """ stub: @@ -296,7 +296,7 @@ process gvcf { cp ${sample_name}_report.json ${sample_name}_report_previous.json - if [ ${params.vcfmix} == "no" ] && [ ${params.gnomonicus} == "no" ]; then echo '{"complete":"workflow complete without error"}' | jq '.' > ${error_log} && jq -s ".[0] * .[1]" ${error_log} ${sample_name}_report_previous.json > ${report_json}; fi + if [ ${params.vcfmix} == "no" ] && [ ${params.resistance_profiler} == "none" ]; then echo '{"complete":"workflow complete without error"}' | jq '.' 
> ${error_log} && jq -s ".[0] * .[1]" ${error_log} ${sample_name}_report_previous.json > ${report_json}; fi """ stub: From 3245b569662d0a526a6787545c13b1f34510dbe5 Mon Sep 17 00:00:00 2001 From: whalleyt Date: Wed, 10 Jan 2024 15:52:58 +0000 Subject: [PATCH 18/44] push tbpofiler docker --- docker/Dockerfile.tbprofiler-0.9.8 | 1 - 1 file changed, 1 deletion(-) diff --git a/docker/Dockerfile.tbprofiler-0.9.8 b/docker/Dockerfile.tbprofiler-0.9.8 index bcbffad..468d706 100644 --- a/docker/Dockerfile.tbprofiler-0.9.8 +++ b/docker/Dockerfile.tbprofiler-0.9.8 @@ -8,7 +8,6 @@ WORKDIR / ARG TBPROFILER_VER="5.0.1" - # this version is the shortened commit hash on the `master` branch here https://github.com/jodyphelan/tbdb/ # commits are found on https://github.com/jodyphelan/tbdb/commits/master # this was the latest commit as of 2023-10-26 From 7c7968068542c54dc94a4eb5624be2ad6f10b0a1 Mon Sep 17 00:00:00 2001 From: whalleyt Date: Wed, 10 Jan 2024 15:53:51 +0000 Subject: [PATCH 19/44] nextflow config for docker tbprofiler --- nextflow.config | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nextflow.config b/nextflow.config index 9b76d20..d268fb4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -94,7 +94,7 @@ profiles { } withLabel:tbprofiler { - container = "twhalley93/tb-profiler:latest" + container = "quay.io/pathogen-genomics-cymru//tb-profiler:latest" } withLabel:getversion{ @@ -166,7 +166,7 @@ profiles { } withLabel:tbprofiler { - container = "twhalley93/tb-profiler:latest" + container = "quay.io/pathogen-genomics-cymru//tb-profiler:latest" } withName:downloadContamGenomes { @@ -225,7 +225,7 @@ profiles { } withLabel:tbprofiler { - container = "twhalley93/tb-profiler:latest" + container = "quay.io/pathogen-genomics-cymru//tb-profiler:latest" } withName:downloadContamGenomes { @@ -299,7 +299,7 @@ profiles { } withLabel:tbprofiler { - container = "twhalley93/tb-profiler:latest" + container = "quay.io/pathogen-genomics-cymru//tb-profiler:latest" } } } From dc9ff40f3ce3a224909d36b39390d1a3413dc123 Mon Sep 17 00:00:00 2001 From: whalleyt Date: Wed, 10 Jan 2024 16:43:45 +0000 Subject: [PATCH 20/44] add config to tidy up main config --- config/containers.config | 51 ++++++++ nextflow.config | 262 +++++---------------------------------- 2 files changed, 84 insertions(+), 229 deletions(-) create mode 100644 config/containers.config diff --git a/config/containers.config b/config/containers.config new file mode 100644 index 0000000..ae30606 --- /dev/null +++ b/config/containers.config @@ -0,0 +1,51 @@ +params{ + container_enabled = "true" + pipeline_version = 0.9.8 + container_enabled = "true" + resource_dir = "/resources" +} + + +process { + errorStrategy = 'ignore' + update_tbprofiler = "false" + + + withLabel:low_cpu {cpus = 2} + withLabel:normal_cpu { cpus = 8 } + withLabel:low_memory { memory = '5GB' } + withLabel:medium_memory { memory = '10GB' } + withLabel:high_memory { memory = '18GB' } + + withLabel:getversion { + container = "quay.io/pathogen-genomics-cymru/preprocessing:${pipeline_version}" + } + + withLabel:preprocessing { + container = "quay.io/pathogen-genomics-cymru/preprocessing:${pipeline_version}" + } + + withLabel:tbprofiler { + container = "quay.io/pathogen-genomics-cymru//tb-profiler:${pipeline_version}" + } + + withName:downloadContamGenomes { + shell = ['/bin/bash','-u'] + errorStrategy = { task.exitStatus in 100..113 ? 
'retry' : 'terminate' } + maxRetries = 5 + } + + withLabel:retryAfanc { + shell = ['/bin/bash','-u'] + errorStrategy = {task.exitStatus == 1 ? 'retry' : 'ignore' } + maxRetries = 5 + } + + withLabel:clockwork { + container = "quay.io/pathogen-genomics-cymru/clockwork:${pipeline_version}" + } + + withLabel:vcfpredict { + container = "quay.io/pathogen-genomics-cymru/vcfpredict:${pipeline_version}" + } + } \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index d268fb4..4747852 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,13 +1,3 @@ -// config for lodestone - -manifest { - name = "pathogen-genomics-cymru/lodestone" -} - - -trace.overwrite = true -report.overwrite = true - params { // help message @@ -45,6 +35,7 @@ params { vcfmix = 'yes' resistance_profiler = "tb-profiler" + update_tbprofiler = "true" // path to singularity recipes directory (needed to strip software versions in getversion) sing_dir = "${baseDir}/singularity" @@ -62,245 +53,58 @@ params { } profiles { - climb { - - //this is pre-defined in the CLIMB nextflow.config; however it has been added to allow - //-profile climb to still work outside of CLIMB system (e.g. to access S3 buckets) - aws { - profile = "climb" - client { - endpoint = 'https://s3.climb.ac.uk' - s3PathStyleAccess = true - } - } - - docker.enabled = true - fixOwnership = true - runOptions = "-u \$(id -u):\$(id -g)" - - // define containers for each process - process { - k8s { - pullPolicy = "always" - } - withLabel:low_cpu {cpus = 2} - withLabel:normal_cpu { cpus = 8 } - withLabel:low_memory { memory = '5GB' } - withLabel:medium_memory { memory = '10GB' } - withLabel:high_memory { memory = '18GB' } - - withLabel:preprocessing { - container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.8" - } - - withLabel:tbprofiler { - container = "quay.io/pathogen-genomics-cymru//tb-profiler:latest" - } - - withLabel:getversion{ - executor = "local" - } - - withLabel:afanc_parse{ - executor = "local" - } - withName:downloadContamGenomes { - shell = ['/bin/bash','-u'] - errorStrategy = { task.exitStatus in 100..113 ? 'retry' : 'terminate' } - maxRetries = 5 - } - - withLabel:retry_afanc { - shell = ['/bin/bash','-u'] - errorStrategy = {task.exitStatus == 1 ? 
'retry' : 'ignore' } - maxRetries = 5 - } - - withLabel:clockwork { - container = "quay.io/pathogen-genomics-cymru/clockwork:0.9.8" - } - - withLabel:vcfpredict { - container = "quay.io/pathogen-genomics-cymru/vcfpredict:0.9.8" - } - } - params{ - container_enabled = "true" - - bowtie2_index = "s3://microbial-bioin-sp3/bowtie_hg19" - bowtie_index_name = "hg19_1kgmaj" - kraken_db = "s3://microbial-bioin-sp3/kraken_pluspf_16gb/" - afanc_myco_db = "s3://microbial-bioin-sp3/Mycobacteriaciae_DB_7.0/" + climb { + includeConfig 'config/containers.config' + + //add in docker configs as the above config file is generic for any containerised run + docker.enabled = true + fixOwnership = true + runOptions = "-u \$(id -u):\$(id -g)" + + //params specific to paths on the climb system + params{ + bowtie2_index = "s3://microbial-bioin-sp3/bowtie_hg19" + bowtie_index_name = "hg19_1kgmaj" + kraken_db = "s3://microbial-bioin-sp3/kraken_pluspf_16gb/" + afanc_myco_db = "s3://microbial-bioin-sp3/Mycobacteriaciae_DB_7.0/" - resource_dir = "s3://microbial-bioin-sp3/lodestone_resources" - refseq = "${resource_dir}/assembly_summary_refseq.txt" + resource_dir = "s3://microbial-bioin-sp3/lodestone_resources" + refseq = "${resource_dir}/assembly_summary_refseq.txt" } - } - singularity { - - params{ - resource_dir = "/resources" - container_enabled = "true" } + + singularity { + includeConfig 'config/containers.config' - singularity.enabled = 'true' - singularity.autoMounts = 'true' - - // path to the singularity containers - singularity.cacheDir = "${baseDir}/singularity" - - process { - withLabel:low_cpu {cpus = 2} - withLabel:normal_cpu { cpus = 8 } - - withLabel:low_memory { memory = '5GB' } - withLabel:medium_memory { memory = '10GB' } - withLabel:high_memory { memory = '18GB' } - - withLabel:getversion { - container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.8" - } - - withLabel:preprocessing { - container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.8" - } - - withLabel:tbprofiler { - container = "quay.io/pathogen-genomics-cymru//tb-profiler:latest" - } - - withName:downloadContamGenomes { - shell = ['/bin/bash','-u'] - errorStrategy = { task.exitStatus in 100..113 ? 'retry' : 'terminate' } - maxRetries = 5 - } - - withLabel:retryAfanc { - shell = ['/bin/bash','-u'] - // Afanc sometimes fails curl in slurm, retry if so (error is masked as error status 1) - errorStrategy = {task.exitStatus == 1 ? 
'retry' : 'ignore' } - maxRetries = 5 - } - - withLabel:clockwork { - container = "quay.io/pathogen-genomics-cymru/clockwork:0.9.8r1" - } - - withLabel:vcfpredict { - container = "quay.io/pathogen-genomics-cymru/vcfpredict:0.9.8r1" - } + singularity.enabled = 'true' + singularity.autoMounts = 'true' + //path to the singularity containers + singularity.cacheDir = "${baseDir}/singularity" } - } sp3 { - + + includeConfig 'config/containers.config' + + //add in singularity configs as the above config file is generic for any containerised run singularity.enabled = 'true' singularity.autoMounts = 'true' - // path to the singularity containers singularity.cacheDir = "/data/images" - - params{ - resource_dir = "/resources" - container_enabled = "true" - } process { - scratch = true - errorStrategy = 'ignore' - - withLabel:low_cpu {cpus = 2} - withLabel:normal_cpu { cpus = 8 } - withLabel:low_memory { memory = '5GB' } - withLabel:medium_memory { memory = '10GB' } - withLabel:high_memory { memory = '18GB' } - - withLabel:getversion { - container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.8" - } - - withLabel:preprocessing { - container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.8" - } - - withLabel:tbprofiler { - container = "quay.io/pathogen-genomics-cymru//tb-profiler:latest" - } - - withName:downloadContamGenomes { - shell = ['/bin/bash','-u'] - errorStrategy = { task.exitStatus in 100..113 ? 'retry' : 'terminate' } - maxRetries = 5 - } - - withLabel:retryAfanc { - shell = ['/bin/bash','-u'] - errorStrategy = {task.exitStatus == 1 ? 'retry' : 'ignore' } - maxRetries = 5 - } - - withLabel:clockwork { - container = "quay.io/pathogen-genomics-cymru/clockwork:0.9.8" - } - - withLabel:vcfpredict { - container = "quay.io/pathogen-genomics-cymru/vcfpredict:0.9.8r1" - } + scratch = true } - } + } docker { - + includeConfig 'config/containers.config' + + //add in docker configs as the above config file is generic for any containerised run docker.enabled = true fixOwnership = true runOptions = "-u \$(id -u):\$(id -g)" - - params{ - container_enabled = "true" - resource_dir = "/resources" - } - - // define containers for each process - process { - withLabel:low_cpu {cpus = 2} - withLabel:normal_cpu { cpus = 8 } - withLabel:low_memory { memory = '5GB' } - withLabel:medium_memory { memory = '10GB' } - withLabel:high_memory { memory = '18GB' } - - withLabel:getversion { - container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.8" - } - - withLabel:preprocessing { - container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.8" - } - - withName:downloadContamGenomes { - shell = ['/bin/bash','-u'] - errorStrategy = { task.exitStatus in 100..113 ? 'retry' : 'terminate' } - maxRetries = 5 - } - - withLabel:retryAfanc { - shell = ['/bin/bash','-u'] - errorStrategy = {task.exitStatus == 1 ? 
'retry' : 'ignore' }
-              maxRetries = 5
-          }
-
-          withLabel:clockwork {
-              container = "quay.io/pathogen-genomics-cymru/clockwork:0.9.8"
-          }
-
-          withLabel:vcfpredict {
-              container = "quay.io/pathogen-genomics-cymru/vcfpredict:0.9.8"
-          }
-
-          withLabel:tbprofiler {
-              container = "quay.io/pathogen-genomics-cymru//tb-profiler:latest"
-          }
-      }
   }
 }

From 3d266f5cba666ac874bda12d46b8b8ee175a7088 Mon Sep 17 00:00:00 2001
From: whalleyt
Date: Wed, 10 Jan 2024 17:28:38 +0000
Subject: [PATCH 21/44] config include to avoid repetition

---
 config/containers.config | 11 +++++------
 nextflow.config          | 10 ++++++++++
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/config/containers.config b/config/containers.config
index ae30606..c1bfc80 100644
--- a/config/containers.config
+++ b/config/containers.config
@@ -1,6 +1,5 @@
 params{
     container_enabled = "true"
-    pipeline_version = 0.9.8
     container_enabled = "true"
     resource_dir = "/resources"
 }
@@ -18,15 +17,15 @@ process {
     withLabel:high_memory { memory = '18GB' }
 
     withLabel:getversion {
-        container = "quay.io/pathogen-genomics-cymru/preprocessing:${pipeline_version}"
+        container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.8"
     }
 
     withLabel:preprocessing {
-        container = "quay.io/pathogen-genomics-cymru/preprocessing:${pipeline_version}"
+        container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.8"
     }
 
     withLabel:tbprofiler {
-        container = "quay.io/pathogen-genomics-cymru//tb-profiler:${pipeline_version}"
+        container = "quay.io/pathogen-genomics-cymru/tbprofiler:0.9.8"
     }
 
     withName:downloadContamGenomes {
@@ -42,10 +41,10 @@ process {
     }
 
     withLabel:clockwork {
-        container = "quay.io/pathogen-genomics-cymru/clockwork:${pipeline_version}"
+        container = "quay.io/pathogen-genomics-cymru/clockwork:0.9.8"
     }
 
     withLabel:vcfpredict {
-        container = "quay.io/pathogen-genomics-cymru/vcfpredict:${pipeline_version}"
+        container = "quay.io/pathogen-genomics-cymru/vcfpredict:0.9.8"
     }
 }
\ No newline at end of file
diff --git a/nextflow.config b/nextflow.config
index 4747852..c8b15d1 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -61,6 +61,16 @@ profiles {
         fixOwnership = true
         runOptions = "-u \$(id -u):\$(id -g)"
 
+        withLabel:getversion{
+            executor = "local"
+            container = null
+        }
+
+        withLabel:afanc_parse{
+            executor = "local"
+            container = null
+        }
+
         //params specific to paths on the climb system
         params{
             bowtie2_index = "s3://microbial-bioin-sp3/bowtie_hg19"

From 84f137dc85b9c293dd81e1923821ad2552c710a3 Mon Sep 17 00:00:00 2001
From: whalleyt
Date: Thu, 11 Jan 2024 10:16:52 +0000
Subject: [PATCH 22/44] gatk4 to tb-profiler container

---
 docker/Dockerfile.tbprofiler-0.9.8 | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docker/Dockerfile.tbprofiler-0.9.8 b/docker/Dockerfile.tbprofiler-0.9.8
index 468d706..b7f6d80 100644
--- a/docker/Dockerfile.tbprofiler-0.9.8
+++ b/docker/Dockerfile.tbprofiler-0.9.8
@@ -36,6 +36,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 # install tb-profiler via bioconda; install into 'base' conda env
 RUN micromamba install --yes --name base --channel conda-forge --channel bioconda \
     tb-profiler=${TBPROFILER_VER} && \
+    micromamba install -c bioconda -c conda-forge gatk4 && \
     micromamba clean --all --yes
 
 # hardcode 'base' env bin into PATH, so conda env does not have to be "activated" at run time

From f013273053fa224f7f3e7e118bcb4e1780c5445b Mon Sep 17 00:00:00 2001
From: whalleyt
Date: Thu, 11 Jan 2024 10:31:13 +0000
Subject: [PATCH 23/44] tb-profiler docker container gatk

---
 docker/Dockerfile.tbprofiler-0.9.8 | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/docker/Dockerfile.tbprofiler-0.9.8 b/docker/Dockerfile.tbprofiler-0.9.8
index b7f6d80..b1f8b9a 100644
--- a/docker/Dockerfile.tbprofiler-0.9.8
+++ b/docker/Dockerfile.tbprofiler-0.9.8
@@ -35,9 +35,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 
 # install tb-profiler via bioconda; install into 'base' conda env
 RUN micromamba install --yes --name base --channel conda-forge --channel bioconda \
-    tb-profiler=${TBPROFILER_VER} && \
-    micromamba install -c bioconda -c conda-forge gatk4 && \
-    micromamba clean --all --yes
+    tb-profiler=${TBPROFILER_VER}
+
+RUN micromamba install -c bioconda -c conda-forge gatk4
+RUN micromamba clean --all --yes
 
 # hardcode 'base' env bin into PATH, so conda env does not have to be "activated" at run time
 ENV PATH="/opt/conda/bin:${PATH}"

From 616ef139464b695970f4182257d56610fb2d25e6 Mon Sep 17 00:00:00 2001
From: whalleyt
Date: Thu, 11 Jan 2024 10:35:41 +0000
Subject: [PATCH 24/44] tb-profiler docker container gatk

---
 docker/Dockerfile.tbprofiler-0.9.8 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile.tbprofiler-0.9.8 b/docker/Dockerfile.tbprofiler-0.9.8
index b1f8b9a..5575ee4 100644
--- a/docker/Dockerfile.tbprofiler-0.9.8
+++ b/docker/Dockerfile.tbprofiler-0.9.8
@@ -37,7 +37,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 RUN micromamba install --yes --name base --channel conda-forge --channel bioconda \
     tb-profiler=${TBPROFILER_VER}
 
-RUN micromamba install -c bioconda -c conda-forge gatk4
+RUN micromamba install --yes --name base --channel conda-forge --channel bioconda gatk4
 RUN micromamba clean --all --yes
 
 # hardcode 'base' env bin into PATH, so conda env does not have to be "activated" at run time

From c7fd33c19657b5e74c1657e8aa9f3181196b6b7c Mon Sep 17 00:00:00 2001
From: whalleyt
Date: Thu, 11 Jan 2024 10:51:54 +0000
Subject: [PATCH 25/44] allelic depth

---
 modules/vcfpredictModules.nf | 26 ++++++++++++++++++++++++++
 workflows/vcfpredict.nf      | 16 ++++++++++++++--
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/modules/vcfpredictModules.nf b/modules/vcfpredictModules.nf
index 776ff0a..98eb7aa 100644
--- a/modules/vcfpredictModules.nf
+++ b/modules/vcfpredictModules.nf
@@ -70,6 +70,7 @@ process tbprofiler {
     input:
     val(sample_name)
     path(minos_vcf)
+    val(isSampleTB)
 
     output:
    path("results/tbprofiler.results.json")
@@ -84,6 +85,31 @@ process tbprofiler {
     """
 }
 
+process add_allelic_depth {
+    label 'low_memory'
+    label 'low_cpu'
+    label 'tbprofiler'
+
+    input:
+    val(sample_name)
+    path(minos_vcf)
+    path(reference)
+    val(isSampleTB)
+
+    output:
+    path("${sample_name}_allelic_depth.minos.vcf")
+
+    when:
+    isSampleTB =~ /CREATE\_ANTIBIOGRAM\_${sample_name}/
+
+    script:
+    """
+    samtools faidx $reference
+    gatk VariantAnnotator -R $reference -V $minos_vcf -A DepthPerAlleleBySample -O ${sample_name}_allelic_depth.minos.vcf
+    """
+
+}
+
 process gnomonicus {
 
     tag {sample_name}
diff --git a/workflows/vcfpredict.nf b/workflows/vcfpredict.nf
index 2006e73..b932325 100644
--- a/workflows/vcfpredict.nf
+++ b/workflows/vcfpredict.nf
@@ -5,6 +5,7 @@ nextflow.enable.dsl = 2
 include {vcfmix} from '../modules/vcfpredictModules.nf' params(params)
 include {tbprofiler} from '../modules/vcfpredictModules.nf' params(params)
 include {tbprofiler_update_db} from '../modules/vcfpredictModules.nf' params(params)
+include {add_allelic_depth} from '../modules/vcfpredictModules.nf' params(params)
 
 // define workflow component
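// For reference: the add_allelic_depth process introduced above is roughly
// equivalent to running the following commands by hand, with placeholder file
// names (ref.fa and sample.minos.vcf are illustrative, not taken from this
// patch). A later patch in this series also adds the `samtools dict` step,
// which GATK expects alongside the .fai index:
//
//   samtools faidx ref.fa                    # FASTA index (.fai)
//   samtools dict ref.fa -o ref.dict         # sequence dictionary for GATK
//   gatk VariantAnnotator -R ref.fa -V sample.minos.vcf \
//       -A DepthPerAlleleBySample -O sample_allelic_depth.minos.vcf
//
// bcftools mpileup records per-allele depths (AD), but minos adjudication
// drops them, so they are re-annotated here before the VCF is handed to
// tb-profiler.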
workflow vcfpredict { @@ -13,6 +14,7 @@ workflow vcfpredict { clockwork_bcftools_tuple minos_vcf_tuple reference_fasta + main: @@ -24,10 +26,20 @@ workflow vcfpredict { if ( params.resistance_profiler == "tb-profiler"){ //get just the vcf - minos_vcf = minos_vcf_tuple.map{it[1]} sample_name = minos_vcf_tuple.map{it[0]} + minos_vcf = minos_vcf_tuple.map{it[1]} + do_we_resistance_profile = minos_vcf_tuple.map{it[2]} + if (params.update_tbprofiler == "yes"){ tbprofiler_update_db(reference_fasta) - tbprofiler(sample_name, minos_vcf) + } + + //add allelic depth back in: was calculated in mpileup but lost in minos + add_allelic_depth(sample_name, minos_vcf, reference_fasta, do_we_resistance_profile) + tbprofiler(sample_name, add_allelic_depth,out, do_we_resistance_profile) + } + + if (params.vcfmix == "yes" && params.resistance_profiler != "none"){ + //finalJson(vcfmix.out.vcfmix_json.join(gnomonicus.out.gnomon_json, by: 0)) } } From 77e234a5f958d4b06a4699462a83bea1a6ef916e Mon Sep 17 00:00:00 2001 From: whalleyt Date: Thu, 11 Jan 2024 11:07:32 +0000 Subject: [PATCH 26/44] deal with json of tbprofiler --- docker/Dockerfile.tbprofiler-0.9.8 | 1 + modules/vcfpredictModules.nf | 16 +++++++++++++++- workflows/vcfpredict.nf | 5 +++-- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile.tbprofiler-0.9.8 b/docker/Dockerfile.tbprofiler-0.9.8 index 5575ee4..1947e3b 100644 --- a/docker/Dockerfile.tbprofiler-0.9.8 +++ b/docker/Dockerfile.tbprofiler-0.9.8 @@ -38,6 +38,7 @@ RUN micromamba install --yes --name base --channel conda-forge --channel biocond tb-profiler=${TBPROFILER_VER} RUN micromamba install --yes --name base --channel conda-forge --channel bioconda gatk4 +RUN micromamba install --yes --name base --channel conda-forge --channel bioconda samtools RUN micromamba clean --all --yes # hardcode 'base' env bin into PATH, so conda env does not have to be "activated" at run time diff --git a/modules/vcfpredictModules.nf b/modules/vcfpredictModules.nf index 98eb7aa..574dbe9 100644 --- a/modules/vcfpredictModules.nf +++ b/modules/vcfpredictModules.nf @@ -66,6 +66,9 @@ process tbprofiler { label 'medium_memory' label 'medium_cpu' label 'tbprofiler' + + publishDir "${params.output_dir}/${sample_name}/antibiogram", mode: 'copy', pattern: '*.tbprofiler-out.json', overwrite: 'true' + publishDir "${params.output_dir}/$sample_name", mode: 'copy', overwrite: 'true', pattern: '*{_err.json,_report.json}' input: val(sample_name) @@ -73,15 +76,25 @@ process tbprofiler { val(isSampleTB) output: - path("results/tbprofiler.results.json") + tuple val(sample_name), path("${sample_name}.tbprofiler-out.json"), path("${sample_name}_report.json"), emit: tbprofiler_json when: isSampleTB =~ /CREATE\_ANTIBIOGRAM\_${sample_name}/ script: + error_log = "${sample_name}_err.json" + tbprofiler_json = "${sample_name}.tbprofiler-out.json" + """ bgzip ${minos_vcf} tb-profiler profile --vcf ${minos_vcf}.gz --threads ${task.cpus} + mv results/tbprofiler.results.json ${tbprofiler_json} + + cp ${sample_name}_report.json ${sample_name}_report_previous.json + + echo '{"complete":"workflow complete without error"}' | jq '.' 
> ${error_log} + + jq -s ".[0] * .[1] * .[2]" ${error_log} ${sample_name}_report_previous.json ${tbprofiler_json} > ${report_json} """ } @@ -94,6 +107,7 @@ process add_allelic_depth { val(sample_name) path(minos_vcf) path(reference) + path(report_json) val(isSampleTB) output: diff --git a/workflows/vcfpredict.nf b/workflows/vcfpredict.nf index b932325..2896984 100644 --- a/workflows/vcfpredict.nf +++ b/workflows/vcfpredict.nf @@ -29,6 +29,7 @@ workflow vcfpredict { sample_name = minos_vcf_tuple.map{it[0]} minos_vcf = minos_vcf_tuple.map{it[1]} do_we_resistance_profile = minos_vcf_tuple.map{it[2]} + report_json = minos_vcf_tuple.map{it[3]} if (params.update_tbprofiler == "yes"){ tbprofiler_update_db(reference_fasta) @@ -36,10 +37,10 @@ workflow vcfpredict { //add allelic depth back in: was calculated in mpileup but lost in minos add_allelic_depth(sample_name, minos_vcf, reference_fasta, do_we_resistance_profile) - tbprofiler(sample_name, add_allelic_depth,out, do_we_resistance_profile) + tbprofiler(sample_name, add_allelic_depth,out, report_json, do_we_resistance_profile) } if (params.vcfmix == "yes" && params.resistance_profiler != "none"){ - //finalJson(vcfmix.out.vcfmix_json.join(gnomonicus.out.gnomon_json, by: 0)) + finalJson(vcfmix.out.vcfmix_json.join(gnomonicus.out.tbprofiler_json, by: 0)) } } From 5a760f1469e3973e8189ce3c089c6f83544b0ce8 Mon Sep 17 00:00:00 2001 From: whalleyt Date: Thu, 11 Jan 2024 11:21:48 +0000 Subject: [PATCH 27/44] tidy up --- modules/vcfpredictModules.nf | 2 +- workflows/vcfpredict.nf | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/modules/vcfpredictModules.nf b/modules/vcfpredictModules.nf index 574dbe9..551788d 100644 --- a/modules/vcfpredictModules.nf +++ b/modules/vcfpredictModules.nf @@ -73,6 +73,7 @@ process tbprofiler { input: val(sample_name) path(minos_vcf) + path(report_json) val(isSampleTB) output: @@ -107,7 +108,6 @@ process add_allelic_depth { val(sample_name) path(minos_vcf) path(reference) - path(report_json) val(isSampleTB) output: diff --git a/workflows/vcfpredict.nf b/workflows/vcfpredict.nf index 2896984..375e410 100644 --- a/workflows/vcfpredict.nf +++ b/workflows/vcfpredict.nf @@ -6,6 +6,7 @@ include {vcfmix} from '../modules/vcfpredictModules.nf' params(params) include {tbprofiler} from '../modules/vcfpredictModules.nf' params(params) include {tbprofiler_update_db} from '../modules/vcfpredictModules.nf' params(params) include {add_allelic_depth} from '../modules/vcfpredictModules.nf' params(params) +include {finalJson} from '../modules/vcfpredictModules.nf' params(params) // define workflow component workflow vcfpredict { @@ -37,10 +38,10 @@ workflow vcfpredict { //add allelic depth back in: was calculated in mpileup but lost in minos add_allelic_depth(sample_name, minos_vcf, reference_fasta, do_we_resistance_profile) - tbprofiler(sample_name, add_allelic_depth,out, report_json, do_we_resistance_profile) + tbprofiler(sample_name, add_allelic_depth.out, report_json, do_we_resistance_profile) } if (params.vcfmix == "yes" && params.resistance_profiler != "none"){ - finalJson(vcfmix.out.vcfmix_json.join(gnomonicus.out.tbprofiler_json, by: 0)) + finalJson(vcfmix.out.vcfmix_json.join(tbprofiler.out.tbprofiler_json, by: 0)) } } From 255500fba15e47bb5d8365da1d22460553b8dc20 Mon Sep 17 00:00:00 2001 From: whalleyt Date: Thu, 11 Jan 2024 15:18:47 +0000 Subject: [PATCH 28/44] add jq to tbprofiler docker --- docker/Dockerfile.tbprofiler-0.9.8 | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/Dockerfile.tbprofiler-0.9.8 
b/docker/Dockerfile.tbprofiler-0.9.8 index 1947e3b..686c9c4 100644 --- a/docker/Dockerfile.tbprofiler-0.9.8 +++ b/docker/Dockerfile.tbprofiler-0.9.8 @@ -39,6 +39,7 @@ RUN micromamba install --yes --name base --channel conda-forge --channel biocond RUN micromamba install --yes --name base --channel conda-forge --channel bioconda gatk4 RUN micromamba install --yes --name base --channel conda-forge --channel bioconda samtools +RUN micromamba install --yes --name base --channel conda-forge jq RUN micromamba clean --all --yes # hardcode 'base' env bin into PATH, so conda env does not have to be "activated" at run time From 658f7bd58ce50cd7390f70de7e21833571b79c73 Mon Sep 17 00:00:00 2001 From: whalleyt Date: Thu, 11 Jan 2024 15:24:56 +0000 Subject: [PATCH 29/44] remove error ignore --- config/containers.config | 1 - 1 file changed, 1 deletion(-) diff --git a/config/containers.config b/config/containers.config index c1bfc80..dece260 100644 --- a/config/containers.config +++ b/config/containers.config @@ -6,7 +6,6 @@ params{ process { - errorStrategy = 'ignore' update_tbprofiler = "false" From 930098083249c57f5e7a15bb1b79c33627efc63e Mon Sep 17 00:00:00 2001 From: whalleyt Date: Fri, 12 Jan 2024 11:38:02 +0000 Subject: [PATCH 30/44] update workflow vcfpredict --- .../containers-checkpoint.config | 49 +++ main.nf | 2 +- .../clockworkModules-checkpoint.nf | 313 ++++++++++++++++++ .../vcfpredictModules-checkpoint.nf | 216 ++++++++++++ modules/vcfpredictModules.nf | 1 + .../clockwork-checkpoint.nf | 44 +++ .../vcfpredict-checkpoint.nf | 47 +++ 7 files changed, 671 insertions(+), 1 deletion(-) create mode 100644 config/.ipynb_checkpoints/containers-checkpoint.config create mode 100644 modules/.ipynb_checkpoints/clockworkModules-checkpoint.nf create mode 100644 modules/.ipynb_checkpoints/vcfpredictModules-checkpoint.nf create mode 100644 workflows/.ipynb_checkpoints/clockwork-checkpoint.nf create mode 100644 workflows/.ipynb_checkpoints/vcfpredict-checkpoint.nf diff --git a/config/.ipynb_checkpoints/containers-checkpoint.config b/config/.ipynb_checkpoints/containers-checkpoint.config new file mode 100644 index 0000000..dece260 --- /dev/null +++ b/config/.ipynb_checkpoints/containers-checkpoint.config @@ -0,0 +1,49 @@ +params{ + container_enabled = "true" + container_enabled = "true" + resource_dir = "/resources" +} + + +process { + update_tbprofiler = "false" + + + withLabel:low_cpu {cpus = 2} + withLabel:normal_cpu { cpus = 8 } + withLabel:low_memory { memory = '5GB' } + withLabel:medium_memory { memory = '10GB' } + withLabel:high_memory { memory = '18GB' } + + withLabel:getversion { + container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.8" + } + + withLabel:preprocessing { + container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.8" + } + + withLabel:tbprofiler { + container = "quay.io/pathogen-genomics-cymru/tbprofiler:0.9.8" + } + + withName:downloadContamGenomes { + shell = ['/bin/bash','-u'] + errorStrategy = { task.exitStatus in 100..113 ? 'retry' : 'terminate' } + maxRetries = 5 + } + + withLabel:retryAfanc { + shell = ['/bin/bash','-u'] + errorStrategy = {task.exitStatus == 1 ? 
'retry' : 'ignore' } + maxRetries = 5 + } + + withLabel:clockwork { + container = "quay.io/pathogen-genomics-cymru/clockwork:0.9.8" + } + + withLabel:vcfpredict { + container = "quay.io/pathogen-genomics-cymru/vcfpredict:0.9.8" + } + } \ No newline at end of file diff --git a/main.nf b/main.nf index 0a9cbdf..2f38ef6 100644 --- a/main.nf +++ b/main.nf @@ -214,7 +214,7 @@ workflow { minos_vcf = clockwork.out.minos_vcf reference = clockwork.out.reference - vcfpredict(mpileup_vcf, minos_vcf, reference, resistance_profiler, update_tbprofiler) + vcfpredict(mpileup_vcf, minos_vcf, reference) } diff --git a/modules/.ipynb_checkpoints/clockworkModules-checkpoint.nf b/modules/.ipynb_checkpoints/clockworkModules-checkpoint.nf new file mode 100644 index 0000000..d793a34 --- /dev/null +++ b/modules/.ipynb_checkpoints/clockworkModules-checkpoint.nf @@ -0,0 +1,313 @@ +// modules for the clockwork workflow + +process getRefFromJSON { + tag { sample_name } + label 'clockwork' + label 'low_memory' + label 'low_cpu' + + input: + path(species_json) + val(do_we_align) + val(sample_name) + + when: + do_we_align =~ /NOW\_ALIGN\_TO\_REF\_${sample_name}/ + + output: + stdout + + script: + """ + ref_string=\$(jq -r '.top_hit.file_paths.ref_fa' ${species_json}) + echo "\$ref_string" + """ + + +} + +process alignToRef { + /** + * @QCcheckpoint fail if insufficient number and/or quality of read alignments to the reference genome + */ + + tag { sample_name } + label 'clockwork' + label 'normal_cpu' + label 'medium_memory' + + publishDir "${params.output_dir}/$sample_name/output_bam", mode: 'copy', overwrite: 'true', pattern: '*{.bam,.bam.bai,_alignmentStats.json}' + publishDir "${params.output_dir}/$sample_name", mode: 'copy', overwrite: 'true', pattern: '*{_err.json,_report.json}' + + input: + tuple val(sample_name), path(fq1), path(fq2), path(software_json), path(species_json), val(doWeAlign) + path(reference_path) + + when: + doWeAlign =~ /NOW\_ALIGN\_TO\_REF\_${sample_name}/ + + output: + tuple val(sample_name), path("${sample_name}_report.json"), path("${sample_name}.bam"), path("${sample_name}.fa"), stdout, emit: alignToRef_bam + path("${sample_name}.bam.bai", emit: alignToRef_bai) + path("${sample_name}_alignmentStats.json", emit: alignToRef_json) + path "${sample_name}_err.json", emit: alignToRef_log optional true + tuple val(sample_name), path("${sample_name}_report.json"), emit: alignToRef_report + + script: + bam = "${sample_name}.bam" + bai = "${sample_name}.bam.bai" + stats = "${sample_name}.stats" + stats_json = "${sample_name}_alignmentStats.json" + report_json = "${sample_name}_report.json" + error_log = "${sample_name}_err.json" + + """ + echo $reference_path + cp ${reference_path} ${sample_name}.fa + + minimap2 -ax sr ${sample_name}.fa -t ${task.cpus} $fq1 $fq2 | samtools fixmate -m - - | samtools sort -T tmp - | samtools markdup --reference ${sample_name}.fa - minimap.bam + + java -jar /usr/local/bin/picard.jar AddOrReplaceReadGroups INPUT=minimap.bam OUTPUT=${bam} RGID=${sample_name} RGLB=lib RGPL=Illumina RGPU=unit RGSM=sample + + samtools index ${bam} ${bai} + samtools stats ${bam} > ${stats} + + parse_samtools_stats.py ${bam} ${stats} > ${stats_json} + create_final_json.py ${stats_json} ${species_json} + + cp ${sample_name}_report.json ${sample_name}_report_previous.json + + jq -s ".[0] * .[1]" ${software_json} ${sample_name}_report_previous.json > ${report_json} + + continue=\$(jq -r '.summary_questions.continue_to_clockwork' ${report_json}) + + if [ \$continue == 'yes' ]; then printf 
"NOW_VARCALL_${sample_name}"; elif [ \$continue == 'no' ]; then echo '{"error":"insufficient number and/or quality of read alignments to the reference genome"}' | jq '.' > ${error_log} && jq -s ".[0] * .[1]" ${error_log} ${sample_name}_report_previous.json > ${report_json}; fi + """ + + stub: + bam = "${sample_name}.bam" + bai = "${sample_name}.bam.bai" + stats = "${sample_name}.stats" + stats_json = "${sample_name}_alignmentStats.json" + out_json = "${sample_name}_report.json" + error_log = "${sample_name}_err.json" + + """ + touch ${sample_name}.fa + touch ${bam} + touch ${bai} + touch ${stats} + touch ${stats_json} + touch ${out_json} + touch ${error_log} + printf ${params.alignToRef_doWeVarCall} + """ +} + +process callVarsMpileup { + /** + * @QCcheckpoint none + */ + + tag { sample_name } + label 'clockwork' + label 'normal_cpu' + label 'low_memory' + + publishDir "${params.output_dir}/$sample_name/output_vcfs", mode: 'copy', pattern: '*.vcf' + + input: + tuple val(sample_name), path(report_json), path(bam), path(ref), val(doWeVarCall) + + when: + doWeVarCall =~ /NOW\_VARCALL\_${sample_name}/ + + output: + tuple val(sample_name), path("${sample_name}.bcftools.vcf"), emit: mpileup_vcf + + script: + bcftools_vcf = "${sample_name}.bcftools.vcf" + + """ + bcftools mpileup -Ou -a 'INFO/AD' -f ${ref} ${bam} | bcftools call --threads ${task.cpus} -vm -O v -o ${bcftools_vcf} + """ + + stub: + bcftools_vcf = "${sample_name}.bcftools.vcf" + + """ + touch ${bcftools_vcf} + """ +} + +process getRefCortex { + tag { sample_name } + label 'clockwork' + label 'low_memory' + label 'low_cpu' + + input: + tuple val(sample_name), path(report_json), path(bam), path(ref), val(doWeVarCall) + + when: + doWeVarCall =~ /NOW\_VARCALL\_${sample_name}/ + + output: + stdout + + script: + """ + ref_dir=\$(jq -r '.top_hit.file_paths.clockwork_ref_dir' ${report_json}) + echo "\$ref_dir" + """ + + +} + +process callVarsCortex { + /** + * @QCcheckpoint none + */ + + tag { sample_name } + label 'clockwork' + label 'normal_cpu' + label 'medium_memory' + + publishDir "${params.output_dir}/$sample_name/output_vcfs", mode: 'copy', pattern: '*.vcf' + + input: + tuple val(sample_name), path(report_json), path(bam), path(ref), val(doWeVarCall) + path(ref_dir) + + when: + doWeVarCall =~ /NOW\_VARCALL\_${sample_name}/ + + output: + tuple val(sample_name), path("${sample_name}.cortex.vcf"), emit: cortex_vcf + + script: + cortex_vcf = "${sample_name}.cortex.vcf" + + """ + cp -r ${ref_dir}/* . + + clockwork cortex . 
${bam} cortex ${sample_name} + cp cortex/cortex.out/vcfs/cortex_wk_flow_I_RefCC_FINALcombined_BC_calls_at_all_k.raw.vcf ${cortex_vcf} + """ + + stub: + cortex_vcf = "${sample_name}.cortex.vcf" + + """ + touch ${cortex_vcf} + """ +} + +process minos { + /** + * @QCcheckpoint check if top species is TB, if yes pass vcf to resistance profiling + */ + + tag { sample_name } + label 'clockwork' + label 'medium_memory' + label 'normal_cpu' + + publishDir "${params.output_dir}/$sample_name/output_vcfs", mode: 'copy', pattern: '*.vcf' + publishDir "${params.output_dir}/$sample_name", mode: 'copy', overwrite: 'true', pattern: '*{_err.json,_report.json}' + + input: + tuple val(sample_name), path(report_json), path(bam), path(ref), val(doWeVarCall), path(cortex_vcf), path(bcftools_vcf) + + output: + tuple val(sample_name), path(report_json), path(bam), path(ref), emit: minos_bam + tuple val(sample_name), path("${sample_name}.minos.vcf"), stdout, emit: minos_vcf + tuple val(sample_name), path("${sample_name}_report.json"), emit: minos_report + path "${sample_name}_err.json", emit: minos_log optional true + + script: + minos_vcf = "${sample_name}.minos.vcf" + error_log = "${sample_name}_err.json" + + """ + awk '{print \$1}' ${ref} > ref.fa + + minos adjudicate --force --reads ${bam} minos ref.fa ${bcftools_vcf} ${cortex_vcf} + cp minos/final.vcf ${minos_vcf} + rm -rf minos + + top_hit=\$(jq -r '.top_hit.name' ${report_json}) + + cp ${sample_name}_report.json ${sample_name}_report_previous.json + + if [[ \$top_hit =~ ^"Mycobacterium tuberculosis" ]]; then printf "CREATE_ANTIBIOGRAM_${sample_name}"; else echo '{"resistance-profiling-warning":"sample is not TB so cannot produce antibiogram using resistance profiling tools"}' | jq '.' > ${error_log} && printf "no" && jq -s ".[0] * .[1]" ${error_log} ${sample_name}_report_previous.json > ${report_json}; fi + """ + + stub: + minos_vcf = "${sample_name}.minos.vcf" + error_log = "${sample_name}_err.json" + + """ + touch ${minos_vcf} + touch ${error_log} + printf ${params.minos_isSampleTB} + """ +} + +process gvcf { + /** + * @QCcheckpoint none + */ + + tag { sample_name } + label 'clockwork' + label 'normal_cpu' + label 'low_memory' + + publishDir "${params.output_dir}/$sample_name/output_fasta", mode: 'copy', pattern: '*.fa' + publishDir "${params.output_dir}/$sample_name/output_vcfs", mode: 'copy', pattern: '*.vcf.gz' + publishDir "${params.output_dir}/$sample_name", mode: 'copy', overwrite: 'true', pattern: '*{_err.json,_report.json}' + + input: + tuple val(sample_name), path(report_json), path(bam), path(ref), val(doWeValCall), path(minos_vcf), val(isSampleTB) + + output: + path("${sample_name}.gvcf.vcf.gz", emit: gvcf) + path("${sample_name}.fa", emit: gvcf_fa) + path "${sample_name}_err.json", emit: gvcf_log optional true + path "${sample_name}_report.json", emit: gvcf_report optional true + + script: + gvcf = "${sample_name}.gvcf.vcf" + gvcf_fa = "${sample_name}.fa" + error_log = "${sample_name}_err.json" + + """ + awk '{print \$1}' ${ref} > ref.fa + + samtools mpileup -ugf ref.fa ${bam} | bcftools call --threads ${task.cpus} -m -O v -o samtools_all_pos.vcf + + clockwork gvcf_from_minos_and_samtools ref.fa ${minos_vcf} samtools_all_pos.vcf ${gvcf} + clockwork gvcf_to_fasta ${gvcf} ${gvcf_fa} + + rm samtools_all_pos.vcf + gzip ${gvcf} + + cp ${sample_name}_report.json ${sample_name}_report_previous.json + + if [ ${params.vcfmix} == "no" ] && [ ${params.resistance_profiler} == "none" ]; then echo '{"complete":"workflow complete without error"}' | jq '.' 
> ${error_log} && jq -s ".[0] * .[1]" ${error_log} ${sample_name}_report_previous.json > ${report_json}; fi + """ + + stub: + gvcf = "${sample_name}.gvcf.vcf.gz" + gvcf_fa = "${sample_name}.fa" + error_log = "${sample_name}_err.json" + + """ + touch ${gvcf} + touch ${gvcf_fa} + touch ${error_log} + """ +} + diff --git a/modules/.ipynb_checkpoints/vcfpredictModules-checkpoint.nf b/modules/.ipynb_checkpoints/vcfpredictModules-checkpoint.nf new file mode 100644 index 0000000..4bc7957 --- /dev/null +++ b/modules/.ipynb_checkpoints/vcfpredictModules-checkpoint.nf @@ -0,0 +1,216 @@ +// modules for the vcfpredict workflow + +process vcfmix { + + tag {sample_name} + label 'vcfpredict' + label 'low_memory' + label 'low_cpu' + + errorStrategy 'ignore' + + publishDir "${params.output_dir}/${sample_name}/output_vcfs", mode: 'copy', pattern: '*_f-stats.json', overwrite: 'true' + publishDir "${params.output_dir}/${sample_name}/output_vcfs", mode: 'copy', pattern: '*.csv', overwrite: 'true' + publishDir "${params.output_dir}/$sample_name", mode: 'copy', overwrite: 'true', pattern: '*{_err.json,_report.json}' + + input: + tuple val(sample_name), path(vcf), path(report_json) + + output: + tuple val(sample_name), path("${sample_name}_f-stats.json"), emit: vcfmix_json + tuple val(sample_name), path("${sample_name}_f-stats.json"), path("${sample_name}_vcfmix-regions.csv"), emit: vcfmix_json_csv + path "${sample_name}_err.json", emit: vcfmix_log optional true + path ("${sample_name}_report.json", emit: vcfmix_report) + + script: + bcftools_vcf = "${sample_name}.bcftools.vcf" + error_log = "${sample_name}_err.json" + + """ + run-vcfmix.py ${bcftools_vcf} + + cp ${sample_name}_report.json ${sample_name}_report_previous.json + + jq -s ".[0] * .[1]" ${sample_name}_report_previous.json ${sample_name}_f-stats.json > ${report_json} + + if [ ${params.resistance_profiler} == "none" ]; then echo '{"complete":"workflow complete without error"}' | jq '.' 
> ${error_log} && jq -s ".[0] * .[1] * .[2]" ${error_log} ${sample_name}_report_previous.json ${sample_name}_f-stats.json > ${report_json}; fi + """ + + stub: + vcfmix_json = "${sample_name}_f-stats.json" + vcfmix_csv = "${sample_name}_vcfmix-regions.csv" + error_log = "${sample_name}_err.json" + + """ + touch ${vcfmix_json} + touch ${vcfmix_csv} + touch ${error_log} + """ +} + +process tbprofiler_update_db { + label 'low_memory' + label 'low_cpu' + label 'tbprofiler' + + input: + path(reference) + + script: + """ + tb-profiler update_tbdb --match_ref $reference + """ +} + +process tbprofiler { + label 'medium_memory' + label 'medium_cpu' + label 'tbprofiler' + + publishDir "${params.output_dir}/${sample_name}/antibiogram", mode: 'copy', pattern: '*.tbprofiler-out.json', overwrite: 'true' + publishDir "${params.output_dir}/$sample_name", mode: 'copy', overwrite: 'true', pattern: '*{_err.json,_report.json}' + + input: + val(sample_name) + path(minos_vcf) + path(report_json) + val(isSampleTB) + + output: + tuple val(sample_name), path("${sample_name}.tbprofiler-out.json"), path("${sample_name}_report.json"), emit: tbprofiler_json + + when: + isSampleTB =~ /CREATE\_ANTIBIOGRAM\_${sample_name}/ + + script: + error_log = "${sample_name}_err.json" + tbprofiler_json = "${sample_name}.tbprofiler-out.json" + + """ + bgzip ${minos_vcf} + tb-profiler profile --vcf ${minos_vcf}.gz --threads ${task.cpus} + mv results/tbprofiler.results.json ${tbprofiler_json} + + cp ${sample_name}_report.json ${sample_name}_report_previous.json + + echo '{"complete":"workflow complete without error"}' | jq '.' > ${error_log} + + jq -s ".[0] * .[1] * .[2]" ${error_log} ${sample_name}_report_previous.json ${tbprofiler_json} > ${report_json} + """ +} + +process add_allelic_depth { + label 'low_memory' + label 'low_cpu' + label 'tbprofiler' + + input: + val(sample_name) + path(minos_vcf) + path(reference) + val(isSampleTB) + + output: + path("${sample_name}_allelic_depth.minos.vcf") + + when: + isSampleTB =~ /CREATE\_ANTIBIOGRAM\_${sample_name}/ + + script: + """ + samtools faidx $reference + samtools dict $reference -o ${reference.baseName}.dict + gatk VariantAnnotator -R $reference -V $minos_vcf -A DepthPerAlleleBySample -O ${sample_name}_allelic_depth.minos.vcf + """ + +} + +process gnomonicus { + + tag {sample_name} + label 'vcfpredict' + label 'low_memory' + label 'low_cpu' + + errorStrategy 'ignore' + + publishDir "${params.output_dir}/${sample_name}/antibiogram", mode: 'copy', pattern: '*.gnomonicus-out.json', overwrite: 'true' + publishDir "${params.output_dir}/${sample_name}/antibiogram", mode: 'copy', pattern: '*.csv', overwrite: 'true' + publishDir "${params.output_dir}/${sample_name}/antibiogram", mode: 'copy', pattern: '*.fasta', overwrite: 'true' + publishDir "${params.output_dir}/$sample_name", mode: 'copy', overwrite: 'true', pattern: '*{_err.json,_report.json}' + + input: + tuple val(sample_name), path(vcf), val(isSampleTB), path(report_json) + path(genbank) + when: + isSampleTB =~ /CREATE\_ANTIBIOGRAM\_${sample_name}/ + + output: + tuple val(sample_name), path("${sample_name}.gnomonicus-out.json"), path("${sample_name}_report.json"), emit: gnomon_json + tuple val(sample_name), path("${sample_name}.effects.csv"), path("${sample_name}.mutations.csv"), emit: gnomon_csv optional true + tuple val(sample_name), path("*-fixed.fasta"), emit: gnomon_fasta + path("${sample_name}_err.json", emit: gnomon_log) + path ("${sample_name}_report.json", emit: gnomon_report) + + script: + minos_vcf = 
"${sample_name}.minos.vcf" + error_log = "${sample_name}_err.json" + + """ + gnomonicus --genome_object ${genbank} --catalogue ${params.amr_cat} --vcf_file ${minos_vcf} --output_dir . --json --fasta fixed + + cp ${sample_name}_report.json ${sample_name}_report_previous.json + + echo '{"complete":"workflow complete without error"}' | jq '.' > ${error_log} + + jq -s ".[0] * .[1] * .[2]" ${error_log} ${sample_name}_report_previous.json ${sample_name}.gnomonicus-out.json > ${report_json} + """ + + stub: + gnomonicus_json = "${sample_name}.gnomonicus-out.json" + gnomonicus_fasta = "${sample_name}-fixed.fasta" + gnomonicus_effects = "${sample_name}.effects.csv" + gnomonicus_mutations = "${sample_name}.mutations.csv" + error_log = "${sample_name}_err.json" + + """ + touch ${gnomonicus_json} + touch ${gnomonicus_fasta} + touch ${gnomonicus_effects} + touch ${gnomonicus_mutations} + touch ${error_log} + """ +} + +process finalJson { + + tag {sample_name} + label 'vcfpredict' + label 'low_memory' + label 'low_cpu' + + errorStrategy 'ignore' + + publishDir "${params.output_dir}/$sample_name", mode: 'copy', overwrite: 'true', pattern: '*_report.json' + + input: + tuple val(sample_name), path(vcfmix_json), path(gnomon_json), path(report_json) + + output: + tuple val(sample_name), path("${sample_name}_report.json"), emit: final_json + + script: + """ + cp ${sample_name}_report.json ${sample_name}_report_previous.json + + jq -s ".[0] * .[1]" ${sample_name}_report_previous.json ${vcfmix_json} > ${report_json} + """ + + stub: + report_json = "${sample_name}_report.json" + + """ + touch ${report_json} + """ + +} diff --git a/modules/vcfpredictModules.nf b/modules/vcfpredictModules.nf index 551788d..4bc7957 100644 --- a/modules/vcfpredictModules.nf +++ b/modules/vcfpredictModules.nf @@ -119,6 +119,7 @@ process add_allelic_depth { script: """ samtools faidx $reference + samtools dict $reference -o ${reference.baseName}.dict gatk VariantAnnotator -R $reference -V $minos_vcf -A DepthPerAlleleBySample -O ${sample_name}_allelic_depth.minos.vcf """ diff --git a/workflows/.ipynb_checkpoints/clockwork-checkpoint.nf b/workflows/.ipynb_checkpoints/clockwork-checkpoint.nf new file mode 100644 index 0000000..bf6f5e1 --- /dev/null +++ b/workflows/.ipynb_checkpoints/clockwork-checkpoint.nf @@ -0,0 +1,44 @@ +// enable dsl2 +nextflow.enable.dsl = 2 + +// import modules +include {alignToRef} from '../modules/clockworkModules.nf' params(params) +include {callVarsMpileup} from '../modules/clockworkModules.nf' params(params) +include {callVarsCortex} from '../modules/clockworkModules.nf' params(params) +include {minos} from '../modules/clockworkModules.nf' params(params) +include {gvcf} from '../modules/clockworkModules.nf' params(params) +include {getRefFromJSON} from '../modules/clockworkModules.nf' params(params) +include {getRefCortex} from '../modules/clockworkModules.nf' params(params) + +// define workflow component +workflow clockwork { + + take: + input_seqs_json + + main: + //get just the json + json = input_seqs_json.map{it[4]} + do_we_align = input_seqs_json.map{it[5]} + sample_name = input_seqs_json.map{it[0]} + + getRefFromJSON(json, do_we_align, sample_name) + alignToRef(input_seqs_json, getRefFromJSON.out) + + + callVarsMpileup(alignToRef.out.alignToRef_bam) + + getRefCortex(alignToRef.out.alignToRef_bam) + callVarsCortex(alignToRef.out.alignToRef_bam, getRefCortex.out) + + minos(alignToRef.out.alignToRef_bam.join(callVarsCortex.out.cortex_vcf, by: 0).join(callVarsMpileup.out.mpileup_vcf, by: 0)) + + 
gvcf(alignToRef.out.alignToRef_bam.join(minos.out.minos_vcf, by: 0)) + + emit: + + mpileup_vcf = callVarsMpileup.out.mpileup_vcf.join(minos.out.minos_report, by: 0) + minos_vcf = minos.out.minos_vcf.join(alignToRef.out.alignToRef_report, by: 0) + reference = getRefFromJSON.out + +} diff --git a/workflows/.ipynb_checkpoints/vcfpredict-checkpoint.nf b/workflows/.ipynb_checkpoints/vcfpredict-checkpoint.nf new file mode 100644 index 0000000..375e410 --- /dev/null +++ b/workflows/.ipynb_checkpoints/vcfpredict-checkpoint.nf @@ -0,0 +1,47 @@ +// enable dsl2 +nextflow.enable.dsl = 2 + +// import modules +include {vcfmix} from '../modules/vcfpredictModules.nf' params(params) +include {tbprofiler} from '../modules/vcfpredictModules.nf' params(params) +include {tbprofiler_update_db} from '../modules/vcfpredictModules.nf' params(params) +include {add_allelic_depth} from '../modules/vcfpredictModules.nf' params(params) +include {finalJson} from '../modules/vcfpredictModules.nf' params(params) + +// define workflow component +workflow vcfpredict { + + take: + clockwork_bcftools_tuple + minos_vcf_tuple + reference_fasta + + + main: + + if ( params.vcfmix == "yes" ) { + + vcfmix(clockwork_bcftools_tuple) + + } + + if ( params.resistance_profiler == "tb-profiler"){ + //get just the vcf + sample_name = minos_vcf_tuple.map{it[0]} + minos_vcf = minos_vcf_tuple.map{it[1]} + do_we_resistance_profile = minos_vcf_tuple.map{it[2]} + report_json = minos_vcf_tuple.map{it[3]} + + if (params.update_tbprofiler == "yes"){ + tbprofiler_update_db(reference_fasta) + } + + //add allelic depth back in: was calculated in mpileup but lost in minos + add_allelic_depth(sample_name, minos_vcf, reference_fasta, do_we_resistance_profile) + tbprofiler(sample_name, add_allelic_depth.out, report_json, do_we_resistance_profile) + } + + if (params.vcfmix == "yes" && params.resistance_profiler != "none"){ + finalJson(vcfmix.out.vcfmix_json.join(tbprofiler.out.tbprofiler_json, by: 0)) + } +} From f3345796f7d16278f02d241b9e262069fc643d5c Mon Sep 17 00:00:00 2001 From: whalleyt Date: Fri, 12 Jan 2024 11:40:23 +0000 Subject: [PATCH 31/44] rm checkpoints --- .../clockworkModules-checkpoint.nf | 313 ------------------ .../vcfpredictModules-checkpoint.nf | 216 ------------ .../clockwork-checkpoint.nf | 44 --- .../vcfpredict-checkpoint.nf | 47 --- 4 files changed, 620 deletions(-) delete mode 100644 modules/.ipynb_checkpoints/clockworkModules-checkpoint.nf delete mode 100644 modules/.ipynb_checkpoints/vcfpredictModules-checkpoint.nf delete mode 100644 workflows/.ipynb_checkpoints/clockwork-checkpoint.nf delete mode 100644 workflows/.ipynb_checkpoints/vcfpredict-checkpoint.nf diff --git a/modules/.ipynb_checkpoints/clockworkModules-checkpoint.nf b/modules/.ipynb_checkpoints/clockworkModules-checkpoint.nf deleted file mode 100644 index d793a34..0000000 --- a/modules/.ipynb_checkpoints/clockworkModules-checkpoint.nf +++ /dev/null @@ -1,313 +0,0 @@ -// modules for the clockwork workflow - -process getRefFromJSON { - tag { sample_name } - label 'clockwork' - label 'low_memory' - label 'low_cpu' - - input: - path(species_json) - val(do_we_align) - val(sample_name) - - when: - do_we_align =~ /NOW\_ALIGN\_TO\_REF\_${sample_name}/ - - output: - stdout - - script: - """ - ref_string=\$(jq -r '.top_hit.file_paths.ref_fa' ${species_json}) - echo "\$ref_string" - """ - - -} - -process alignToRef { - /** - * @QCcheckpoint fail if insufficient number and/or quality of read alignments to the reference genome - */ - - tag { sample_name } - label 'clockwork' - 
label 'normal_cpu' - label 'medium_memory' - - publishDir "${params.output_dir}/$sample_name/output_bam", mode: 'copy', overwrite: 'true', pattern: '*{.bam,.bam.bai,_alignmentStats.json}' - publishDir "${params.output_dir}/$sample_name", mode: 'copy', overwrite: 'true', pattern: '*{_err.json,_report.json}' - - input: - tuple val(sample_name), path(fq1), path(fq2), path(software_json), path(species_json), val(doWeAlign) - path(reference_path) - - when: - doWeAlign =~ /NOW\_ALIGN\_TO\_REF\_${sample_name}/ - - output: - tuple val(sample_name), path("${sample_name}_report.json"), path("${sample_name}.bam"), path("${sample_name}.fa"), stdout, emit: alignToRef_bam - path("${sample_name}.bam.bai", emit: alignToRef_bai) - path("${sample_name}_alignmentStats.json", emit: alignToRef_json) - path "${sample_name}_err.json", emit: alignToRef_log optional true - tuple val(sample_name), path("${sample_name}_report.json"), emit: alignToRef_report - - script: - bam = "${sample_name}.bam" - bai = "${sample_name}.bam.bai" - stats = "${sample_name}.stats" - stats_json = "${sample_name}_alignmentStats.json" - report_json = "${sample_name}_report.json" - error_log = "${sample_name}_err.json" - - """ - echo $reference_path - cp ${reference_path} ${sample_name}.fa - - minimap2 -ax sr ${sample_name}.fa -t ${task.cpus} $fq1 $fq2 | samtools fixmate -m - - | samtools sort -T tmp - | samtools markdup --reference ${sample_name}.fa - minimap.bam - - java -jar /usr/local/bin/picard.jar AddOrReplaceReadGroups INPUT=minimap.bam OUTPUT=${bam} RGID=${sample_name} RGLB=lib RGPL=Illumina RGPU=unit RGSM=sample - - samtools index ${bam} ${bai} - samtools stats ${bam} > ${stats} - - parse_samtools_stats.py ${bam} ${stats} > ${stats_json} - create_final_json.py ${stats_json} ${species_json} - - cp ${sample_name}_report.json ${sample_name}_report_previous.json - - jq -s ".[0] * .[1]" ${software_json} ${sample_name}_report_previous.json > ${report_json} - - continue=\$(jq -r '.summary_questions.continue_to_clockwork' ${report_json}) - - if [ \$continue == 'yes' ]; then printf "NOW_VARCALL_${sample_name}"; elif [ \$continue == 'no' ]; then echo '{"error":"insufficient number and/or quality of read alignments to the reference genome"}' | jq '.' 
> ${error_log} && jq -s ".[0] * .[1]" ${error_log} ${sample_name}_report_previous.json > ${report_json}; fi - """ - - stub: - bam = "${sample_name}.bam" - bai = "${sample_name}.bam.bai" - stats = "${sample_name}.stats" - stats_json = "${sample_name}_alignmentStats.json" - out_json = "${sample_name}_report.json" - error_log = "${sample_name}_err.json" - - """ - touch ${sample_name}.fa - touch ${bam} - touch ${bai} - touch ${stats} - touch ${stats_json} - touch ${out_json} - touch ${error_log} - printf ${params.alignToRef_doWeVarCall} - """ -} - -process callVarsMpileup { - /** - * @QCcheckpoint none - */ - - tag { sample_name } - label 'clockwork' - label 'normal_cpu' - label 'low_memory' - - publishDir "${params.output_dir}/$sample_name/output_vcfs", mode: 'copy', pattern: '*.vcf' - - input: - tuple val(sample_name), path(report_json), path(bam), path(ref), val(doWeVarCall) - - when: - doWeVarCall =~ /NOW\_VARCALL\_${sample_name}/ - - output: - tuple val(sample_name), path("${sample_name}.bcftools.vcf"), emit: mpileup_vcf - - script: - bcftools_vcf = "${sample_name}.bcftools.vcf" - - """ - bcftools mpileup -Ou -a 'INFO/AD' -f ${ref} ${bam} | bcftools call --threads ${task.cpus} -vm -O v -o ${bcftools_vcf} - """ - - stub: - bcftools_vcf = "${sample_name}.bcftools.vcf" - - """ - touch ${bcftools_vcf} - """ -} - -process getRefCortex { - tag { sample_name } - label 'clockwork' - label 'low_memory' - label 'low_cpu' - - input: - tuple val(sample_name), path(report_json), path(bam), path(ref), val(doWeVarCall) - - when: - doWeVarCall =~ /NOW\_VARCALL\_${sample_name}/ - - output: - stdout - - script: - """ - ref_dir=\$(jq -r '.top_hit.file_paths.clockwork_ref_dir' ${report_json}) - echo "\$ref_dir" - """ - - -} - -process callVarsCortex { - /** - * @QCcheckpoint none - */ - - tag { sample_name } - label 'clockwork' - label 'normal_cpu' - label 'medium_memory' - - publishDir "${params.output_dir}/$sample_name/output_vcfs", mode: 'copy', pattern: '*.vcf' - - input: - tuple val(sample_name), path(report_json), path(bam), path(ref), val(doWeVarCall) - path(ref_dir) - - when: - doWeVarCall =~ /NOW\_VARCALL\_${sample_name}/ - - output: - tuple val(sample_name), path("${sample_name}.cortex.vcf"), emit: cortex_vcf - - script: - cortex_vcf = "${sample_name}.cortex.vcf" - - """ - cp -r ${ref_dir}/* . - - clockwork cortex . 
${bam} cortex ${sample_name} - cp cortex/cortex.out/vcfs/cortex_wk_flow_I_RefCC_FINALcombined_BC_calls_at_all_k.raw.vcf ${cortex_vcf} - """ - - stub: - cortex_vcf = "${sample_name}.cortex.vcf" - - """ - touch ${cortex_vcf} - """ -} - -process minos { - /** - * @QCcheckpoint check if top species is TB, if yes pass vcf to resistance profiling - */ - - tag { sample_name } - label 'clockwork' - label 'medium_memory' - label 'normal_cpu' - - publishDir "${params.output_dir}/$sample_name/output_vcfs", mode: 'copy', pattern: '*.vcf' - publishDir "${params.output_dir}/$sample_name", mode: 'copy', overwrite: 'true', pattern: '*{_err.json,_report.json}' - - input: - tuple val(sample_name), path(report_json), path(bam), path(ref), val(doWeVarCall), path(cortex_vcf), path(bcftools_vcf) - - output: - tuple val(sample_name), path(report_json), path(bam), path(ref), emit: minos_bam - tuple val(sample_name), path("${sample_name}.minos.vcf"), stdout, emit: minos_vcf - tuple val(sample_name), path("${sample_name}_report.json"), emit: minos_report - path "${sample_name}_err.json", emit: minos_log optional true - - script: - minos_vcf = "${sample_name}.minos.vcf" - error_log = "${sample_name}_err.json" - - """ - awk '{print \$1}' ${ref} > ref.fa - - minos adjudicate --force --reads ${bam} minos ref.fa ${bcftools_vcf} ${cortex_vcf} - cp minos/final.vcf ${minos_vcf} - rm -rf minos - - top_hit=\$(jq -r '.top_hit.name' ${report_json}) - - cp ${sample_name}_report.json ${sample_name}_report_previous.json - - if [[ \$top_hit =~ ^"Mycobacterium tuberculosis" ]]; then printf "CREATE_ANTIBIOGRAM_${sample_name}"; else echo '{"resistance-profiling-warning":"sample is not TB so cannot produce antibiogram using resistance profiling tools"}' | jq '.' > ${error_log} && printf "no" && jq -s ".[0] * .[1]" ${error_log} ${sample_name}_report_previous.json > ${report_json}; fi - """ - - stub: - minos_vcf = "${sample_name}.minos.vcf" - error_log = "${sample_name}_err.json" - - """ - touch ${minos_vcf} - touch ${error_log} - printf ${params.minos_isSampleTB} - """ -} - -process gvcf { - /** - * @QCcheckpoint none - */ - - tag { sample_name } - label 'clockwork' - label 'normal_cpu' - label 'low_memory' - - publishDir "${params.output_dir}/$sample_name/output_fasta", mode: 'copy', pattern: '*.fa' - publishDir "${params.output_dir}/$sample_name/output_vcfs", mode: 'copy', pattern: '*.vcf.gz' - publishDir "${params.output_dir}/$sample_name", mode: 'copy', overwrite: 'true', pattern: '*{_err.json,_report.json}' - - input: - tuple val(sample_name), path(report_json), path(bam), path(ref), val(doWeValCall), path(minos_vcf), val(isSampleTB) - - output: - path("${sample_name}.gvcf.vcf.gz", emit: gvcf) - path("${sample_name}.fa", emit: gvcf_fa) - path "${sample_name}_err.json", emit: gvcf_log optional true - path "${sample_name}_report.json", emit: gvcf_report optional true - - script: - gvcf = "${sample_name}.gvcf.vcf" - gvcf_fa = "${sample_name}.fa" - error_log = "${sample_name}_err.json" - - """ - awk '{print \$1}' ${ref} > ref.fa - - samtools mpileup -ugf ref.fa ${bam} | bcftools call --threads ${task.cpus} -m -O v -o samtools_all_pos.vcf - - clockwork gvcf_from_minos_and_samtools ref.fa ${minos_vcf} samtools_all_pos.vcf ${gvcf} - clockwork gvcf_to_fasta ${gvcf} ${gvcf_fa} - - rm samtools_all_pos.vcf - gzip ${gvcf} - - cp ${sample_name}_report.json ${sample_name}_report_previous.json - - if [ ${params.vcfmix} == "no" ] && [ ${params.resistance_profiler} == "none" ]; then echo '{"complete":"workflow complete without error"}' | jq '.' 
> ${error_log} && jq -s ".[0] * .[1]" ${error_log} ${sample_name}_report_previous.json > ${report_json}; fi - """ - - stub: - gvcf = "${sample_name}.gvcf.vcf.gz" - gvcf_fa = "${sample_name}.fa" - error_log = "${sample_name}_err.json" - - """ - touch ${gvcf} - touch ${gvcf_fa} - touch ${error_log} - """ -} - diff --git a/modules/.ipynb_checkpoints/vcfpredictModules-checkpoint.nf b/modules/.ipynb_checkpoints/vcfpredictModules-checkpoint.nf deleted file mode 100644 index 4bc7957..0000000 --- a/modules/.ipynb_checkpoints/vcfpredictModules-checkpoint.nf +++ /dev/null @@ -1,216 +0,0 @@ -// modules for the vcfpredict workflow - -process vcfmix { - - tag {sample_name} - label 'vcfpredict' - label 'low_memory' - label 'low_cpu' - - errorStrategy 'ignore' - - publishDir "${params.output_dir}/${sample_name}/output_vcfs", mode: 'copy', pattern: '*_f-stats.json', overwrite: 'true' - publishDir "${params.output_dir}/${sample_name}/output_vcfs", mode: 'copy', pattern: '*.csv', overwrite: 'true' - publishDir "${params.output_dir}/$sample_name", mode: 'copy', overwrite: 'true', pattern: '*{_err.json,_report.json}' - - input: - tuple val(sample_name), path(vcf), path(report_json) - - output: - tuple val(sample_name), path("${sample_name}_f-stats.json"), emit: vcfmix_json - tuple val(sample_name), path("${sample_name}_f-stats.json"), path("${sample_name}_vcfmix-regions.csv"), emit: vcfmix_json_csv - path "${sample_name}_err.json", emit: vcfmix_log optional true - path ("${sample_name}_report.json", emit: vcfmix_report) - - script: - bcftools_vcf = "${sample_name}.bcftools.vcf" - error_log = "${sample_name}_err.json" - - """ - run-vcfmix.py ${bcftools_vcf} - - cp ${sample_name}_report.json ${sample_name}_report_previous.json - - jq -s ".[0] * .[1]" ${sample_name}_report_previous.json ${sample_name}_f-stats.json > ${report_json} - - if [ ${params.resistance_profiler} == "none" ]; then echo '{"complete":"workflow complete without error"}' | jq '.' 
> ${error_log} && jq -s ".[0] * .[1] * .[2]" ${error_log} ${sample_name}_report_previous.json ${sample_name}_f-stats.json > ${report_json}; fi - """ - - stub: - vcfmix_json = "${sample_name}_f-stats.json" - vcfmix_csv = "${sample_name}_vcfmix-regions.csv" - error_log = "${sample_name}_err.json" - - """ - touch ${vcfmix_json} - touch ${vcfmix_csv} - touch ${error_log} - """ -} - -process tbprofiler_update_db { - label 'low_memory' - label 'low_cpu' - label 'tbprofiler' - - input: - path(reference) - - script: - """ - tb-profiler update_tbdb --match_ref $reference - """ -} - -process tbprofiler { - label 'medium_memory' - label 'medium_cpu' - label 'tbprofiler' - - publishDir "${params.output_dir}/${sample_name}/antibiogram", mode: 'copy', pattern: '*.tbprofiler-out.json', overwrite: 'true' - publishDir "${params.output_dir}/$sample_name", mode: 'copy', overwrite: 'true', pattern: '*{_err.json,_report.json}' - - input: - val(sample_name) - path(minos_vcf) - path(report_json) - val(isSampleTB) - - output: - tuple val(sample_name), path("${sample_name}.tbprofiler-out.json"), path("${sample_name}_report.json"), emit: tbprofiler_json - - when: - isSampleTB =~ /CREATE\_ANTIBIOGRAM\_${sample_name}/ - - script: - error_log = "${sample_name}_err.json" - tbprofiler_json = "${sample_name}.tbprofiler-out.json" - - """ - bgzip ${minos_vcf} - tb-profiler profile --vcf ${minos_vcf}.gz --threads ${task.cpus} - mv results/tbprofiler.results.json ${tbprofiler_json} - - cp ${sample_name}_report.json ${sample_name}_report_previous.json - - echo '{"complete":"workflow complete without error"}' | jq '.' > ${error_log} - - jq -s ".[0] * .[1] * .[2]" ${error_log} ${sample_name}_report_previous.json ${tbprofiler_json} > ${report_json} - """ -} - -process add_allelic_depth { - label 'low_memory' - label 'low_cpu' - label 'tbprofiler' - - input: - val(sample_name) - path(minos_vcf) - path(reference) - val(isSampleTB) - - output: - path("${sample_name}_allelic_depth.minos.vcf") - - when: - isSampleTB =~ /CREATE\_ANTIBIOGRAM\_${sample_name}/ - - script: - """ - samtools faidx $reference - samtools dict $reference -o ${reference.baseName}.dict - gatk VariantAnnotator -R $reference -V $minos_vcf -A DepthPerAlleleBySample -O ${sample_name}_allelic_depth.minos.vcf - """ - -} - -process gnomonicus { - - tag {sample_name} - label 'vcfpredict' - label 'low_memory' - label 'low_cpu' - - errorStrategy 'ignore' - - publishDir "${params.output_dir}/${sample_name}/antibiogram", mode: 'copy', pattern: '*.gnomonicus-out.json', overwrite: 'true' - publishDir "${params.output_dir}/${sample_name}/antibiogram", mode: 'copy', pattern: '*.csv', overwrite: 'true' - publishDir "${params.output_dir}/${sample_name}/antibiogram", mode: 'copy', pattern: '*.fasta', overwrite: 'true' - publishDir "${params.output_dir}/$sample_name", mode: 'copy', overwrite: 'true', pattern: '*{_err.json,_report.json}' - - input: - tuple val(sample_name), path(vcf), val(isSampleTB), path(report_json) - path(genbank) - when: - isSampleTB =~ /CREATE\_ANTIBIOGRAM\_${sample_name}/ - - output: - tuple val(sample_name), path("${sample_name}.gnomonicus-out.json"), path("${sample_name}_report.json"), emit: gnomon_json - tuple val(sample_name), path("${sample_name}.effects.csv"), path("${sample_name}.mutations.csv"), emit: gnomon_csv optional true - tuple val(sample_name), path("*-fixed.fasta"), emit: gnomon_fasta - path("${sample_name}_err.json", emit: gnomon_log) - path ("${sample_name}_report.json", emit: gnomon_report) - - script: - minos_vcf = 
"${sample_name}.minos.vcf" - error_log = "${sample_name}_err.json" - - """ - gnomonicus --genome_object ${genbank} --catalogue ${params.amr_cat} --vcf_file ${minos_vcf} --output_dir . --json --fasta fixed - - cp ${sample_name}_report.json ${sample_name}_report_previous.json - - echo '{"complete":"workflow complete without error"}' | jq '.' > ${error_log} - - jq -s ".[0] * .[1] * .[2]" ${error_log} ${sample_name}_report_previous.json ${sample_name}.gnomonicus-out.json > ${report_json} - """ - - stub: - gnomonicus_json = "${sample_name}.gnomonicus-out.json" - gnomonicus_fasta = "${sample_name}-fixed.fasta" - gnomonicus_effects = "${sample_name}.effects.csv" - gnomonicus_mutations = "${sample_name}.mutations.csv" - error_log = "${sample_name}_err.json" - - """ - touch ${gnomonicus_json} - touch ${gnomonicus_fasta} - touch ${gnomonicus_effects} - touch ${gnomonicus_mutations} - touch ${error_log} - """ -} - -process finalJson { - - tag {sample_name} - label 'vcfpredict' - label 'low_memory' - label 'low_cpu' - - errorStrategy 'ignore' - - publishDir "${params.output_dir}/$sample_name", mode: 'copy', overwrite: 'true', pattern: '*_report.json' - - input: - tuple val(sample_name), path(vcfmix_json), path(gnomon_json), path(report_json) - - output: - tuple val(sample_name), path("${sample_name}_report.json"), emit: final_json - - script: - """ - cp ${sample_name}_report.json ${sample_name}_report_previous.json - - jq -s ".[0] * .[1]" ${sample_name}_report_previous.json ${vcfmix_json} > ${report_json} - """ - - stub: - report_json = "${sample_name}_report.json" - - """ - touch ${report_json} - """ - -} diff --git a/workflows/.ipynb_checkpoints/clockwork-checkpoint.nf b/workflows/.ipynb_checkpoints/clockwork-checkpoint.nf deleted file mode 100644 index bf6f5e1..0000000 --- a/workflows/.ipynb_checkpoints/clockwork-checkpoint.nf +++ /dev/null @@ -1,44 +0,0 @@ -// enable dsl2 -nextflow.enable.dsl = 2 - -// import modules -include {alignToRef} from '../modules/clockworkModules.nf' params(params) -include {callVarsMpileup} from '../modules/clockworkModules.nf' params(params) -include {callVarsCortex} from '../modules/clockworkModules.nf' params(params) -include {minos} from '../modules/clockworkModules.nf' params(params) -include {gvcf} from '../modules/clockworkModules.nf' params(params) -include {getRefFromJSON} from '../modules/clockworkModules.nf' params(params) -include {getRefCortex} from '../modules/clockworkModules.nf' params(params) - -// define workflow component -workflow clockwork { - - take: - input_seqs_json - - main: - //get just the json - json = input_seqs_json.map{it[4]} - do_we_align = input_seqs_json.map{it[5]} - sample_name = input_seqs_json.map{it[0]} - - getRefFromJSON(json, do_we_align, sample_name) - alignToRef(input_seqs_json, getRefFromJSON.out) - - - callVarsMpileup(alignToRef.out.alignToRef_bam) - - getRefCortex(alignToRef.out.alignToRef_bam) - callVarsCortex(alignToRef.out.alignToRef_bam, getRefCortex.out) - - minos(alignToRef.out.alignToRef_bam.join(callVarsCortex.out.cortex_vcf, by: 0).join(callVarsMpileup.out.mpileup_vcf, by: 0)) - - gvcf(alignToRef.out.alignToRef_bam.join(minos.out.minos_vcf, by: 0)) - - emit: - - mpileup_vcf = callVarsMpileup.out.mpileup_vcf.join(minos.out.minos_report, by: 0) - minos_vcf = minos.out.minos_vcf.join(alignToRef.out.alignToRef_report, by: 0) - reference = getRefFromJSON.out - -} diff --git a/workflows/.ipynb_checkpoints/vcfpredict-checkpoint.nf b/workflows/.ipynb_checkpoints/vcfpredict-checkpoint.nf deleted file mode 100644 index 
375e410..0000000 --- a/workflows/.ipynb_checkpoints/vcfpredict-checkpoint.nf +++ /dev/null @@ -1,47 +0,0 @@ -// enable dsl2 -nextflow.enable.dsl = 2 - -// import modules -include {vcfmix} from '../modules/vcfpredictModules.nf' params(params) -include {tbprofiler} from '../modules/vcfpredictModules.nf' params(params) -include {tbprofiler_update_db} from '../modules/vcfpredictModules.nf' params(params) -include {add_allelic_depth} from '../modules/vcfpredictModules.nf' params(params) -include {finalJson} from '../modules/vcfpredictModules.nf' params(params) - -// define workflow component -workflow vcfpredict { - - take: - clockwork_bcftools_tuple - minos_vcf_tuple - reference_fasta - - - main: - - if ( params.vcfmix == "yes" ) { - - vcfmix(clockwork_bcftools_tuple) - - } - - if ( params.resistance_profiler == "tb-profiler"){ - //get just the vcf - sample_name = minos_vcf_tuple.map{it[0]} - minos_vcf = minos_vcf_tuple.map{it[1]} - do_we_resistance_profile = minos_vcf_tuple.map{it[2]} - report_json = minos_vcf_tuple.map{it[3]} - - if (params.update_tbprofiler == "yes"){ - tbprofiler_update_db(reference_fasta) - } - - //add allelic depth back in: was calculated in mpileup but lost in minos - add_allelic_depth(sample_name, minos_vcf, reference_fasta, do_we_resistance_profile) - tbprofiler(sample_name, add_allelic_depth.out, report_json, do_we_resistance_profile) - } - - if (params.vcfmix == "yes" && params.resistance_profiler != "none"){ - finalJson(vcfmix.out.vcfmix_json.join(tbprofiler.out.tbprofiler_json, by: 0)) - } -} From f331b89a4347ef2c7b9ef97cd8ab499f524a7ae1 Mon Sep 17 00:00:00 2001 From: whalleyt Date: Wed, 17 Jan 2024 10:20:15 +0000 Subject: [PATCH 32/44] remove print --- workflows/preprocessing.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/preprocessing.nf b/workflows/preprocessing.nf index 5097dad..fbb19aa 100644 --- a/workflows/preprocessing.nf +++ b/workflows/preprocessing.nf @@ -66,7 +66,6 @@ workflow preprocessing { bowtie2(kraken2.out.kraken2_fqs, bowtie_dir.toList()) identifyBacterialContaminants(bowtie2.out.bowtie2_fqs.join(speciation_report, by: 0).join(kraken2.out.kraken2_json, by: 0), resource_dir, refseq_path) - identifyBacterialContaminants.out.prev_sample_json.view() downloadContamGenomes(identifyBacterialContaminants.out.contam_list) From fdc78927c2a9aa5b2157465852e0ce7d2383144a Mon Sep 17 00:00:00 2001 From: whalleyt Date: Thu, 18 Jan 2024 12:47:36 +0000 Subject: [PATCH 33/44] minimap on original reference, not copy --- modules/clockworkModules.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/clockworkModules.nf b/modules/clockworkModules.nf index d793a34..f45ac62 100644 --- a/modules/clockworkModules.nf +++ b/modules/clockworkModules.nf @@ -47,7 +47,7 @@ process alignToRef { doWeAlign =~ /NOW\_ALIGN\_TO\_REF\_${sample_name}/ output: - tuple val(sample_name), path("${sample_name}_report.json"), path("${sample_name}.bam"), path("${sample_name}.fa"), stdout, emit: alignToRef_bam + tuple val(sample_name), path("${sample_name}_report.json"), path("${sample_name}.bam"), path(reference_path), stdout, emit: alignToRef_bam path("${sample_name}.bam.bai", emit: alignToRef_bai) path("${sample_name}_alignmentStats.json", emit: alignToRef_json) path "${sample_name}_err.json", emit: alignToRef_log optional true @@ -63,9 +63,9 @@ process alignToRef { """ echo $reference_path - cp ${reference_path} ${sample_name}.fa + cp $reference_path ${sample_name}.fa - minimap2 -ax sr ${sample_name}.fa -t ${task.cpus} $fq1 $fq2 | samtools 
From fdc78927a2a9aa5b2157465852e0ce7d2383144a Mon Sep 17 00:00:00 2001
From: whalleyt
Date: Thu, 18 Jan 2024 12:47:36 +0000
Subject: [PATCH 33/44] minimap on original reference, not copy

---
 modules/clockworkModules.nf | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/clockworkModules.nf b/modules/clockworkModules.nf
index d793a34..f45ac62 100644
--- a/modules/clockworkModules.nf
+++ b/modules/clockworkModules.nf
@@ -47,7 +47,7 @@ process alignToRef {
     doWeAlign =~ /NOW\_ALIGN\_TO\_REF\_${sample_name}/
 
     output:
-    tuple val(sample_name), path("${sample_name}_report.json"), path("${sample_name}.bam"), path("${sample_name}.fa"), stdout, emit: alignToRef_bam
+    tuple val(sample_name), path("${sample_name}_report.json"), path("${sample_name}.bam"), path(reference_path), stdout, emit: alignToRef_bam
     path("${sample_name}.bam.bai", emit: alignToRef_bai)
     path("${sample_name}_alignmentStats.json", emit: alignToRef_json)
     path "${sample_name}_err.json", emit: alignToRef_log optional true
@@ -63,9 +63,9 @@ process alignToRef {
     """
     echo $reference_path
-    cp ${reference_path} ${sample_name}.fa
+    cp $reference_path ${sample_name}.fa
 
-    minimap2 -ax sr ${sample_name}.fa -t ${task.cpus} $fq1 $fq2 | samtools fixmate -m - - | samtools sort -T tmp - | samtools markdup --reference ${sample_name}.fa - minimap.bam
+    minimap2 -ax sr $reference_path -t ${task.cpus} $fq1 $fq2 | samtools fixmate -m - - | samtools sort -T tmp - | samtools markdup --reference $reference_path - minimap.bam
 
     java -jar /usr/local/bin/picard.jar AddOrReplaceReadGroups INPUT=minimap.bam OUTPUT=${bam} RGID=${sample_name} RGLB=lib RGPL=Illumina RGPU=unit RGSM=sample
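As a side note, the alignment this process produces can be sanity-checked with standard samtools commands; a small sketch, with a hypothetical output BAM name standing in for ${sample_name}.bam:

    # verify the BAM is intact and carries the @RG line injected by Picard AddOrReplaceReadGroups
    samtools quickcheck sample.bam && samtools view -H sample.bam | grep '^@RG'
    # confirm samtools markdup actually flagged duplicates
    samtools flagstat sample.bam | grep duplicate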
From 024137f58954abcd6b11ad1d450484d3293d76ea Mon Sep 17 00:00:00 2001
From: whalleyt
Date: Wed, 24 Jan 2024 10:16:37 +0000
Subject: [PATCH 34/44] tidy up

---
 .../containers-checkpoint.config              | 49 ------------
 config/containers.config                      |  1 -
 ...work-0.9.7 => Singularity.clockwork-0.9.8} | 10 ++-
 ...-0.9.7 => Singularity.preprocessing-0.9.8} | 14 ++--
 singularity/Singularity.tbprofiler-0.9.8      | 70 +++++++++++++++++++
 ...ict-0.9.7 => Singularity.vcfpredict-0.9.8} | 37 ++--------
 6 files changed, 94 insertions(+), 87 deletions(-)
 delete mode 100644 config/.ipynb_checkpoints/containers-checkpoint.config
 rename singularity/{Singularity.clockwork-0.9.7 => Singularity.clockwork-0.9.8} (97%)
 rename singularity/{Singularity.preprocessing-0.9.7 => Singularity.preprocessing-0.9.8} (95%)
 create mode 100644 singularity/Singularity.tbprofiler-0.9.8
 rename singularity/{Singularity.vcfpredict-0.9.7 => Singularity.vcfpredict-0.9.8} (51%)

diff --git a/config/.ipynb_checkpoints/containers-checkpoint.config b/config/.ipynb_checkpoints/containers-checkpoint.config
deleted file mode 100644
index dece260..0000000
--- a/config/.ipynb_checkpoints/containers-checkpoint.config
+++ /dev/null
@@ -1,49 +0,0 @@
-params{
-    container_enabled = "true"
-    container_enabled = "true"
-    resource_dir = "/resources"
-}
-
-
-process {
-    update_tbprofiler = "false"
-
-
-    withLabel:low_cpu {cpus = 2}
-    withLabel:normal_cpu { cpus = 8 }
-    withLabel:low_memory { memory = '5GB' }
-    withLabel:medium_memory { memory = '10GB' }
-    withLabel:high_memory { memory = '18GB' }
-
-    withLabel:getversion {
-        container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.8"
-    }
-
-    withLabel:preprocessing {
-        container = "quay.io/pathogen-genomics-cymru/preprocessing:0.9.8"
-    }
-
-    withLabel:tbprofiler {
-        container = "quay.io/pathogen-genomics-cymru/tbprofiler:0.9.8"
-    }
-
-    withName:downloadContamGenomes {
-        shell = ['/bin/bash','-u']
-        errorStrategy = { task.exitStatus in 100..113 ? 'retry' : 'terminate' }
-        maxRetries = 5
-    }
-
-    withLabel:retryAfanc {
-        shell = ['/bin/bash','-u']
-        errorStrategy = {task.exitStatus == 1 ? 'retry' : 'ignore' }
-        maxRetries = 5
-    }
-
-    withLabel:clockwork {
-        container = "quay.io/pathogen-genomics-cymru/clockwork:0.9.8"
-    }
-
-    withLabel:vcfpredict {
-        container = "quay.io/pathogen-genomics-cymru/vcfpredict:0.9.8"
-    }
- }
\ No newline at end of file
diff --git a/config/containers.config b/config/containers.config
index dece260..e961b71 100644
--- a/config/containers.config
+++ b/config/containers.config
@@ -1,7 +1,6 @@
 params{
     container_enabled = "true"
     container_enabled = "true"
-    resource_dir = "/resources"
 }
diff --git a/singularity/Singularity.clockwork-0.9.7 b/singularity/Singularity.clockwork-0.9.8
similarity index 97%
rename from singularity/Singularity.clockwork-0.9.7
rename to singularity/Singularity.clockwork-0.9.8
index f3f3c24..0e13714 100644
--- a/singularity/Singularity.clockwork-0.9.7
+++ b/singularity/Singularity.clockwork-0.9.8
@@ -2,6 +2,8 @@ Bootstrap: docker
 From: debian:buster
 Stage: spython-base
 
+%files
+bin/ /opt/bin/
 %labels
 maintainer="pricea35@cardiff.ac.uk"
 about.summary="container for the clockwork workflow"
@@ -26,6 +28,9 @@ clockwork_version=2364dec4cbf25c844575e19e8fe0a319d10721b5
 PACKAGES="procps curl git build-essential wget zlib1g-dev pkg-config jq r-base-core rsync autoconf libncurses-dev libbz2-dev liblzma-dev libcurl4-openssl-dev cmake tabix libvcflib-tools libssl-dev software-properties-common perl locales locales-all"
 PYTHON="python2.7 python-dev"
 
+PATH=/opt/bin:$PATH
+
+
 apt-get update \
 && apt-get install -y $PACKAGES $PYTHON \
 && curl -fsSL https://www.python.org/ftp/python/${python_version}/Python-${python_version}.tgz | tar -xz \
@@ -36,7 +41,7 @@ apt-get update \
 && ln -s /usr/local/bin/python3.6 /usr/local/bin/python3 \
 && ln -s /usr/local/bin/pip3.6 /usr/local/bin/pip3 \
 && pip3 install --upgrade pip \
-&& pip3 install 'cluster_vcf_records==0.13.1' pysam setuptools \
+&& pip3 install 'cluster_vcf_records==0.13.1' pysam setuptools awscli \
 && wget -qO - https://adoptopenjdk.jfrog.io/adoptopenjdk/api/gpg/key/public | apt-key add - \
 && add-apt-repository --yes https://adoptopenjdk.jfrog.io/adoptopenjdk/deb/ \
 && apt-get update && apt-get install -y adoptopenjdk-8-hotspot
@@ -136,6 +141,7 @@ export python_version=3.6.5
 export clockwork_version=2364dec4cbf25c844575e19e8fe0a319d10721b5
 export PACKAGES="procps curl git build-essential wget zlib1g-dev pkg-config jq r-base-core rsync autoconf libncurses-dev libbz2-dev liblzma-dev libcurl4-openssl-dev cmake tabix libvcflib-tools libssl-dev software-properties-common perl locales locales-all"
 export PYTHON="python2.7 python-dev"
+export PATH=/opt/bin:$PATH
 export CLOCKWORK_CORTEX_DIR=/cortex
 export PATH=${PATH}:/clockwork/python/scripts
 export PICARD_JAR=/usr/local/bin/picard.jar
@@ -145,4 +151,4 @@ export LANGUAGE=en_US.UTF-8
 %runscript
 exec /bin/bash "$@"
 %startscript
-exec /bin/bash "$@"
\ No newline at end of file
+exec /bin/bash "$@"
diff --git a/singularity/Singularity.preprocessing-0.9.7 b/singularity/Singularity.preprocessing-0.9.8
similarity index 95%
rename from singularity/Singularity.preprocessing-0.9.7
rename to singularity/Singularity.preprocessing-0.9.8
index 7ca3b35..a164d85 100644
--- a/singularity/Singularity.preprocessing-0.9.7
+++ b/singularity/Singularity.preprocessing-0.9.8
@@ -2,6 +2,8 @@ Bootstrap: docker
 From: ubuntu:focal
 Stage: spython-base
 
+%files
+bin/ /opt/bin/
 %labels
 maintainer="pricea35@cardiff.ac.uk"
 about.summary="container for the preprocessing workflow"
@@ -25,13 +27,15 @@ fastani_version=1.33
 PACKAGES="procps curl git wget build-essential zlib1g-dev libncurses-dev libz-dev libbz2-dev liblzma-dev libcurl4-openssl-dev libgsl-dev rsync unzip ncbi-blast+ pigz jq libtbb-dev openjdk-11-jre-headless autoconf r-base-core locales locales-all"
 PYTHON="python3 python3-pip python3-dev"
-PYTHON_PACKAGES="biopython"
+PYTHON_PACKAGES="biopython awscli boto3"
 
 PATH=${PATH}:/usr/local/bin/mccortex/bin:/usr/local/bin/bwa-${bwa_version}:/opt/edirect
 LD_LIBRARY_PATH=/usr/local/lib
 
 export DEBIAN_FRONTEND="noninteractive"
 
+PATH=/opt/bin:$PATH
+
 apt-get update \
 && DEBIAN_FRONTEND="noninteractive" apt-get install -y $PACKAGES $PYTHON \
 && pip3 install --upgrade pip \
@@ -82,7 +86,7 @@ curl -fsSL https://github.com/OpenGene/fastp/archive/v${fastp_version}.tar.gz | tar -xz \
 && cd .. \
 && rm -r fastp-${fastp_version}
 
-wget http://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v${fastqc_version}.zip \
+wget https://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v${fastqc_version}.zip \
 && unzip fastqc_v${fastqc_version}.zip \
 && chmod +x FastQC/fastqc \
 && mv FastQC/* /usr/local/bin \
@@ -102,10 +106,9 @@ curl -fsSL https://github.com/ArthurVM/Afanc/archive/refs/tags/v${afanc_version}.tar.gz | tar -xz \
 && mv mash-Linux64-v${mash_version}/mash /usr/local/bin \
 && rm -r mash-Linux* \
 && wget https://github.com/ParBLiSS/FastANI/releases/download/v${fastani_version}/fastANI-Linux64-v${fastani_version}.zip \
-&& unzip fastANI-Linux64-v${fastani_version}.zip  \
+&& unzip fastANI-Linux64-v${fastani_version}.zip \
 && mv fastANI /usr/local/bin
-
 sh -c "$(curl -fsSL ftp://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh)" \
 && mkdir -p /opt/edirect \
 && mv /root/edirect/* /opt/edirect
@@ -149,9 +152,10 @@ export mash_version=2.3
 export fastani_version=1.33
 export PACKAGES="procps curl git wget build-essential zlib1g-dev libncurses-dev libz-dev libbz2-dev liblzma-dev libcurl4-openssl-dev libgsl-dev rsync unzip ncbi-blast+ pigz jq libtbb-dev openjdk-11-jre-headless autoconf r-base-core locales locales-all"
 export PYTHON="python3 python3-pip python3-dev"
-export PYTHON_PACKAGES="biopython"
+export PYTHON_PACKAGES="biopython awscli boto3"
 export PATH=${PATH}:/usr/local/bin/mccortex/bin:/usr/local/bin/bwa-${bwa_version}:/opt/edirect
 export LD_LIBRARY_PATH=/usr/local/lib
+export PATH=/opt/bin:$PATH
 export LC_ALL=en_US.UTF-8
 export LANG=en_US.UTF-8
 export LANGUAGE=en_US.UTF-8
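For context, the awscli/boto3 additions above appear to be what lets the CLIMB profile pull pipeline resources straight from S3. A purely hypothetical fetch of the Bowtie2 index, reusing the bucket path that appears in the climb profile's bowtie2_index param:

    # sketch only: bucket path taken from the climb profile; local dir is arbitrary
    aws s3 cp --recursive s3://microbial-bioin-sp3/bowtie_hg19 ./bowtie_hg19
    ls ./bowtie_hg19/*.bt2   # a Bowtie2 index is a set of .bt2 files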
+website="https://github.com/jodyphelan/TBProfiler/" +license="https://github.com/jodyphelan/TBProfiler/blob/master/LICENSE" +maintainer="John Arnn" +maintainer.email="jarnn@utah.gov" +maintainer2="Curtis Kapsak" +maintainer2.email="kapsakcj@gmail.com" +%post + +#copy the reference genome to pre-compute our index + +su - root # USER root +mkdir -p / +cd / + +TBPROFILER_VER="5.0.1" + +# this version is the shortened commit hash on the `master` branch here https://github.com/jodyphelan/tbdb/ +# commits are found on https://github.com/jodyphelan/tbdb/commits/master +# this was the latest commit as of 2023-10-26 +TBDB_VER="e25540b" + +# LABEL instructions tag the image with metadata that might be important to the user + +# Install dependencies via apt-get; cleanup apt garbage +apt-get update && apt-get install -y --no-install-recommends \ +wget \ +ca-certificates \ +procps && \ +apt-get autoclean && rm -rf /var/lib/apt/lists/* + +# install tb-profiler via bioconda; install into 'base' conda env +micromamba install --yes --name base --channel conda-forge --channel bioconda \ +tb-profiler=${TBPROFILER_VER} + +micromamba install --yes --name base --channel conda-forge --channel bioconda gatk4 +micromamba install --yes --name base --channel conda-forge --channel bioconda samtools +micromamba install --yes --name base --channel conda-forge jq +micromamba clean --all --yes + +# hardcode 'base' env bin into PATH, so conda env does not have to be "activated" at run time +PATH="/opt/conda/bin:${PATH}" + +# Version of database can be confirmed at /opt/conda/share/tbprofiler/tbdb.version.json +# can also run 'tb-profiler list_db' to find the same version info +# In 5.0.1 updating_tbdb does not work with tb-profiler update_tbdb --commit ${TBDB_VER} +tb-profiler update_tbdb --commit ${TBDB_VER} + +mkdir -p /data +cd /data +tb-profiler update_tbdb --match_ref tuberculosis.fasta +%environment +export PATH="/opt/conda/bin:${PATH}" +%runscript +cd /data +exec /bin/bash "$@" +%startscript +cd /data +exec /bin/bash "$@" diff --git a/singularity/Singularity.vcfpredict-0.9.7 b/singularity/Singularity.vcfpredict-0.9.8 similarity index 51% rename from singularity/Singularity.vcfpredict-0.9.7 rename to singularity/Singularity.vcfpredict-0.9.8 index ff29506..0146e7d 100644 --- a/singularity/Singularity.vcfpredict-0.9.7 +++ b/singularity/Singularity.vcfpredict-0.9.8 @@ -2,22 +2,22 @@ Bootstrap: docker From: ubuntu:20.04 Stage: spython-base +%files +bin/ /opt/bin/ %labels maintainer="pricea35@cardiff.ac.uk" about.summary="container for the vcf predict workflow" %post +#add run-vcf to container +PATH=/opt/bin:$PATH PACKAGES="procps curl wget git build-essential libhdf5-dev libffi-dev r-base-core jq" PYTHON="python3 python3-pip python3-dev" vcfmix_version=d4693344bf612780723e39ce27c8ae3868f95417 -gumpy_version=1.0.15 -piezo_version=0.3 -gnomonicus_version=1.1.2 -tuberculosis_amr_catalogues=12d38733ad2e238729a3de9f725081e1d4872968 - +#apt updates apt-get update \ && DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata \ && apt-get install -y $PACKAGES $PYTHON \ @@ -26,38 +26,15 @@ apt-get update \ && cd VCFMIX \ && git checkout ${vcfmix_version} \ && pip3 install recursive_diff \ +&& pip3 install awscli \ && pip3 install . \ && cp -r data /usr/local/lib/python3.8/dist-packages \ && cd .. - -curl -fsSL https://github.com/oxfordmmm/gumpy/archive/refs/tags/v${gumpy_version}.tar.gz | tar -xz \ -&& cd gumpy-${gumpy_version} \ -&& pip3 install . \ -&& cd .. 
diff --git a/singularity/Singularity.vcfpredict-0.9.7 b/singularity/Singularity.vcfpredict-0.9.8
similarity index 51%
rename from singularity/Singularity.vcfpredict-0.9.7
rename to singularity/Singularity.vcfpredict-0.9.8
index ff29506..0146e7d 100644
--- a/singularity/Singularity.vcfpredict-0.9.7
+++ b/singularity/Singularity.vcfpredict-0.9.8
@@ -2,22 +2,22 @@ Bootstrap: docker
 From: ubuntu:20.04
 Stage: spython-base
 
+%files
+bin/ /opt/bin/
 %labels
 maintainer="pricea35@cardiff.ac.uk"
 about.summary="container for the vcf predict workflow"
 %post
 
+#add run-vcf to container
+PATH=/opt/bin:$PATH
 PACKAGES="procps curl wget git build-essential libhdf5-dev libffi-dev r-base-core jq"
 PYTHON="python3 python3-pip python3-dev"
 
 vcfmix_version=d4693344bf612780723e39ce27c8ae3868f95417
-gumpy_version=1.0.15
-piezo_version=0.3
-gnomonicus_version=1.1.2
-tuberculosis_amr_catalogues=12d38733ad2e238729a3de9f725081e1d4872968
-
+#apt updates
 apt-get update \
 && DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata \
 && apt-get install -y $PACKAGES $PYTHON \
@@ -26,38 +26,15 @@ apt-get update \
 && cd VCFMIX \
 && git checkout ${vcfmix_version} \
 && pip3 install recursive_diff \
+&& pip3 install awscli \
 && pip3 install . \
 && cp -r data /usr/local/lib/python3.8/dist-packages \
 && cd ..
-
-curl -fsSL https://github.com/oxfordmmm/gumpy/archive/refs/tags/v${gumpy_version}.tar.gz | tar -xz \
-&& cd gumpy-${gumpy_version} \
-&& pip3 install . \
-&& cd ..
-
-curl -fsSL https://github.com/oxfordmmm/piezo/archive/refs/tags/v${piezo_version}.tar.gz | tar -xz \
-&& cd piezo-${piezo_version} \
-&& pip3 install . \
-&& cd ..
-
-curl -fsSL https://github.com/oxfordmmm/gnomonicus/archive/refs/tags/v${gnomonicus_version}.tar.gz | tar -xz \
-&& cd gnomonicus-${gnomonicus_version} \
-&& pip3 install . \
-&& cd ..
-
-git clone https://github.com/oxfordmmm/tuberculosis_amr_catalogues.git \
-&& cd tuberculosis_amr_catalogues \
-&& git checkout ${tuberculosis_amr_catalogues} \
-&& cd ..
-
 %environment
+export PATH=/opt/bin:$PATH
 export PACKAGES="procps curl wget git build-essential libhdf5-dev libffi-dev r-base-core jq"
 export PYTHON="python3 python3-pip python3-dev"
 export vcfmix_version=d4693344bf612780723e39ce27c8ae3868f95417
-export gumpy_version=1.0.15
-export piezo_version=0.3
-export gnomonicus_version=1.1.2
-export tuberculosis_amr_catalogues=12d38733ad2e238729a3de9f725081e1d4872968
 %runscript
 exec /bin/bash "$@"
 %startscript

From 26497d5ba2fa25c8e0695c8e46e7fa2ac3a43de5 Mon Sep 17 00:00:00 2001
From: whalleyt
Date: Wed, 24 Jan 2024 14:00:42 +0000
Subject: [PATCH 35/44] fix allelic depth

---
 main.nf                      | 3 ++-
 modules/vcfpredictModules.nf | 3 ++-
 workflows/clockwork.nf       | 1 +
 workflows/vcfpredict.nf      | 4 +++-
 4 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/main.nf b/main.nf
index 2f38ef6..0cd98f2 100644
--- a/main.nf
+++ b/main.nf
@@ -213,8 +213,9 @@ workflow {
     mpileup_vcf = clockwork.out.mpileup_vcf
     minos_vcf = clockwork.out.minos_vcf
     reference = clockwork.out.reference
+    bam = clockwork.out.bam
 
-    vcfpredict(mpileup_vcf, minos_vcf, reference)
+    vcfpredict(bam, mpileup_vcf, minos_vcf, reference)
 
 }
diff --git a/modules/vcfpredictModules.nf b/modules/vcfpredictModules.nf
index 4bc7957..042b403 100644
--- a/modules/vcfpredictModules.nf
+++ b/modules/vcfpredictModules.nf
@@ -107,6 +107,7 @@ process add_allelic_depth {
     input:
     val(sample_name)
     path(minos_vcf)
+    path(bam)
     path(reference)
     val(isSampleTB)
 
@@ -120,7 +121,7 @@ process add_allelic_depth {
     """
     samtools faidx $reference
     samtools dict $reference -o ${reference.baseName}.dict
-    gatk VariantAnnotator -R $reference -V $minos_vcf -A DepthPerAlleleBySample -O ${sample_name}_allelic_depth.minos.vcf
+    gatk VariantAnnotator -R $reference -I $bam -V $minos_vcf -A DepthPerAlleleBySample -O ${sample_name}_allelic_depth.minos.vcf
     """
 }
diff --git a/workflows/clockwork.nf b/workflows/clockwork.nf
index bf6f5e1..148f523 100644
--- a/workflows/clockwork.nf
+++ b/workflows/clockwork.nf
@@ -40,5 +40,6 @@ workflow clockwork {
     mpileup_vcf = callVarsMpileup.out.mpileup_vcf.join(minos.out.minos_report, by: 0)
     minos_vcf = minos.out.minos_vcf.join(alignToRef.out.alignToRef_report, by: 0)
     reference = getRefFromJSON.out
+    bam = alignToRef.out.alignToRef_bam
 
 }
diff --git a/workflows/vcfpredict.nf b/workflows/vcfpredict.nf
index 375e410..8fec00f 100644
--- a/workflows/vcfpredict.nf
+++ b/workflows/vcfpredict.nf
@@ -12,6 +12,7 @@ include {finalJson} from '../modules/vcfpredictModules.nf' params(params)
 workflow vcfpredict {
 
     take:
+    clockwork_bam
     clockwork_bcftools_tuple
     minos_vcf_tuple
     reference_fasta
@@ -31,13 +32,14 @@ workflow vcfpredict {
         minos_vcf = minos_vcf_tuple.map{it[1]}
         do_we_resistance_profile = minos_vcf_tuple.map{it[2]}
         report_json = minos_vcf_tuple.map{it[3]}
+        bam = clockwork_bam.map{it[2]}
 
         if (params.update_tbprofiler == "yes"){
             tbprofiler_update_db(reference_fasta)
        }
 
        //add allelic depth back in: was calculated in mpileup but lost in minos
-        add_allelic_depth(sample_name, minos_vcf, reference_fasta, do_we_resistance_profile)
+        add_allelic_depth(sample_name, minos_vcf, bam, reference_fasta, do_we_resistance_profile)
         tbprofiler(sample_name, add_allelic_depth.out, report_json, do_we_resistance_profile)
     }
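A quick way to confirm that VariantAnnotator really restored the per-allele depths is to print the AD FORMAT field from the annotated VCF. A sketch, where bcftools is available and the file name follows the naming pattern used in the patch above:

    # every record should now carry an AD (allelic depth) entry in FORMAT
    bcftools query -f '%CHROM\t%POS[\t%AD]\n' sample_allelic_depth.minos.vcf | head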
From f149e82761649c6ad00881a070c5ef615fb1af97 Mon Sep 17 00:00:00 2001
From: whalleyt
Date: Wed, 24 Jan 2024 14:50:28 +0000
Subject: [PATCH 36/44] afanc memory

---
 modules/preprocessingModules.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/preprocessingModules.nf b/modules/preprocessingModules.nf
index 9d7177a..0de3ab6 100644
--- a/modules/preprocessingModules.nf
+++ b/modules/preprocessingModules.nf
@@ -337,7 +337,7 @@ process afanc {
     tag { sample_name }
     label 'preprocessing'
     label 'normal_cpu'
-    label 'medium_memory'
+    label 'high_memory'
     label 'retry_afanc'
 
     publishDir "${params.output_dir}/$sample_name/speciation_reports_for_reads_postFastP", mode: 'copy', pattern: '*_afanc_report.json'

From 441512d00f85b1b099c44ae812bbc3c17774d30c Mon Sep 17 00:00:00 2001
From: whalleyt
Date: Wed, 24 Jan 2024 17:12:51 +0000
Subject: [PATCH 37/44] remove copy

---
 modules/clockworkModules.nf | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/clockworkModules.nf b/modules/clockworkModules.nf
index f45ac62..0bea703 100644
--- a/modules/clockworkModules.nf
+++ b/modules/clockworkModules.nf
@@ -63,7 +63,6 @@ process alignToRef {
     """
     echo $reference_path
-    cp $reference_path ${sample_name}.fa
 
     minimap2 -ax sr $reference_path -t ${task.cpus} $fq1 $fq2 | samtools fixmate -m - - | samtools sort -T tmp - | samtools markdup --reference $reference_path - minimap.bam

From f54bc37d6926c9b9ef2241aa411151dd7121d61d Mon Sep 17 00:00:00 2001
From: whalleyt
Date: Fri, 26 Jan 2024 09:34:45 +0000
Subject: [PATCH 38/44] k8s job

---
 nextflow.config | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/nextflow.config b/nextflow.config
index c8b15d1..707ea73 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -71,6 +71,11 @@ profiles {
             container = null
         }
 
+        process {
+            //process as job rather than pod, helps with stability
+            k8s.computeResourceType = 'Job'
+        }
+
         //params specific to paths on the climb system
         params{
             bowtie2_index = "s3://microbial-bioin-sp3/bowtie_hg19"

From 7eece9ad6f79cad2b8a3c79a68dbd1015200f580 Mon Sep 17 00:00:00 2001
From: whalleyt
Date: Fri, 26 Jan 2024 09:44:03 +0000
Subject: [PATCH 39/44] update branches that actions wf works on

---
 .github/workflows/build-push-quay.yml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/build-push-quay.yml b/.github/workflows/build-push-quay.yml
index cef945d..a1c2b72 100644
--- a/.github/workflows/build-push-quay.yml
+++ b/.github/workflows/build-push-quay.yml
@@ -2,9 +2,7 @@ name: build-push-quay
 on:
   push:
     branches:
-      - v0.9.6
-      - 0.9.7-dev
-      - tbprofiler
+      - main
     paths:
       - '**/Dockerfile*'
       - "bin/"

From 9eeb2a671bcbc2eba4c0b28495b00e6d4341156e Mon Sep 17 00:00:00 2001
From: whalleyt
Date: Fri, 26 Jan 2024 09:55:27 +0000
Subject: [PATCH 40/44] update readme

---
 modules/preprocessingModules.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/preprocessingModules.nf b/modules/preprocessingModules.nf
index 0de3ab6..f4c528d 100644
--- a/modules/preprocessingModules.nf
+++ b/modules/preprocessingModules.nf
@@ -434,7 +434,7 @@ process bowtie2 {
     tag { sample_name }
     label 'preprocessing'
     label 'normal_cpu'
-    label 'low_memory'
+    label 'medium_memory'
 
     publishDir "${params.output_dir}/$sample_name/output_reads", mode: 'copy', pattern: '*.fq.gz', overwrite: 'true'
From 1f6252916066489c1a6d065c0b4869c7ad9b431f Mon Sep 17 00:00:00 2001
From: whalleyt
Date: Fri, 26 Jan 2024 09:58:31 +0000
Subject: [PATCH 41/44] README

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 4fac53e..a076e2a 100644
--- a/README.md
+++ b/README.md
@@ -131,3 +131,4 @@ process clockwork:minos\
 For a list of direct authors of this pipeline, please see the contributors list. All of the software dependencies of this pipeline are recorded in the version.json
 
 The preprocessing sub-workflow is based on the preprocessing nextflow DSL1 pipeline written by Stephen Bush, University of Oxford. The clockwork sub-workflow uses aspects of the variant calling workflow from https://github.com/iqbal-lab-org/clockwork, lead author Martin Hunt, Iqbal Lab at EMBL-EBI
+

From b5291ad847e1f7f1dc0b443752872d4cb18114ca Mon Sep 17 00:00:00 2001
From: whalleyt
Date: Thu, 8 Feb 2024 09:50:27 +0000
Subject: [PATCH 42/44] k8s job to stop fails

---
 nextflow.config | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index 707ea73..43a0d71 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -71,9 +71,8 @@ profiles {
             container = null
         }
 
-        process {
-            //process as job rather than pod, helps with stability
-            k8s.computeResourceType = 'Job'
+        k8s {
+            computeResourceType = 'Job'
         }
 
         //params specific to paths on the climb system

From aa906e6aacf283904f99d4d32255219a8931ad2a Mon Sep 17 00:00:00 2001
From: whalleyt
Date: Thu, 8 Feb 2024 09:51:19 +0000
Subject: [PATCH 43/44] output csv

---
 modules/preprocessingModules.nf | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/modules/preprocessingModules.nf b/modules/preprocessingModules.nf
index f4c528d..8d05c49 100644
--- a/modules/preprocessingModules.nf
+++ b/modules/preprocessingModules.nf
@@ -398,6 +398,7 @@ process mykrobe {
     label 'medium_memory'
 
     publishDir "${params.output_dir}/$sample_name/speciation_reports_for_reads_postFastP", mode: 'copy', pattern: '*_mykrobe_report.json'
+    publishDir "${params.output_dir}/$sample_name/speciation_reports_for_reads_postFastP", mode: 'copy', pattern: '*_mykrobe_report.csv'
 
     input:
     tuple val(sample_name), path(fq1), path(fq2), val(run_mykrobe), path(software_json)
@@ -413,7 +414,7 @@ process mykrobe {
     mykrobe_report = "${sample_name}_mykrobe_report.json"
 
     """
-    mykrobe predict --sample ${sample_name} --species tb --threads ${task.cpus} --format json --output ${mykrobe_report} -1 $fq1 $fq2
+    mykrobe predict --sample ${sample_name} --species tb --threads ${task.cpus} --format json_and_csv --output ${mykrobe_report} -1 $fq1 $fq2
 
     printf ${sample_name}
     """
@@ -733,6 +734,7 @@ process reMykrobe {
     label 'low_memory'
 
     publishDir "${params.output_dir}/$sample_name/speciation_reports_for_reads_postFastP_and_postContamRemoval", mode: 'copy', pattern: '*_mykrobe_report.json'
+    publishDir "${params.output_dir}/$sample_name/speciation_reports_for_reads_postFastP_and_postContamRemoval", mode: 'copy', pattern: '*_mykrobe_report.csv'
 
     input:
     tuple val(sample_name), path(fq1), path(fq2), path(software_json)
@@ -744,7 +746,7 @@ process reMykrobe {
     mykrobe_report = "${sample_name}_mykrobe_report.json"
 
     """
-    mykrobe predict --sample ${sample_name} --species tb --threads ${task.cpus} --format json --output ${mykrobe_report} -1 $fq1 $fq2
+    mykrobe predict --sample ${sample_name} --species tb --threads ${task.cpus} --format json_and_csv --output ${mykrobe_report} -1 $fq1 $fq2
     """
 
     stub:
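The final patch below follows on from this one: with --format json_and_csv, mykrobe presumably appends its own .json/.csv extensions to the --output value, so a prefix that already ends in .json would yield awkward double-extension names. A hypothetical run with an extension-free prefix:

    # expected to write sample1_mykrobe_report.json and sample1_mykrobe_report.csv
    mykrobe predict --sample sample1 --species tb --format json_and_csv \
      --output sample1_mykrobe_report -1 reads_1.fq.gz reads_2.fq.gz
    ls sample1_mykrobe_report.*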
From 0e48d699bd1626687e18789a2e49efac25b42f78 Mon Sep 17 00:00:00 2001
From: whalleyt
Date: Thu, 8 Feb 2024 10:03:13 +0000
Subject: [PATCH 44/44] change output name to deal with csv and json

---
 modules/preprocessingModules.nf | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/preprocessingModules.nf b/modules/preprocessingModules.nf
index 8d05c49..b59d0cc 100644
--- a/modules/preprocessingModules.nf
+++ b/modules/preprocessingModules.nf
@@ -411,7 +411,7 @@ process mykrobe {
     tuple val(sample_name), path(fq1), path(fq2), stdout, emit: mykrobe_fqs
 
     script:
-    mykrobe_report = "${sample_name}_mykrobe_report.json"
+    mykrobe_report = "${sample_name}_mykrobe_report"
 
     """
     mykrobe predict --sample ${sample_name} --species tb --threads ${task.cpus} --format json_and_csv --output ${mykrobe_report} -1 $fq1 $fq2
@@ -422,7 +422,7 @@ process mykrobe {
     mykrobe_report = "${sample_name}_mykrobe_report.json"
 
     """
-    touch ${mykrobe_report}
+    touch ${mykrobe_report}.json
 
     printf ${sample_name}
     """
 }
@@ -743,7 +743,7 @@ process reMykrobe {
     tuple val(sample_name), path("${sample_name}_mykrobe_report.json"), emit: reMykrobe_report
 
     script:
-    mykrobe_report = "${sample_name}_mykrobe_report.json"
+    mykrobe_report = "${sample_name}_mykrobe_report"
 
     """
     mykrobe predict --sample ${sample_name} --species tb --threads ${task.cpus} --format json_and_csv --output ${mykrobe_report} -1 $fq1 $fq2
     """
 
     stub:
     mykrobe_report = "${sample_name}_mykrobe_report.json"
 
     """
-    touch ${mykrobe_report}
+    touch ${mykrobe_report}.json
     """
 }