From ce16c595ee562d9a2489ff7e2734bae543c88ebf Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 2 Feb 2024 13:32:55 -0800 Subject: [PATCH] Use built-in Snakemake `retries` directive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We've been using a custom automatic retry for rules that fail occasionally during fetching of online resources since the workflow was still a bash script.¹ When the workflow was converted to a Snakemake workflow, Snakemake did not have rule-specific retries yet.² The latest GISAID ingest failed³ due to an error in `run_shell_command_n_times`, so I figured it's time to loop back and replace it with the built-in Snakemake `retries` directive that has been available since v7.7.0. ¹ https://github.com/nextstrain/ncov-ingest/commit/75ee3921137f879a94d6996f73770a77727408af ² https://github.com/nextstrain/ncov-ingest/pull/231/files#r758769573 ³ https://bedfordlab.slack.com/archives/CTZKJC7PZ/p1706908574251489?thread_ts=1706905060.565059&cid=CTZKJC7PZ --- Snakefile | 4 + workflow/snakemake_rules/fetch_sequences.smk | 98 ++++++++------------ 2 files changed, 45 insertions(+), 57 deletions(-) diff --git a/Snakefile b/Snakefile index a5f21b59..919d09ea 100644 --- a/Snakefile +++ b/Snakefile @@ -1,6 +1,10 @@ from subprocess import CalledProcessError +from snakemake.utils import min_version import os +# Snakemake 7.7.0 introduced `retries` directive used in fetch_sequences +min_version("7.7.0") + GENES = "E,M,N,ORF1a,ORF1b,ORF3a,ORF6,ORF7a,ORF7b,ORF8,ORF9b,S" GENES_SPACE_DELIMITED = GENES.replace(",", " ") GENE_LIST = GENES.split(",") diff --git a/workflow/snakemake_rules/fetch_sequences.smk b/workflow/snakemake_rules/fetch_sequences.smk index 7d1402d4..1926ed07 100644 --- a/workflow/snakemake_rules/fetch_sequences.smk +++ b/workflow/snakemake_rules/fetch_sequences.smk @@ -18,42 +18,28 @@ Produces different final outputs for GISAID vs GenBank/RKI: rki_ndjson = "data/rki.ndjson" """ -def 
run_shell_command_n_times(cmd, msg, cleanup_failed_cmd, retry_num=5): - attempt = 0 - while attempt < retry_num: - print(f"{msg} attempt number {attempt}") - try: - shell(cmd) - break - except CalledProcessError: - print("...FAILED") - attempt+=1 - shell("{cleanup_failed_cmd} && sleep 10") - else: - print(msg + f" has FAILED {retry_num} times. Exiting.") - raise Exception("function run_shell_command_n_times has failed") - rule fetch_main_gisaid_ndjson: output: ndjson = temp(f"data/gisaid.ndjson") - run: - run_shell_command_n_times( - f"./bin/fetch-from-gisaid {output.ndjson}", - f"Fetching from {database}", - f"rm {output.ndjson}" - ) + retries: 5 + shell: + """ + ./bin/fetch-from-gisaid {output.ndjson} + """ + rule fetch_ncbi_dataset_package: output: dataset_package = temp("data/ncbi_dataset.zip") + retries: 5 benchmark: "benchmarks/fetch_ncbi_dataset_package.txt" - run: - run_shell_command_n_times( - f"datasets download virus genome taxon SARS-CoV-2 --no-progressbar --filename {output.dataset_package}", - f"Fetching from {database} with NCBI Datasets", - f"rm -f {output.dataset_package}" - ) + shell: + """ + datasets download virus genome taxon SARS-CoV-2 \ + --no-progressbar \ + --filename {output.dataset_package} + """ rule extract_ncbi_dataset_sequences: input: @@ -142,36 +128,36 @@ rule fetch_biosample: """Fetching BioSample data (GenBank only)""" output: biosample = temp("data/biosample.ndjson") - run: - run_shell_command_n_times( - f"./bin/fetch-from-biosample > {output.biosample}", - "Fetch BioSample", - f"rm {output.biosample}" - ) + retries: 5 + shell: + """ + ./bin/fetch-from-biosample > {output.biosample} + """ + rule fetch_cog_uk_accessions: message: """Fetching COG-UK sample accesions (GenBank only)""" output: cog_uk_accessions = temp("data/cog_uk_accessions.tsv") - run: - run_shell_command_n_times( - f"./bin/fetch-from-cog-uk-accessions > {output.cog_uk_accessions}", - "Fetch COG-UK sample accessions", - f"rm {output.cog_uk_accessions}" - ) + 
retries: 5 + shell: + """ + ./bin/fetch-from-cog-uk-accessions > {output.cog_uk_accessions} + """ + rule fetch_cog_uk_metadata: message: """Fetching COG-UK metadata (GenBank only)""" output: cog_uk_metadata = temp("data/cog_uk_metadata.csv.gz") - run: - run_shell_command_n_times( - f"./bin/fetch-from-cog-uk-metadata > {output.cog_uk_metadata}", - "Fetch COG-UK metadata", - f"rm {output.cog_uk_metadata}" - ) + retries: 5 + shell: + """ + ./bin/fetch-from-cog-uk-metadata > {output.cog_uk_metadata} + """ + rule uncompress_cog_uk_metadata: input: @@ -185,23 +171,21 @@ rule uncompress_cog_uk_metadata: rule fetch_rki_sequences: output: rki_sequences=temp("data/rki_sequences.fasta.xz"), - run: - run_shell_command_n_times( - f"./bin/fetch-from-rki-sequences > {output.rki_sequences}", - "Fetch RKI sequences", - f"rm {output.rki_sequences}", - ) + retries: 5 + shell: + """ + ./bin/fetch-from-rki-sequences > {output.rki_sequences} + """ rule fetch_rki_metadata: output: rki_metadata=temp("data/rki_metadata.tsv.xz"), - run: - run_shell_command_n_times( - f"./bin/fetch-from-rki-metadata > {output.rki_metadata}", - "Fetch RKI metadata", - f"rm {output.rki_metadata}", - ) + retries: 5 + shell: + """ + ./bin/fetch-from-rki-metadata > {output.rki_metadata} + """ rule transform_rki_data_to_ndjson: