Skip to content

Commit

Permalink
Merge pull request #466 from nextstrain/ignore-cache
Browse files Browse the repository at this point in the history
Ignore cache if Nextclade or dataset version is different
  • Loading branch information
joverlee521 authored Jul 29, 2024
2 parents 48e2cec + 9a2ca57 commit f9bca07
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 26 deletions.
20 changes: 20 additions & 0 deletions bin/fetch-cache-version
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash
#
# Print the Nextclade version info recorded in a zstd-compressed TSV on S3
# as one compact JSON object on stdout:
#   {"nextclade_version": ..., "nextclade_dataset_version": ...}
#
# Usage: fetch-cache-version <s3-url>
#
# Only the header line and the first data row of the TSV are read; the values
# come from that row's `nextclade_version` and `dataset_version` columns.

# this script intentionally doesn't `set -euo pipefail`
# because otherwise the `head -n 2` step triggers SIGPIPE
# causing the script to exit before it is done.
#
# Because pipefail is off, the exit status of the pipeline below is the exit
# status of its last stage (jq), not of the download or decompression.
# NOTE(review): a failed download therefore presumably yields empty output
# with a zero exit status -- confirm callers do not rely on a non-zero status.

# Required first argument; the script aborts with this message when it is missing.
s3_url="${1:?An S3 URL is required as the first argument}"


# Ignore SIGPIPE in this shell (see the note above about `head -n 2`).
trap '' SIGPIPE

# Pipeline stages, in order:
#   aws s3 cp ... -   write the S3 object to stdout
#   zstd -T0 -dcq     decompress the stream to stdout, quietly
#   head -n 2         keep the header line plus the first data row
#   tsv-select -H -f  keep only the two version columns, selected by header name
#   tail -n 1         drop the header, leaving the single data row
#   jq                split the row on tabs and emit the JSON object
# stderr of the whole subshell is discarded by the trailing `2> /dev/null`.
(aws s3 cp "$s3_url" - \
| zstd -T0 -dcq \
| head -n 2 \
| tsv-select -H -f 'nextclade_version,dataset_version' \
| tail -n 1 \
| jq --raw-input -c '
split("\t")
| { "nextclade_version": .[0], "nextclade_dataset_version": .[1] }') \
2> /dev/null
59 changes: 59 additions & 0 deletions bin/use-nextclade-cache
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/bin/bash
# Decide whether the cached Nextclade results on S3 may be reused.
# Prints "true" or "false" on stdout; [INFO] diagnostics go to stderr.
set -euo pipefail

# Directory containing this script; used to call the sibling fetch-cache-version script.
bin="$(dirname "$0")"
# Directory of vendored helper scripts (s3-object-exists), one level up from this script's directory.
vendored="$bin"/../vendored

# Reject the cache: write the reason ($1) to stderr, print "false" on stdout,
# and end the script successfully.
reject-cache() {
    echo "$1" >&2
    echo "false"
    exit 0
}

# Entry point. Prints "true" when the cache may be reused and "false" otherwise.
#
# Arguments:
#   $1  destination s3:// URL
#   $2  source (fallback) s3:// URL
#   $3  path to the Nextclade executable
#   $4  path to the Nextclade dataset ZIP archive
#   $5  optional Nextclade dataset reference wildcard (defaults to empty)
#
# The cache is rejected when a renew flag exists, or when the cached Nextclade
# version or dataset version differs from the current one.
main() {
    # These are intentionally global (not `local`): the helper functions in
    # this script read s3_dst, s3_src and reference.
    s3_dst="${1:?A destination s3:// URL where the renew file is hosted is required as the first argument.}"
    s3_src="${2:?A source s3:// URL where the fallback renew file is hosted is required as the second argument.}"
    nextclade="${3:?A path to the Nextclade executable is required as the third argument}"
    nextclade_dataset="${4:?A path to a Nextclade dataset ZIP archive is required as the fourth argument}"
    # Nextclade dataset reference wildcard
    reference="${5:-}"

    # 1. An explicit renew flag always wins.
    if renew-flag-exists; then
        reject-cache "[INFO] Found renew flag"
    fi

    # 2. The Nextclade executable must match the one that produced the cache.
    cache_versions="$(get-cache-version-info)"
    cache_nextclade_version="$(jq -r .nextclade_version <<<"$cache_versions")"
    current_nextclade_version="$("$nextclade" --version)"
    [[ "$cache_nextclade_version" == "$current_nextclade_version" ]] \
        || reject-cache "[INFO] Current Nextclade version ($current_nextclade_version) is different from cache version ($cache_nextclade_version)"

    # 3. The dataset tag inside the ZIP's pathogen.json must match as well.
    cache_dataset_version="$(jq -r .nextclade_dataset_version <<<"$cache_versions")"
    current_dataset_version="$(unzip -p "$nextclade_dataset" pathogen.json | jq -r '.version.tag')"
    [[ "$cache_dataset_version" == "$current_dataset_version" ]] \
        || reject-cache "[INFO] Current Nextclade dataset version ($current_dataset_version) is different from cache version ($cache_dataset_version)"

    echo "true"
}

# Succeed when a `.renew` flag object for the current reference exists under
# the destination URL or, failing that, under the source URL.
# Reads the globals s3_dst, s3_src, reference and vendored.
renew-flag-exists() {
    local flag_name="nextclade$reference.tsv.zst.renew"

    # Destination is checked first; the source is only consulted as a fallback.
    if "$vendored"/s3-object-exists "$s3_dst/$flag_name"; then
        return 0
    fi

    # The function's status is whatever the fallback check reports.
    "$vendored"/s3-object-exists "$s3_src/$flag_name"
}

# Print the cached version info (the JSON emitted by fetch-cache-version) for
# the current reference, trying the destination URL first and the source URL
# as a fallback. Prints nothing when neither location yields version info.
# Reads the globals s3_dst, s3_src, reference and bin.
get-cache-version-info() {
    # TODO: Update to check a separate file for version info.
    # Currently just checks the first row of the nextclade.tsv file
    local version_file="nextclade$reference.tsv.zst"
    local candidate versions

    for candidate in "$s3_dst/$version_file" "$s3_src/$version_file"; do
        # fetch-cache-version runs without pipefail, so a missing object can
        # come back as empty output with a zero exit status; treat empty
        # output as a miss too, otherwise the fallback would never be tried.
        if versions="$("$bin"/fetch-cache-version "$candidate")" && [[ -n "$versions" ]]; then
            echo "$versions"
            return 0
        fi
    done

    # No version info anywhere: emit nothing and succeed, so the caller's
    # version comparison simply sees an empty cached version.
    return 0
}

# Run the check, forwarding all command-line arguments unchanged.
main "$@"
84 changes: 58 additions & 26 deletions workflow/snakemake_rules/nextclade.smk
Original file line number Diff line number Diff line change
Expand Up @@ -62,52 +62,83 @@ if config.get("s3_dst") and config.get("s3_src"):
ruleorder: download_nextclade_tsv_from_s3 > create_empty_nextclade_info
ruleorder: download_previous_alignment_from_s3 > create_empty_nextclade_aligned


# Record whether the cached Nextclade results may be reused: the stdout of
# ./bin/use-nextclade-cache is redirected into a per-reference flag file.
rule use_nextclade_cache:
input:
nextclade="data/nextclade",
# Dataset ZIP name is "sars-cov-2" plus the reference wildcard with "_" replaced by "-".
nextclade_dataset=lambda w: f"data/nextclade_data/sars-cov-2{w.reference.replace('_','-')}.zip",
params:
dst_source=config["s3_dst"],
src_source=config["s3_src"],
output:
use_nextclade_cache=f"data/{database}/use_nextclade_cache{{reference}}.txt",
# Argument order below matches the script's positional parameters:
# destination URL, source URL, Nextclade executable, dataset ZIP, reference.
shell:
"""
./bin/use-nextclade-cache \
{params.dst_source:q} \
{params.src_source:q} \
{input.nextclade:q} \
{input.nextclade_dataset:q} \
{wildcards.reference:q} \
> {output.use_nextclade_cache}
"""


# Produce the "old" Nextclade TSV for a reference: when the flag file reads
# 'true' the cached TSV is downloaded (destination first, then source);
# otherwise an empty file is created.
# NOTE(review): this span appears to be rendered from a diff view, so lines
# from before and after the change sit side by side (two variants of the
# shell body, and the *_rerun_touchfile params used only by the first one).
rule download_nextclade_tsv_from_s3:
"""
If there's a .renew touchfile, do not use the cache
"""
input:
use_nextclade_cache=f"data/{database}/use_nextclade_cache{{reference}}.txt",
params:
dst_source=config["s3_dst"] + "/nextclade{reference}.tsv.zst",
src_source=config["s3_src"] + "/nextclade{reference}.tsv.zst",
dst_rerun_touchfile=config["s3_dst"] + "/nextclade{reference}.tsv.zst.renew",
# NOTE(review): src_rerun_touchfile is built from config["s3_dst"], not
# config["s3_src"] -- looks like a copy-paste slip; confirm.
src_rerun_touchfile=config["s3_dst"] + "/nextclade{reference}.tsv.zst.renew",
# Line count passed to download-from-s3; defaults to 0 when no subsample is configured.
lines=config.get("subsample", {}).get("nextclade", 0),
output:
nextclade=f"data/{database}/nextclade{{reference}}_old.tsv",
benchmark:
f"benchmarks/download_nextclade_tsv_from_s3_{database}{{reference}}.txt"
shell:
"""
./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.nextclade} 0 || \
./vendored/download-from-s3 {params.src_rerun_touchfile} {output.nextclade} 0 || \
./vendored/download-from-s3 {params.dst_source} {output.nextclade} {params.lines} || \
./vendored/download-from-s3 {params.src_source} {output.nextclade} {params.lines} || \
touch {output.nextclade}
use_nextclade_cache=$(cat {input.use_nextclade_cache})
if [[ "$use_nextclade_cache" == 'true' ]]; then
echo "[INFO] Downloading cached nextclade{wildcards.reference}.tsv.zst"
./vendored/download-from-s3 {params.dst_source} {output.nextclade} {params.lines} || \
./vendored/download-from-s3 {params.src_source} {output.nextclade} {params.lines}
else
echo "[INFO] Ignoring cached nextclade{wildcards.reference}.tsv.zst"
touch {output.nextclade}
fi
"""

# Produce the previous alignment FASTA for a sequence type: when the flag file
# reads 'true' the cached file is downloaded (destination first, then source);
# otherwise an empty file is created.
# NOTE(review): this span appears to be rendered from a diff view, so lines
# from before and after the change sit side by side (two variants of the
# shell body, and the *_rerun_touchfile params used only by the first one).
rule download_previous_alignment_from_s3:
## NOTE two potential bugs with this implementation:
## (1) race condition. This file may be updated on the remote after download_nextclade has run but before this rule
## (2) we may get `download_nextclade` and `download_previous_alignment` from different s3 buckets
input:
# Flag file path has no reference suffix here, unlike the per-reference TSV rule.
use_nextclade_cache=f"data/{database}/use_nextclade_cache.txt",
params:
dst_source=config["s3_dst"] + "/{seqtype}.fasta.zst",
src_source=config["s3_src"] + "/{seqtype}.fasta.zst",
dst_rerun_touchfile=config["s3_dst"] + "/nextclade.tsv.zst.renew",
# NOTE(review): src_rerun_touchfile is built from config["s3_dst"], not
# config["s3_src"] -- looks like a copy-paste slip; confirm.
src_rerun_touchfile=config["s3_dst"] + "/nextclade.tsv.zst.renew",
# Line count passed to download-from-s3; defaults to 0 when no subsample is configured.
lines=config.get("subsample", {}).get("nextclade", 0),
output:
alignment=temp(f"data/{database}/nextclade.{{seqtype}}.old.fasta"),
benchmark:
f"benchmarks/download_previous_alignment_from_s3_{database}{{seqtype}}.txt"
shell:
"""
./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.alignment} 0 || \
./vendored/download-from-s3 {params.src_rerun_touchfile} {output.alignment} 0 || \
./vendored/download-from-s3 {params.dst_source} {output.alignment} {params.lines} || \
./vendored/download-from-s3 {params.src_source} {output.alignment} {params.lines} || \
touch {output.alignment}
"""
use_nextclade_cache=$(cat {input.use_nextclade_cache})
if [[ "$use_nextclade_cache" == 'true' ]]; then
echo "[INFO] Downloading cached Nextclade {wildcards.seqtype}.fasta.zst"
./vendored/download-from-s3 {params.dst_source} {output.alignment} {params.lines} || \
./vendored/download-from-s3 {params.src_source} {output.alignment} {params.lines}
else
echo "[INFO] Ignoring cached Nextclade {wildcards.seqtype}.fasta.zst"
touch {output.alignment}
fi
"""

rule get_sequences_without_nextclade_annotations:
"""Find sequences in FASTA which don't have clades assigned yet"""
Expand Down Expand Up @@ -135,40 +166,40 @@ rule get_sequences_without_nextclade_annotations:
# Download the Nextclade binary from the project's "latest" GitHub release
# (Darwin or Linux x86_64 build, chosen via `uname`), make it executable, and
# print its version.
# NOTE(review): this span appears to be rendered from a diff view, so each
# changed line is present twice: once with the hard-coded "nextclade" path and
# once with {output.nextclade:q}.
rule download_nextclade_executable:
"""Download Nextclade"""
output:
nextclade="nextclade",
nextclade="data/nextclade",
benchmark:
f"benchmarks/download_nextclade_executable_{database}.txt"
# The URLs point at releases/latest, so the downloaded version is not pinned.
shell:
"""
if [ "$(uname)" = "Darwin" ]; then
curl -fsSL "https://github.com/nextstrain/nextclade/releases/latest/download/nextclade-x86_64-apple-darwin" -o "nextclade"
curl -fsSL "https://github.com/nextstrain/nextclade/releases/latest/download/nextclade-x86_64-apple-darwin" -o {output.nextclade:q}
else
curl -fsSL "https://github.com/nextstrain/nextclade/releases/latest/download/nextclade-x86_64-unknown-linux-gnu" -o "nextclade"
curl -fsSL "https://github.com/nextstrain/nextclade/releases/latest/download/nextclade-x86_64-unknown-linux-gnu" -o {output.nextclade:q}
fi
chmod +x nextclade
chmod +x {output.nextclade:q}
if ! command -v ./nextclade &>/dev/null; then
if ! command -v {output.nextclade:q} &>/dev/null; then
echo "[ERROR] Nextclade executable not found"
exit 1
fi
NEXTCLADE_VERSION="$(./nextclade --version)"
NEXTCLADE_VERSION="$({output.nextclade:q} --version)"
echo "[ INFO] Nextclade version: $NEXTCLADE_VERSION"
"""


# Fetch a named Nextclade dataset as a ZIP archive using the downloaded
# Nextclade executable; the dataset name comes from the output path wildcard.
# NOTE(review): this span appears to be rendered from a diff view, so the
# input and the shell command each appear in two variants ("nextclade" /
# ./nextclade and "data/nextclade" / {input.nextclade:q}).
rule download_nextclade_dataset:
"""Download Nextclade dataset"""
input:
"nextclade",
nextclade="data/nextclade",
output:
dataset="data/nextclade_data/{dataset_name}.zip",
benchmark:
f"benchmarks/download_nextclade_dataset_{database}_{{dataset_name}}.txt"
shell:
"""
./nextclade dataset get --name="{wildcards.dataset_name}" --output-zip={output.dataset} --verbose
{input.nextclade:q} dataset get --name="{wildcards.dataset_name}" --output-zip={output.dataset} --verbose
"""


Expand All @@ -179,7 +210,7 @@ rule run_wuhan_nextclade:
metrics which will ultimately end up in metadata.tsv.
"""
input:
nextclade_path="nextclade",
nextclade_path="data/nextclade",
dataset="data/nextclade_data/sars-cov-2.zip",
sequences=f"data/{database}/nextclade.sequences.fasta",
params:
Expand Down Expand Up @@ -214,7 +245,7 @@ rule run_21L_nextclade:
Like wuhan nextclade, but TSV only, no alignments output
"""
input:
nextclade_path="nextclade",
nextclade_path="data/nextclade",
dataset=lambda w: f"data/nextclade_data/sars-cov-2-21L.zip",
sequences=f"data/{database}/nextclade_21L.sequences.fasta",
output:
Expand All @@ -235,6 +266,7 @@ rule run_21L_nextclade:

rule nextclade_tsv_concat_versions:
input:
nextclade="data/nextclade",
tsv=f"data/{database}/nextclade{{reference}}_new_raw.tsv",
dataset=lambda w: f"data/nextclade_data/sars-cov-2{w.reference.replace('_','-')}.zip",
output:
Expand All @@ -245,7 +277,7 @@ rule nextclade_tsv_concat_versions:
"""
if [ -s {input.tsv} ]; then
# Get version numbers
nextclade_version="$(./nextclade --version)"
nextclade_version="$({input.nextclade:q} --version)"
dataset_version="$(unzip -p {input.dataset} pathogen.json | jq -r '.version.tag')"
timestamp="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
Expand Down

0 comments on commit f9bca07

Please sign in to comment.