Merge pull request #466 from nextstrain/ignore-cache

Ignore cache if Nextclade or dataset version is different
nextstrain · Jul 29, 2024 · f9bca07 · f9bca07
2 parents 48e2cec + 9a2ca57
commit f9bca07
Show file tree

Hide file tree

Showing 3 changed files with 137 additions and 26 deletions.
diff --git a/bin/fetch-cache-version b/bin/fetch-cache-version
@@ -0,0 +1,20 @@
+#!/bin/bash
+#
+# Print the Nextclade version and dataset version recorded in a cached,
+# zstd-compressed Nextclade TSV stored on S3, as one compact JSON object with
+# the keys "nextclade_version" and "nextclade_dataset_version".
+#
+# Usage: fetch-cache-version <s3-url>
+#
+# Everything written to stderr by the pipeline below is discarded.
+
+# this script intentionally doesn't `set -euo pipefail`
+# because otherwise the `head -n 2` step triggers SIGPIPE
+# causing the script to exit before it is done.
+#
+# NOTE(review): without `pipefail` the exit status of this script is the exit
+# status of the final `jq`, not of `aws`/`zstd`. A failed download therefore
+# presumably shows up as empty output rather than a non-zero exit -- confirm
+# before relying on the exit code in callers.
+
+s3_url="${1:?An S3 URL is required as the first argument}"
+
+
+# Ignore SIGPIPE in this shell (see the note above about `head -n 2`).
+trap '' SIGPIPE
+
+# Pipeline, step by step:
+#   1. stream the S3 object to stdout
+#   2. decompress it
+#   3. keep only the first two lines (with `-H` below, the first is the header)
+#   4. select the two version columns by header name
+#   5. keep only the last remaining line, i.e. drop the header
+#   6. split that line on tabs and emit it as a compact JSON object
+(aws s3 cp "$s3_url" - \
+    | zstd -T0 -dcq \
+    | head -n 2 \
+    | tsv-select -H -f 'nextclade_version,dataset_version' \
+    | tail -n 1 \
+    | jq --raw-input -c '
+        split("\t")
+        | { "nextclade_version": .[0], "nextclade_dataset_version": .[1] }') \
+     2> /dev/null
diff --git a/bin/use-nextclade-cache b/bin/use-nextclade-cache
@@ -0,0 +1,59 @@
+#!/bin/bash
+#
+# Decide whether the cached Nextclade results on S3 may be reused.
+#
+# Prints "true" on stdout when the cache can be used and "false" otherwise;
+# the reason for a "false" is logged on stderr.
+#
+# Usage:
+#   use-nextclade-cache <s3_dst> <s3_src> <nextclade> <nextclade_dataset.zip> [reference]
+set -euo pipefail
+
+# Directory containing this script, used to locate sibling scripts.
+bin="$(dirname "$0")"
+# Directory holding the vendored helper scripts (e.g. s3-object-exists).
+vendored="$bin"/../vendored
+
+# Log the given reason on stderr, print "false" on stdout and end the script
+# successfully, signalling that the cache must not be used.
+decline-cache() {
+    echo "$1" >&2
+    echo "false"
+    exit 0
+}
+
+# Entry point: prints "true" when the cached Nextclade results may be reused,
+# otherwise prints "false" (with the reason on stderr) and exits 0.
+#
+# Arguments:
+#   $1  destination s3:// URL
+#   $2  source (fallback) s3:// URL
+#   $3  path to the Nextclade executable
+#   $4  path to the Nextclade dataset ZIP archive
+#   $5  optional Nextclade dataset reference wildcard (defaults to empty)
+main() {
+    # These are intentionally not `local`: the helper functions of this script
+    # read s3_dst, s3_src and reference.
+    s3_dst="${1:?A destination s3:// URL where the renew file is hosted is required as the first argument.}"
+    s3_src="${2:?A source s3:// URL where the fallback renew file is hosted is required as the second argument.}"
+    nextclade="${3:?A path to the Nextclade executable is required as the third argument}"
+    nextclade_dataset="${4:?A path to a Nextclade dataset ZIP archive is required as the fourth argument}"
+    # Nextclade dataset reference wildcard
+    reference="${5:-}"
+
+    # A renew flag always wins: the cache is ignored regardless of versions.
+    if renew-flag-exists; then
+        decline-cache "[INFO] Found renew flag"
+    fi
+
+    cache_versions="$(get-cache-version-info)"
+
+    # 1. The Nextclade executable must match the one that produced the cache.
+    cache_nextclade_version="$(jq -r .nextclade_version <<<"$cache_versions")"
+    current_nextclade_version="$("$nextclade" --version)"
+    if [[ "$cache_nextclade_version" != "$current_nextclade_version" ]]; then
+        decline-cache "[INFO] Current Nextclade version ($current_nextclade_version) is different from cache version ($cache_nextclade_version)"
+    fi
+
+    # 2. The dataset tag (read from pathogen.json inside the ZIP) must match too.
+    cache_dataset_version="$(jq -r .nextclade_dataset_version <<<"$cache_versions")"
+    current_dataset_version="$(unzip -p "$nextclade_dataset" pathogen.json | jq -r '.version.tag')"
+    if [[ "$cache_dataset_version" != "$current_dataset_version" ]]; then
+        decline-cache "[INFO] Current Nextclade dataset version ($current_dataset_version) is different from cache version ($cache_dataset_version)"
+    fi
+
+    echo "true"
+}
+
+# Succeeds when the ".renew" touchfile for the current reference exists at
+# either the destination or the source S3 location.
+renew-flag-exists() {
+    local flag_name="nextclade$reference.tsv.zst.renew"
+
+    # Destination first; only consult the source when it is absent there.
+    if "$vendored"/s3-object-exists "$s3_dst/$flag_name"; then
+        return 0
+    fi
+
+    # The function's status is whatever the source lookup reports.
+    "$vendored"/s3-object-exists "$s3_src/$flag_name"
+}
+
+# Print the version info (a JSON object with "nextclade_version" and
+# "nextclade_dataset_version") of the cached Nextclade TSV for the current
+# reference, trying the destination S3 location first and the source S3
+# location as the fallback.
+#
+# Prints nothing when neither location yields version info. The exit status
+# is 0 when the destination yielded output, otherwise it is the status of
+# the fallback lookup.
+get-cache-version-info() {
+    # TODO: Update to check a separate file for version info
+    # Currently just checks the first row of the nextclade.tsv file
+    local version_file="nextclade$reference.tsv.zst"
+    local dst_version_file="$s3_dst/$version_file"
+    local src_version_file="$s3_src/$version_file"
+
+    # Declared separately from the assignment so `local` does not mask the
+    # exit status of the command substitution.
+    local versions
+    versions="$("$bin"/fetch-cache-version "$dst_version_file")" || versions=""
+
+    # fetch-cache-version runs without `pipefail`, so a missing object can
+    # surface as empty output with a zero exit status. Treat empty output the
+    # same as a failure, otherwise the fallback below is never reached.
+    if [[ -n "$versions" ]]; then
+        echo "$versions"
+        return 0
+    fi
+
+    "$bin"/fetch-cache-version "$src_version_file"
+}
+
+main "$@"
diff --git a/workflow/snakemake_rules/nextclade.smk b/workflow/snakemake_rules/nextclade.smk
@@ -62,52 +62,83 @@ if config.get("s3_dst") and config.get("s3_src"):
     ruleorder: download_nextclade_tsv_from_s3 > create_empty_nextclade_info
     ruleorder: download_previous_alignment_from_s3 > create_empty_nextclade_aligned
 
+
+    rule use_nextclade_cache:
+        """
+        Record whether the cached Nextclade outputs on S3 may be reused.
+
+        Runs ./bin/use-nextclade-cache and stores what it prints on stdout
+        ("true" or "false") in a text file that the download rules read.
+        """
+        input:
+            nextclade="data/nextclade",
+            nextclade_dataset=lambda w: f"data/nextclade_data/sars-cov-2{w.reference.replace('_','-')}.zip",
+        params:
+            dst_source=config["s3_dst"],
+            src_source=config["s3_src"],
+        output:
+            use_nextclade_cache=f"data/{database}/use_nextclade_cache{{reference}}.txt",
+        shell:
+            """
+            ./bin/use-nextclade-cache \
+                {params.dst_source:q} \
+                {params.src_source:q} \
+                {input.nextclade:q} \
+                {input.nextclade_dataset:q} \
+                {wildcards.reference:q} \
+                > {output.use_nextclade_cache}
+            """
+
+
     rule download_nextclade_tsv_from_s3:
         """
         If there's a .renew touchfile, do not use the cache
         """
+        input:
+            use_nextclade_cache=f"data/{database}/use_nextclade_cache{{reference}}.txt",
         params:
             dst_source=config["s3_dst"] + "/nextclade{reference}.tsv.zst",
             src_source=config["s3_src"] + "/nextclade{reference}.tsv.zst",
-            dst_rerun_touchfile=config["s3_dst"] + "/nextclade{reference}.tsv.zst.renew",
-            src_rerun_touchfile=config["s3_dst"] + "/nextclade{reference}.tsv.zst.renew",
             lines=config.get("subsample", {}).get("nextclade", 0),
         output:
             nextclade=f"data/{database}/nextclade{{reference}}_old.tsv",
         benchmark:
             f"benchmarks/download_nextclade_tsv_from_s3_{database}{{reference}}.txt"
         shell:
             """
-            ./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.nextclade} 0 ||  \
-            ./vendored/download-from-s3 {params.src_rerun_touchfile} {output.nextclade} 0 ||  \
-            ./vendored/download-from-s3 {params.dst_source} {output.nextclade} {params.lines} ||  \
-            ./vendored/download-from-s3 {params.src_source} {output.nextclade} {params.lines} ||  \
-            touch {output.nextclade}
+            use_nextclade_cache=$(cat {input.use_nextclade_cache})
+
+            if [[ "$use_nextclade_cache" == 'true' ]]; then
+                echo "[INFO] Downloading cached nextclade{wildcards.reference}.tsv.zst"
+                ./vendored/download-from-s3 {params.dst_source} {output.nextclade} {params.lines} ||  \
+                ./vendored/download-from-s3 {params.src_source} {output.nextclade} {params.lines}
+            else
+                echo "[INFO] Ignoring cached nextclade{wildcards.reference}.tsv.zst"
+                touch {output.nextclade}
+            fi
             """
 
     rule download_previous_alignment_from_s3:
         ## NOTE two potential bugs with this implementation:
         ## (1) race condition. This file may be updated on the remote after download_nextclade has run but before this rule
         ## (2) we may get `download_nextclade` and `download_previous_alignment` from different s3 buckets
+        input:
+            use_nextclade_cache=f"data/{database}/use_nextclade_cache.txt",
         params:
             dst_source=config["s3_dst"] + "/{seqtype}.fasta.zst",
             src_source=config["s3_src"] + "/{seqtype}.fasta.zst",
-            dst_rerun_touchfile=config["s3_dst"] + "/nextclade.tsv.zst.renew",
-            src_rerun_touchfile=config["s3_dst"] + "/nextclade.tsv.zst.renew",
             lines=config.get("subsample", {}).get("nextclade", 0),
         output:
             alignment=temp(f"data/{database}/nextclade.{{seqtype}}.old.fasta"),
         benchmark:
             f"benchmarks/download_previous_alignment_from_s3_{database}{{seqtype}}.txt"
         shell:
             """
-            ./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.alignment} 0 ||  \
-            ./vendored/download-from-s3 {params.src_rerun_touchfile} {output.alignment} 0 ||  \
-            ./vendored/download-from-s3 {params.dst_source} {output.alignment} {params.lines} ||  \
-            ./vendored/download-from-s3 {params.src_source} {output.alignment} {params.lines} ||  \
-            touch {output.alignment}
-            """
+            use_nextclade_cache=$(cat {input.use_nextclade_cache})
 
+            if [[ "$use_nextclade_cache" == 'true' ]]; then
+                echo "[INFO] Downloading cached Nextclade {wildcards.seqtype}.fasta.zst"
+                ./vendored/download-from-s3 {params.dst_source} {output.alignment} {params.lines} ||  \
+                ./vendored/download-from-s3 {params.src_source} {output.alignment} {params.lines}
+            else
+                echo "[INFO] Ignoring cached Nextclade {wildcards.seqtype}.fasta.zst"
+                touch {output.alignment}
+            fi
+            """
 
 rule get_sequences_without_nextclade_annotations:
     """Find sequences in FASTA which don't have clades assigned yet"""
@@ -135,40 +166,40 @@ rule get_sequences_without_nextclade_annotations:
 rule download_nextclade_executable:
     """Download Nextclade"""
     output:
-        nextclade="nextclade",
+        nextclade="data/nextclade",
     benchmark:
         f"benchmarks/download_nextclade_executable_{database}.txt"
     shell:
         """
         if [ "$(uname)" = "Darwin" ]; then
-            curl -fsSL "https://github.com/nextstrain/nextclade/releases/latest/download/nextclade-x86_64-apple-darwin" -o "nextclade"
+            curl -fsSL "https://github.com/nextstrain/nextclade/releases/latest/download/nextclade-x86_64-apple-darwin" -o {output.nextclade:q}
 
         else
-            curl -fsSL "https://github.com/nextstrain/nextclade/releases/latest/download/nextclade-x86_64-unknown-linux-gnu" -o "nextclade"
+            curl -fsSL "https://github.com/nextstrain/nextclade/releases/latest/download/nextclade-x86_64-unknown-linux-gnu" -o {output.nextclade:q}
         fi
-        chmod +x nextclade
+        chmod +x {output.nextclade:q}
 
-        if ! command -v ./nextclade &>/dev/null; then
+        if ! command -v {output.nextclade:q} &>/dev/null; then
             echo "[ERROR] Nextclade executable not found"
             exit 1
         fi
 
-        NEXTCLADE_VERSION="$(./nextclade --version)"
+        NEXTCLADE_VERSION="$({output.nextclade:q} --version)"
         echo "[ INFO] Nextclade version: $NEXTCLADE_VERSION"
         """
 
 
 rule download_nextclade_dataset:
     """Download Nextclade dataset"""
     input:
-        "nextclade",
+        nextclade="data/nextclade",
     output:
         dataset="data/nextclade_data/{dataset_name}.zip",
     benchmark:
         f"benchmarks/download_nextclade_dataset_{database}_{{dataset_name}}.txt"
     shell:
         """
-        ./nextclade dataset get --name="{wildcards.dataset_name}" --output-zip={output.dataset} --verbose
+        {input.nextclade:q} dataset get --name="{wildcards.dataset_name}" --output-zip={output.dataset} --verbose
         """
 
 
@@ -179,7 +210,7 @@ rule run_wuhan_nextclade:
     metrics which will ultimately end up in metadata.tsv.
     """
     input:
-        nextclade_path="nextclade",
+        nextclade_path="data/nextclade",
         dataset="data/nextclade_data/sars-cov-2.zip",
         sequences=f"data/{database}/nextclade.sequences.fasta",
     params:
@@ -214,7 +245,7 @@ rule run_21L_nextclade:
     Like wuhan nextclade, but TSV only, no alignments output
     """
     input:
-        nextclade_path="nextclade",
+        nextclade_path="data/nextclade",
         dataset=lambda w: f"data/nextclade_data/sars-cov-2-21L.zip",
         sequences=f"data/{database}/nextclade_21L.sequences.fasta",
     output:
@@ -235,6 +266,7 @@ rule run_21L_nextclade:
 
 rule nextclade_tsv_concat_versions:
     input:
+        nextclade="data/nextclade",
         tsv=f"data/{database}/nextclade{{reference}}_new_raw.tsv",
         dataset=lambda w: f"data/nextclade_data/sars-cov-2{w.reference.replace('_','-')}.zip",
     output:
@@ -245,7 +277,7 @@ rule nextclade_tsv_concat_versions:
         """
         if [ -s {input.tsv} ]; then
             # Get version numbers
-            nextclade_version="$(./nextclade --version)"
+            nextclade_version="$({input.nextclade:q} --version)"
             dataset_version="$(unzip -p {input.dataset} pathogen.json | jq -r '.version.tag')"
             timestamp="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"