From 56af6402976ecc50766b1df425e7a6a9e60cd33f Mon Sep 17 00:00:00 2001
From: ivan-aksamentov
Date: Tue, 28 Jan 2025 11:48:46 +0100
Subject: [PATCH] feat: add automatic updates for sc2 datasets

This new GitHub Action will download nightly Nextclade trees from nextstrain.org and submit a PR to update SC2 datasets every Monday 21:15 UTC. Once reviewed, the updates can be merged and released. We prefer releasing on Tuesday-Wednesday, hence the time.

This update can also be triggered manually by going to [Actions page](https://github.com/nextstrain/nextclade_data/actions), clicking "update-sars-cov-2-datasets" and then "Run workflow" (choose "master" branch).

You can also run the update script locally:

```bash
./scripts/update_sc2_datasets
```

In this case the PR will be opened from your name (or whatever account you are logged into in `git` and `gh`).
---
 .../workflows/update-sars-cov-2-datasets.yml  |  61 ++++++++
 scripts/update_sc2_datasets                   | 154 ++++++++++++++++++++
 2 files changed, 215 insertions(+)
 create mode 100644 .github/workflows/update-sars-cov-2-datasets.yml
 create mode 100755 scripts/update_sc2_datasets

diff --git a/.github/workflows/update-sars-cov-2-datasets.yml b/.github/workflows/update-sars-cov-2-datasets.yml
new file mode 100644
index 00000000..067dac9a
--- /dev/null
+++ b/.github/workflows/update-sars-cov-2-datasets.yml
@@ -0,0 +1,61 @@
+name: update-sars-cov-2-datasets
+
+on:
+  schedule:
+    # At 21:15 on Monday
+    # https://crontab.guru/#15_21_*_*_1
+    - cron: '15 21 * * 1'
+
+  # Enables the "Run workflow" button on the Actions page
+  workflow_dispatch:
+
+  repository_dispatch:
+    types: update-sars-cov-2-datasets
+
+concurrency:
+  group: update-sars-cov-2-datasets-${{ github.workflow }}-${{ github.ref_type }}-${{ github.event.pull_request.number || github.ref || github.run_id }}
+  cancel-in-progress: true
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+
+jobs:
+  update-sars-cov-2-datasets:
+    runs-on: ubuntu-24.04
+
+    environment:
+      name: ${{ github.ref }}
+
+    env:
+      GH_TOKEN: ${{ github.token }}
+
+    steps:
+      - name: "Checkout code"
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # Number of commits to fetch. 0 indicates all history for all branches and tags.
+          fetch-tags: true # Whether to fetch tags, even if fetch-depth > 0.
+          ref: ${{ github.head_ref || github.ref }}
+
+      - name: "Install dependencies"
+        run: |
+          # The update script downloads trees with `curl` and runs the downloads concurrently with GNU `parallel`
+          sudo apt-get update -qq >/dev/null
+          sudo apt-get install --yes -qq curl parallel >/dev/null
+
+          mkdir -p "${HOME}/bin"
+          # Persist the PATH change for the subsequent steps (`export` only affects the current step)
+          echo "${HOME}/bin" >> "${GITHUB_PATH}"
+          export PATH="${HOME}/bin:${PATH}"
+          curl -fsSL "https://github.com/cli/cli/releases/download/v2.42.1/gh_2.42.1_linux_amd64.tar.gz" | tar xz -C "${HOME}/bin" --strip-components=2 gh_2.42.1_linux_amd64/bin/gh
+
+      - name: "Configure git"
+        run: |
+          git config --global user.email "${{ secrets.BOT_GIT_USER_EMAIL }}"
+          git config --global user.name "${{ secrets.BOT_GIT_USER_NAME }}"
+
+      - name: "Update datasets"
+        run: |
+          ./scripts/update_sc2_datasets
diff --git a/scripts/update_sc2_datasets b/scripts/update_sc2_datasets
new file mode 100755
index 00000000..3c91cdb3
--- /dev/null
+++ b/scripts/update_sc2_datasets
@@ -0,0 +1,154 @@
+#!/usr/bin/env bash
+
+# Submit a PR to update Nextclade tree
+#
+# Dependencies:
+#   sudo apt-get install --yes curl parallel
+#   mkdir -p "${HOME}/bin"
+#   export PATH="${HOME}/bin:${PATH}"
+#   curl -fsSL "https://github.com/cli/cli/releases/download/v2.42.1/gh_2.42.1_linux_amd64.tar.gz" | tar xz -C "${HOME}/bin" --strip-components=2 gh_2.42.1_linux_amd64/bin/gh
+#
+# Requires git user name and email to be set. For example:
+#   git config user.email "${{ secrets.GIT_USER_EMAIL }}"
+#   git config user.name "${{ secrets.GIT_USER_NAME }}"
+
+set -euxo pipefail
+trap 'exit' INT
+
+# Directory where this script resides
+THIS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd)"
+
+
+BRANCH="auto-update-sc2-datasets"
+TITLE="Update SARS-CoV-2 datasets [automated]"
+
+# Maps the URL of each tree on nextstrain.org to the dataset (path under `data/`) it updates
+declare -A trees=(
+  ["https://nextstrain.org/staging/nextclade/sars-cov-2"]="nextstrain/sars-cov-2/wuhan-hu-1/orfs"
+  ["https://nextstrain.org/staging/nextclade/sars-cov-2/nsp"]="nextstrain/sars-cov-2/wuhan-hu-1/proteins"
+  ["https://nextstrain.org/staging/nextclade/sars-cov-2/BA.2"]="nextstrain/sars-cov-2/BA.2"
+  ["https://nextstrain.org/staging/nextclade/sars-cov-2/BA.2.86"]="nextstrain/sars-cov-2/BA.2.86"
+  ["https://nextstrain.org/staging/nextclade/sars-cov-2/XBB"]="nextstrain/sars-cov-2/XBB"
+)
+
+# Print the changelog text for an automated update. Takes dataset name as the only argument.
+function write_changelog() {
+  local dataset_name="${1:?}"
+
+  cat <<~~
+## Unreleased
+
+Automated update
+
+~~
+}
+export -f write_changelog
+
+
+# Download the tree from the URL (1st argument) into the dataset (2nd argument) and write the dataset's changelog.
+# Exported along with write_changelog, because it is invoked through `parallel`.
+function update_dataset() {
+  set -euxo pipefail
+
+  local tree_url="${1:?}"
+  local dataset_name="${2:?}"
+
+  curl -fsSL \
+    -o "data/${dataset_name}/tree.json" \
+    -H "Accept: application/vnd.nextstrain.dataset.main+json;q=1, application/json;q=0.9, text/plain;q=0.8, */*;q=0.1" \
+    "${tree_url}"
+
+  # NOTE(review): `>` replaces the entire existing CHANGELOG.md - confirm that earlier entries are not meant to be kept
+  write_changelog "${dataset_name}" > "data/${dataset_name}/CHANGELOG.md"
+}
+export -f update_dataset
+
+# `--link` pairs the n-th URL with the n-th dataset name. Without it GNU parallel runs every combination of the
+# two input sources, i.e. every tree would be downloaded into every dataset.
+parallel --link --jobs "${#trees[@]}" update_dataset ::: "${!trees[@]}" ::: "${trees[@]}"
+
+
+# Setup git identity
+gh auth setup-git >/dev/null
+
+# Delete the branch if already exists. This also closes the PR if exists.
+# Local and remote branches are checked separately: a fresh clone (e.g. on CI) has no local copy of the branch.
+if git show-ref --verify --quiet "refs/heads/${BRANCH}"; then
+  git branch -D "${BRANCH}"
+fi
+if git ls-remote --exit-code origin "refs/heads/${BRANCH}" >/dev/null; then
+  git push origin --delete "${BRANCH}"
+fi
+
+# Create a new branch
+git switch --quiet --create "${BRANCH}"
+
+# Stage commit and push the changes
+git add $(printf "data/%s/tree.json " "${trees[@]}") $(printf "data/%s/CHANGELOG.md " "${trees[@]}")
+git commit --quiet -m "${TITLE}"
+# The new branch has no upstream yet, so the remote and the branch name are given explicitly
+git push --quiet --set-upstream origin "${BRANCH}"
+
+# Function to generate markdown table from the associative array
+function summary_table() {
+  echo "| Tree on Nextstrain.org | Submitted dataset | Nextclade link¹ |"
+  echo "| --- | --- | --- |"
+  for tree_url in "${!trees[@]}"; do
+    local nextstrain_path="${tree_url#https://nextstrain.org/}"
+    local dataset_name="${trees[$tree_url]}"
+    echo "| [${nextstrain_path}](${tree_url}) | [${dataset_name}](https://github.com/nextstrain/nextclade_data/tree/${BRANCH}/data/${trees[$tree_url]}) | [${dataset_name}](https://clades.nextstrain.org/?dataset-server=gh:@${BRANCH}@&dataset-name=${dataset_name}) |"
+  done
+}
+
+# Create PR body
+function pr_body() {
+  cat <<~~
+Automatically updates SARS-CoV-2 datasets.
+
+$(summary_table)
+
+---
+
+¹ - Please make sure to wait for the CI to deploy dataset server before using links in the "Nextclade link" column. Once you see the "chore: rebuild [skip ci]" commit from nextstrain-bot below the links are ready to go. Otherwise they display the previous data (from master branch).
+
+---
+
+Note: this pull request is submitted [automatically](https://github.com/nextstrain/nextclade_data/blob/master/.github/workflows/update-sars-cov-2-datasets.yml).
+
+~~
+}
+
+# Submit the PR, reading message body from stdin
+function submit_pr() {
+  pr_body | gh pr create \
+    --title "${TITLE}" \
+    --body-file - \
+    --repo "nextstrain/nextclade_data"
+}
+
+# Get PR number from the output of gh command
+function get_pr_number() {
+  # The message from gh command typically ends with a URL to the PR and PR number is the last component of the path.
+  # Split the string on '/' and take the last component.
+  echo "${1##*/}"
+}
+
+# Close PR given its number.
+# The branch is kept: it holds the freshly pushed commit and the PR is resubmitted from it right after closing.
+function close_pr() {
+  gh pr close "${1:?}" --repo "nextstrain/nextclade_data"
+}
+
+if ! result=$(submit_pr 2>&1); then
+  echo "${result}" >&2
+  if [[ "${result}" == *"already exists"* ]]; then
+    # Failed because PR already exists. Close and resubmit.
+    pr_number=$(get_pr_number "${result}")
+    close_pr "${pr_number}"
+    submit_pr
+  else
+    exit 1
+  fi
+else
+  echo "${result}"
+fi