Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add automatic updates for sc2 datasets #255

Merged
merged 1 commit into from
Jan 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions .github/workflows/update-sars-cov-2-datasets.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Runs scripts/update_sc2_datasets on a weekly schedule, or on demand through a
# `repository_dispatch` event of type `update-sars-cov-2-datasets`.
name: update-sars-cov-2-datasets

on:
  schedule:
    # At 21:15 on Monday
    # https://crontab.guru/#15_21_*_*_1
    - cron: '15 21 * * 1'

  repository_dispatch:
    types: update-sars-cov-2-datasets

concurrency:
  group: update-sars-cov-2-datasets-${{ github.workflow }}-${{ github.ref_type }}-${{ github.event.pull_request.number || github.ref || github.run_id }}
  cancel-in-progress: true

defaults:
  run:
    shell: bash -euxo pipefail {0}


jobs:
  update-sars-cov-2-datasets:
    runs-on: ubuntu-24.04

    environment:
      name: ${{ github.ref }}

    env:
      # Exposed to every step of this job.
      GH_TOKEN: ${{ github.token }}

    steps:
      - name: "Checkout code"
        uses: actions/checkout@v4
        with:
          fetch-depth: 0 # Number of commits to fetch. 0 indicates all history for all branches and tags.
          fetch-tags: true # Whether to fetch tags, even if fetch-depth > 0.
          ref: ${{ github.head_ref || github.ref }}

      - name: "Install dependencies"
        run: |
          # `parallel` is invoked by scripts/update_sc2_datasets.
          sudo apt-get update -qq >/dev/null
          sudo apt-get install --yes -qq curl parallel >/dev/null

          mkdir -p "${HOME}/bin"
          export PATH="${HOME}/bin:${PATH}"
          curl -fsSL "https://github.com/cli/cli/releases/download/v2.42.1/gh_2.42.1_linux_amd64.tar.gz" | tar xz -C "${HOME}/bin" --strip-components=2 gh_2.42.1_linux_amd64/bin/gh

          # `export PATH` only affects this step; GITHUB_PATH makes the pinned
          # gh visible to the following steps as well.
          echo "${HOME}/bin" >> "${GITHUB_PATH}"

      - name: "Configure git"
        run: |
          git config --global user.email "${{ secrets.BOT_GIT_USER_EMAIL }}"
          git config --global user.name "${{ secrets.BOT_GIT_USER_NAME }}"

      - name: "Update datasets"
        run: |
          ./scripts/update_sc2_datasets
140 changes: 140 additions & 0 deletions scripts/update_sc2_datasets
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
#!/usr/bin/env bash

# Submit a PR to update Nextclade tree
#
# Dependencies:
# mkdir -p "${HOME}/bin"
# export PATH="${HOME}/bin:${PATH}"
# curl -fsSL "https://github.com/cli/cli/releases/download/v2.42.1/gh_2.42.1_linux_amd64.tar.gz" | tar xz -C "${HOME}/bin" --strip-components=2 gh_2.42.1_linux_amd64/bin/gh
#
# Requires git user name and email to be set. For example:
# git config user.email "${{ secrets.GIT_USER_EMAIL }}"
# git config user.name "${{ secrets.GIT_USER_NAME }}"

# Strict mode: stop on errors, on unset variables and on failed pipeline
# stages; trace every command.
set -o errexit -o nounset -o xtrace -o pipefail
# Leave the script as soon as the user interrupts it.
trap 'exit' INT

# Directory where this script resides
THIS_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")"
  pwd
)"


# Branch that carries the update, and the title shared by the commit and the PR
BRANCH='auto-update-sc2-datasets'
TITLE='Update SARS-CoV-2 datasets [automated]'

# Maps the URL of each tree on nextstrain.org to the dataset path (relative to
# `data/`) that receives it.
declare -A trees=()
trees["https://nextstrain.org/staging/nextclade/sars-cov-2"]="nextstrain/sars-cov-2/wuhan-hu-1/orfs"
trees["https://nextstrain.org/staging/nextclade/sars-cov-2/nsp"]="nextstrain/sars-cov-2/wuhan-hu-1/proteins"
trees["https://nextstrain.org/staging/nextclade/sars-cov-2/BA.2"]="nextstrain/sars-cov-2/BA.2"
trees["https://nextstrain.org/staging/nextclade/sars-cov-2/BA.2.86"]="nextstrain/sars-cov-2/BA.2.86"
trees["https://nextstrain.org/staging/nextclade/sars-cov-2/XBB"]="nextstrain/sars-cov-2/XBB"

# Print the changelog text for an automated update to stdout.
# Arguments:
#   $1 - dataset name. Required (the call aborts when it is missing or empty),
#        but it does not appear in the generated text.
# Exported so that child bash processes can call it.
write_changelog() {
  local dataset_name="${1:?}"

  printf '%s\n' \
    '## Unreleased' \
    '' \
    'Automated update' \
    ''
}
export -f write_changelog


# Refresh one dataset: download its tree and rewrite its changelog.
# Arguments:
#   $1 - URL to download the tree from
#   $2 - dataset path relative to `data/`
# Writes data/<dataset>/tree.json and data/<dataset>/CHANGELOG.md; the `>`
# redirection replaces whatever CHANGELOG.md contained before.
# Exported so that child bash processes can call it; strict mode is re-enabled
# inside, presumably because it then runs in a fresh shell.
update_dataset() {
  set -euxo pipefail

  local tree_url="${1:?}"
  local dataset_name="${2:?}"
  local dataset_dir="data/${dataset_name}"
  local accept_types="application/vnd.nextstrain.dataset.main+json;q=1, application/json;q=0.9, text/plain;q=0.8, */*;q=0.1"

  curl -fsSL \
    -o "${dataset_dir}/tree.json" \
    -H "Accept: ${accept_types}" \
    "${tree_url}"

  write_changelog "${dataset_name}" > "${dataset_dir}/CHANGELOG.md"
}
export -f update_dataset
# Flatten the URL -> dataset map into two arrays whose elements are paired by
# position, so the pairing does not depend on associative-array ordering.
tree_urls=()
dataset_names=()
for tree_url in "${!trees[@]}"; do
  tree_urls+=("${tree_url}")
  dataset_names+=("${trees[${tree_url}]}")
done

# Refresh all datasets concurrently, one job per dataset. `:::+` links the
# second input source to the first, so each URL is combined only with its own
# dataset; a second plain `:::` would run every URL against every dataset.
parallel --jobs "${#tree_urls[@]}" update_dataset ::: "${tree_urls[@]}" :::+ "${dataset_names[@]}"


# Let git authenticate through the GitHub CLI
gh auth setup-git >/dev/null

# Delete stale copies of the branch. Deleting the remote branch also closes the
# PR if exists. Local and remote copies are checked separately: either one can
# exist without the other (e.g. a fresh CI checkout has no local branch).
if git show-ref --verify --quiet "refs/heads/${BRANCH}"; then
  git branch -D "${BRANCH}"
fi
if git ls-remote --exit-code origin "refs/heads/${BRANCH}" >/dev/null; then
  git push origin --delete "${BRANCH}"
fi

# Create a new branch
git switch --quiet --create "${BRANCH}"

# Stage the refreshed files of every dataset
changed_files=()
for dataset_name in "${dataset_names[@]}"; do
  changed_files+=(
    "data/${dataset_name}/tree.json"
    "data/${dataset_name}/CHANGELOG.md"
  )
done
git add -- "${changed_files[@]}"

# Nothing staged means the datasets are already up to date: stop here instead
# of letting `git commit` fail the run.
if git diff --cached --quiet; then
  echo "No dataset changes to submit." >&2
  exit 0
fi

# Commit and push. The branch is new, so it has no upstream yet and a bare
# `git push` would be refused; name the remote and branch explicitly.
git commit --quiet -m "${TITLE}"
git push --quiet --set-upstream origin "${BRANCH}"

# Print a Markdown table with one row per entry of the `trees` map.
# Globals:
#   trees  (read) - tree URL -> dataset path
#   BRANCH (read) - branch name used in the GitHub and Nextclade links
# Outputs: the table header followed by the rows, on stdout.
summary_table() {
  printf '%s\n' \
    "| Tree on Nextstrain.org | Submitted dataset | Nextclade link¹ |" \
    "| --- | --- | --- |"

  for tree_url in "${!trees[@]}"; do
    local path_on_nextstrain="${tree_url#https://nextstrain.org/}"
    local dataset="${trees[${tree_url}]}"
    local github_link="https://github.com/nextstrain/nextclade_data/tree/${BRANCH}/data/${dataset}"
    local nextclade_link="https://clades.nextstrain.org/?dataset-server=gh:@${BRANCH}@&dataset-name=${dataset}"

    printf '%s\n' "| [${path_on_nextstrain}](${tree_url}) | [${dataset}](${github_link}) | [${dataset}](${nextclade_link}) |"
  done
}

# Print the pull request description (Markdown) to stdout.
# The text embeds the output of `summary_table` and ends with a link to the
# workflow that submits the pull request. The here-document is unquoted so
# that `$(summary_table)` is expanded.
function pr_body() {
  cat <<~~
Automatically updates SARS-CoV-2 datasets.

$(summary_table)

---

¹ - Please make sure to wait for the CI to deploy dataset server before using links in the "Nextclade link" column. Once you see the "chore: rebuild [skip ci]" commit from nextstrain-bot below the links are ready to go. Otherwise they display the previous data (from master branch).

---

Note: this pull request is submitted [automatically](https://github.com/nextstrain/nextclade_data/blob/master/.github/workflows/update-sars-cov-2-datasets.yml).

~~
}

# Open the pull request on nextstrain/nextclade_data.
# Globals: TITLE (read) - pull request title
# The description is produced by `pr_body` and handed to gh on stdin
# (`--body-file -`).
submit_pr() {
  local -a create_args=(
    --title "${TITLE}"
    --body-file -
    --repo "nextstrain/nextclade_data"
  )

  pr_body | gh pr create "${create_args[@]}"
}

# Extract the PR number from the output of a gh command.
# Arguments:
#   $1 - text that is expected to end with the PR URL
# Outputs: everything after the last '/' of $1, i.e. the final path component
#          of that URL.
get_pr_number() {
  local gh_output="$1"
  local last_component="${gh_output##*/}"
  echo "${last_component}"
}

# Close a pull request on nextstrain/nextclade_data, asking gh to delete its
# branch as well (`--delete-branch`).
# Arguments:
#   $1 - PR number; required, the call aborts when it is missing or empty
close_pr() {
  local pr_number="${1:?}"
  gh pr close "${pr_number}" --delete-branch --repo "nextstrain/nextclade_data"
}

# Submit the PR, capturing stdout and stderr of the attempt. On success the
# captured output is printed. On failure it is echoed to stderr; an
# "already exists" failure triggers one close-and-resubmit cycle, any other
# failure ends the script with status 1.
if result=$(submit_pr 2>&1); then
  echo "${result}"
else
  echo "${result}" >&2
  case "${result}" in
    *"already exists"*)
      # Failed because PR already exists. Close and resubmit.
      # NOTE(review): close_pr passes --delete-branch; confirm that the branch
      # pushed earlier is still available for the second submit_pr.
      pr_number=$(get_pr_number "${result}")
      close_pr "${pr_number}"
      submit_pr
      ;;
    *)
      exit 1
      ;;
  esac
fi
Loading