From 6b7959e18acf4ad1e9b0ad6a2f4dc0fbb9e90173 Mon Sep 17 00:00:00 2001 From: Dennis Ameling Date: Wed, 22 Nov 2023 10:23:46 +0100 Subject: [PATCH 01/11] Self hosted runners: make VM image configurable through environment variable The image SKU was hidden a bit in the azure-arm-template.json. Let's make it more visible by turning it into a parameter that the CI pipeline can provide. Signed-off-by: Dennis Ameling --- .github/workflows/create-azure-self-hosted-runners.yml | 2 ++ azure-self-hosted-runners/azure-arm-template.json | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/create-azure-self-hosted-runners.yml b/.github/workflows/create-azure-self-hosted-runners.yml index b9316d2a..dd6d9e19 100644 --- a/.github/workflows/create-azure-self-hosted-runners.yml +++ b/.github/workflows/create-azure-self-hosted-runners.yml @@ -39,6 +39,7 @@ env: # For more information, see https://learn.microsoft.com/en-us/azure/virtual-machines/dplsv5-dpldsv5-series (which # unfortunately does not have more information about price by region) AZURE_VM_REGION: westus2 + AZURE_VM_IMAGE: win11-22h2-ent # The following secrets are required for this workflow to run: # AZURE_CREDENTIALS - Credentials for the Azure CLI. 
It's recommended to set up a resource @@ -130,6 +131,7 @@ jobs: githubActionsRunnerRegistrationUrl="$ACTIONS_RUNNER_REGISTRATION_URL" githubActionsRunnerToken="$ACTIONS_RUNNER_TOKEN" postDeploymentPsScriptUrl="$POST_DEPLOYMENT_SCRIPT_URL" + virtualMachineImage="$AZURE_VM_IMAGE" virtualMachineName="${{ steps.generate-vm-name.outputs.vm_name }}" virtualMachineSize="$AZURE_VM_TYPE" publicIpAddressName1="${{ steps.generate-vm-name.outputs.vm_name }}-ip" diff --git a/azure-self-hosted-runners/azure-arm-template.json b/azure-self-hosted-runners/azure-arm-template.json index 42a81bad..1bf6b140 100644 --- a/azure-self-hosted-runners/azure-arm-template.json +++ b/azure-self-hosted-runners/azure-arm-template.json @@ -81,6 +81,9 @@ "osDiskDeleteOption": { "type": "string" }, + "virtualMachineImage": { + "type": "string" + }, "virtualMachineSize": { "type": "string" }, @@ -209,7 +212,7 @@ "imageReference": { "publisher": "microsoftwindowsdesktop", "offer": "windows11preview-arm64", - "sku": "win11-22h2-ent", + "sku": "[parameters('virtualMachineImage')]", "version": "latest" } }, From 59003a98f3fd0ac7c2c6ef95d35843609610f3fe Mon Sep 17 00:00:00 2001 From: Dennis Ameling Date: Wed, 22 Nov 2023 10:28:32 +0100 Subject: [PATCH 02/11] Self hosted runners: update image from 22h2 to 23h2 Version 23h2 was released on October 31st, 2023. Let's update to it, so that the runners benefit from the latest improvements to Windows 11 ARM64. 
Signed-off-by: Dennis Ameling --- .github/workflows/create-azure-self-hosted-runners.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/create-azure-self-hosted-runners.yml b/.github/workflows/create-azure-self-hosted-runners.yml index dd6d9e19..e60ddf4b 100644 --- a/.github/workflows/create-azure-self-hosted-runners.yml +++ b/.github/workflows/create-azure-self-hosted-runners.yml @@ -39,7 +39,7 @@ env: # For more information, see https://learn.microsoft.com/en-us/azure/virtual-machines/dplsv5-dpldsv5-series (which # unfortunately does not have more information about price by region) AZURE_VM_REGION: westus2 - AZURE_VM_IMAGE: win11-22h2-ent + AZURE_VM_IMAGE: win11-23h2-ent # The following secrets are required for this workflow to run: # AZURE_CREDENTIALS - Credentials for the Azure CLI. It's recommended to set up a resource From f428d14f1fbeb550d1f56c4413fab7b9cb205ee6 Mon Sep 17 00:00:00 2001 From: Dennis Ameling Date: Wed, 22 Nov 2023 11:26:06 +0100 Subject: [PATCH 03/11] Self hosted runners: add cleanup script Sometimes, VMs don't get deleted properly. Let's set up a script to clean up VMs that were created more than 3 hours ago. This script runs every 6 hours (cron) and we can also invoke it manually if needed. 
Signed-off-by: Dennis Ameling --- .../workflows/cleanup-self-hosted-runners.yml | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 .github/workflows/cleanup-self-hosted-runners.yml diff --git a/.github/workflows/cleanup-self-hosted-runners.yml b/.github/workflows/cleanup-self-hosted-runners.yml new file mode 100644 index 00000000..d365764f --- /dev/null +++ b/.github/workflows/cleanup-self-hosted-runners.yml @@ -0,0 +1,56 @@ +name: Cleanup Azure self hosted runners +run-name: Cleanup Azure self hosted runners + +on: + schedule: + # Run every 6 hours + - cron: "0 */6 * * *" + workflow_dispatch: + +# The following secrets are required for this workflow to run: +# AZURE_CREDENTIALS - Credentials for the Azure CLI. It's recommended to set up a resource +# group specifically for self-hosted Actions Runners. +# az ad sp create-for-rbac --name "{YOUR_DESCRIPTIVE_NAME_HERE}" --role contributor \ +# --scopes /subscriptions/{SUBSCRIPTION_ID_HERE}/resourceGroups/{RESOURCE_GROUP_HERE} \ +# --sdk-auth +# AZURE_RESOURCE_GROUP - Resource group to create the runner(s) in +jobs: + delete-runner: + runs-on: ubuntu-latest + steps: + - name: Azure Login + uses: azure/login@v1 + with: + creds: ${{ secrets.AZURE_CREDENTIALS }} + - name: Discover VMs to delete + uses: azure/CLI@v1 + with: + azcliversion: 2.54.0 + inlineScript: | + active_vms=$(az vm list -g ${{ secrets.AZURE_RESOURCE_GROUP }} | jq -c '.[] | {name,timeCreated}') + current_time=$(date +%s) + three_hours_ago=$(($current_time - 3 * 3600)) + + if [ -z "$active_vms" ]; then + echo "No active VMs found, nothing to do." 
+ exit 0 + else + echo "Found these active VMs:" + echo $active_vms + fi + + for active_vm in ${active_vms[@]}; do + vm_name=$(echo $active_vm | jq -r '.name') + vm_creation_iso_string=$(echo $active_vm | jq -r '.timeCreated') + vm_creation_time=$(date -d $vm_creation_iso_string +%s) + + if [ "$three_hours_ago" -ge "$vm_creation_time" ]; then + echo "The VM ${vm_name} was created more than 3 hours ago and wasn't deleted. Let's do that now." + az vm delete -n "$vm_name" -g ${{ secrets.AZURE_RESOURCE_GROUP }} --yes + az network nsg delete -n "$vm_name"-nsg -g ${{ secrets.AZURE_RESOURCE_GROUP }} + az network vnet delete -n "$vm_name"-vnet -g ${{ secrets.AZURE_RESOURCE_GROUP }} + az network public-ip delete -n "$vm_name"-ip -g ${{ secrets.AZURE_RESOURCE_GROUP }} + else + echo "The VM ${vm_name} was created less then 3 hours ago and shouldn't be deleted yet. Skipping." + fi + done From d718f4948caae9b68848105ba793258817cdf5e0 Mon Sep 17 00:00:00 2001 From: Dennis Ameling Date: Sun, 15 Sep 2024 13:39:29 +0200 Subject: [PATCH 04/11] cleanup-self-hosted-runners: upgrade to v2 of azure/login and azure/CLI Signed-off-by: Dennis Ameling Signed-off-by: Johannes Schindelin --- .github/workflows/cleanup-self-hosted-runners.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cleanup-self-hosted-runners.yml b/.github/workflows/cleanup-self-hosted-runners.yml index d365764f..aab8014a 100644 --- a/.github/workflows/cleanup-self-hosted-runners.yml +++ b/.github/workflows/cleanup-self-hosted-runners.yml @@ -19,11 +19,11 @@ jobs: runs-on: ubuntu-latest steps: - name: Azure Login - uses: azure/login@v1 + uses: azure/login@v2 with: creds: ${{ secrets.AZURE_CREDENTIALS }} - name: Discover VMs to delete - uses: azure/CLI@v1 + uses: azure/CLI@v2 with: azcliversion: 2.54.0 inlineScript: | From 55c824e0fa3a294a7c9546014bfa7a5a77db8bbb Mon Sep 17 00:00:00 2001 From: Dennis Ameling Date: Sun, 15 Sep 2024 16:43:30 +0200 Subject: [PATCH 05/11] 
cleanup-self-hosted-runners: use JQ to extract the creation time This also quotes the string properly. While at it, also use a newer Azure CLI version Signed-off-by: Dennis Ameling Signed-off-by: Johannes Schindelin --- .github/workflows/cleanup-self-hosted-runners.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cleanup-self-hosted-runners.yml b/.github/workflows/cleanup-self-hosted-runners.yml index aab8014a..ecbc3808 100644 --- a/.github/workflows/cleanup-self-hosted-runners.yml +++ b/.github/workflows/cleanup-self-hosted-runners.yml @@ -25,7 +25,8 @@ jobs: - name: Discover VMs to delete uses: azure/CLI@v2 with: - azcliversion: 2.54.0 + # Stick to 2.63.0 until jq is added to 2.64.0+ https://github.com/Azure/azure-cli/issues/29830 + azcliversion: 2.63.0 inlineScript: | active_vms=$(az vm list -g ${{ secrets.AZURE_RESOURCE_GROUP }} | jq -c '.[] | {name,timeCreated}') current_time=$(date +%s) @@ -41,8 +42,10 @@ jobs: for active_vm in ${active_vms[@]}; do vm_name=$(echo $active_vm | jq -r '.name') - vm_creation_iso_string=$(echo $active_vm | jq -r '.timeCreated') - vm_creation_time=$(date -d $vm_creation_iso_string +%s) + # Use jq to extract and format the date-time string + vm_creation_time_string="$(echo $active_vm | + jq -r '.timeCreated | sub("\\.[0-9]+[+-][0-9]+:[0-9]+$"; "") | sub("T"; " ")')" + vm_creation_time=$(TZ=UTC date -d "$vm_creation_time_string" +%s) if [ "$three_hours_ago" -ge "$vm_creation_time" ]; then echo "The VM ${vm_name} was created more than 3 hours ago and wasn't deleted. Let's do that now." From cba83928fa466a056a611feffeb62fc4db00832b Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sat, 14 Sep 2024 11:01:18 +0200 Subject: [PATCH 06/11] cleanup-self-hosted-runner: order the conditional blocks from small to large In general, if..else..fi constructs are more readable when the first block is shorter than the second, i.e. 
to handle the "easy" things first. According to Cognitive Load Theory, this uses fewer mental resources when wrapping our head around the code. In this instance, there is an even better reason: A later commit will add _another_ condition, and having it in this order will avoid adding another nesting level: if too young skip else if busy skip else force delete VM fi Signed-off-by: Johannes Schindelin --- .github/workflows/cleanup-self-hosted-runners.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cleanup-self-hosted-runners.yml b/.github/workflows/cleanup-self-hosted-runners.yml index ecbc3808..fdd943b6 100644 --- a/.github/workflows/cleanup-self-hosted-runners.yml +++ b/.github/workflows/cleanup-self-hosted-runners.yml @@ -47,13 +47,13 @@ jobs: jq -r '.timeCreated | sub("\\.[0-9]+[+-][0-9]+:[0-9]+$"; "") | sub("T"; " ")')" vm_creation_time=$(TZ=UTC date -d "$vm_creation_time_string" +%s) - if [ "$three_hours_ago" -ge "$vm_creation_time" ]; then + if [ "$three_hours_ago" -lt "$vm_creation_time" ]; then + echo "The VM ${vm_name} was created less then 3 hours ago and shouldn't be deleted yet. Skipping." + else echo "The VM ${vm_name} was created more than 3 hours ago and wasn't deleted. Let's do that now." az vm delete -n "$vm_name" -g ${{ secrets.AZURE_RESOURCE_GROUP }} --yes az network nsg delete -n "$vm_name"-nsg -g ${{ secrets.AZURE_RESOURCE_GROUP }} az network vnet delete -n "$vm_name"-vnet -g ${{ secrets.AZURE_RESOURCE_GROUP }} az network public-ip delete -n "$vm_name"-ip -g ${{ secrets.AZURE_RESOURCE_GROUP }} - else - echo "The VM ${vm_name} was created less then 3 hours ago and shouldn't be deleted yet. Skipping." 
fi done From d2e6833ab8e52808290b064bd04f3092b97ba496 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sat, 14 Sep 2024 12:17:57 +0200 Subject: [PATCH 07/11] Add a shell script to authenticate with `gh` as an App In Git for Windows' automation, we do a lot with Javascript, but sometimes it is also convenient to use the GitHub CLI (`gh`) in a shell script. Sadly, the scope of the `GITHUB_TOKEN` provided in GitHub workflows is often too limited, and we'd like to authenticate as a GitHub App instead. Even more sadly, authenticating with `gh` this way is quite complicated and fraught with problems because the token _needs_ to be masked in the GitHub workflow logs. Rejoice! This patch brings a shell script that hides all that nasty complexity. All it needs are the environment variables: - GH_APP_ID - GH_APP_PRIVATE_KEY - GITHUB_REPOSITORY and of course `gh` on the `PATH`. That's it! Signed-off-by: Johannes Schindelin --- gh-cli-auth-as-app.sh | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100755 gh-cli-auth-as-app.sh diff --git a/gh-cli-auth-as-app.sh b/gh-cli-auth-as-app.sh new file mode 100755 index 00000000..5641f982 --- /dev/null +++ b/gh-cli-auth-as-app.sh @@ -0,0 +1,25 @@ +#!/bin/sh + +node -e '(async () => { + const [owner, repo] = process.env.GITHUB_REPOSITORY.split("/") + const getAppInstallationId = require("./get-app-installation-id") + const installationId = await getAppInstallationId( + console, + process.env.GH_APP_ID, + process.env.GH_APP_PRIVATE_KEY, + owner, + repo + ) + const getInstallationAccessToken = require("./get-installation-access-token") + const token = await getInstallationAccessToken( + console, + process.env.GH_APP_ID, + process.env.GH_APP_PRIVATE_KEY, + installationId + ) + process.stderr.write(`::add-mask::${token.token}\n`) + process.stdout.write(token.token) +})().catch(e => { + process.stderr.write(`${(e && e.stack) || e}\n${JSON.stringify(e, null, 2)}\n`) /* Error message and stack are non-enumerable, so JSON.stringify alone prints {} */ + process.exit(1) +})' | gh auth login --with-token From 
f3557271c498fca39721a861b6670bc85827cb75 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sat, 14 Sep 2024 11:12:47 +0200 Subject: [PATCH 08/11] cleanup-self-hosted-runner: avoid deleting VMs that are busy So far, we avoid deleting VMs when they are too young. But we can do better than that: since the VMs are intended to host an ephemeral runner, and that runner is registered with the current repository, we can have a look whether GitHub says that the runner is busy. If it is, well, let's leave it a-running! Since this query requires a larger scope than the standard GitHub workflows' `GITHUB_TOKEN` provides, we need to authenticate as the GitForWindowsHelper GitHub App to do that. This is too complicated, unfortunately, in a GitHub workflow (at least without the risk of using non-official GitHub Actions), therefore we use the newly-added shell script to authenticate the GitHub CLI as a GitHub App (which of course now requires us to check out the repository). Signed-off-by: Johannes Schindelin --- .github/workflows/cleanup-self-hosted-runners.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/cleanup-self-hosted-runners.yml b/.github/workflows/cleanup-self-hosted-runners.yml index fdd943b6..69defc33 100644 --- a/.github/workflows/cleanup-self-hosted-runners.yml +++ b/.github/workflows/cleanup-self-hosted-runners.yml @@ -18,12 +18,16 @@ jobs: delete-runner: runs-on: ubuntu-latest steps: + - uses: actions/checkout@v4 - name: Azure Login uses: azure/login@v2 with: creds: ${{ secrets.AZURE_CREDENTIALS }} - name: Discover VMs to delete uses: azure/CLI@v2 + env: + GH_APP_ID: ${{ secrets.GH_APP_ID }} + GH_APP_PRIVATE_KEY: ${{ secrets.GH_APP_PRIVATE_KEY }} with: # Stick to 2.63.0 until jq is added to 2.64.0+ https://github.com/Azure/azure-cli/issues/29830 azcliversion: 2.63.0 @@ -49,6 +53,13 @@ jobs: if [ "$three_hours_ago" -lt "$vm_creation_time" ]; then echo "The VM ${vm_name} was created less then 3 hours ago and shouldn't be 
deleted yet. Skipping." + elif test true = "$(if test ! -f .cli-authenticated; then + ./gh-cli-auth-as-app.sh && + >.cli-authenticated # only authenticate once + fi && + gh api repos/$GITHUB_REPOSITORY/actions/runners \ + --jq '.runners[] | select(.name == "'$vm_name'") | .busy')"; then + echo "The VM ${vm_name} is still busy." else echo "The VM ${vm_name} was created more than 3 hours ago and wasn't deleted. Let's do that now." az vm delete -n "$vm_name" -g ${{ secrets.AZURE_RESOURCE_GROUP }} --yes From 2d1ebde0d0769e73b654389e31b60d3f0e772c0a Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sat, 14 Sep 2024 11:16:37 +0200 Subject: [PATCH 09/11] cleanup-self-hosted-runner: clean up idle runners earlier The original idea was to wait at least 3 hours before deleting a VM, to avoid interfering with long-running workflows. Now that we have a way to query whether the runner _is_ busy, we do not need to wait that long. Basically, we only need to ensure that the runner had a chance to register itself and to then pick up the job. Typically, this takes around 6-7 minutes, so waiting for an hour is probably even a tad conservative. Signed-off-by: Johannes Schindelin --- .github/workflows/cleanup-self-hosted-runners.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cleanup-self-hosted-runners.yml b/.github/workflows/cleanup-self-hosted-runners.yml index 69defc33..66acd4f4 100644 --- a/.github/workflows/cleanup-self-hosted-runners.yml +++ b/.github/workflows/cleanup-self-hosted-runners.yml @@ -34,7 +34,7 @@ jobs: inlineScript: | active_vms=$(az vm list -g ${{ secrets.AZURE_RESOURCE_GROUP }} | jq -c '.[] | {name,timeCreated}') current_time=$(date +%s) - three_hours_ago=$(($current_time - 3 * 3600)) + one_hour_ago=$(($current_time - 3600)) if [ -z "$active_vms" ]; then echo "No active VMs found, nothing to do." 
@@ -51,8 +51,8 @@ jobs: jq -r '.timeCreated | sub("\\.[0-9]+[+-][0-9]+:[0-9]+$"; "") | sub("T"; " ")')" vm_creation_time=$(TZ=UTC date -d "$vm_creation_time_string" +%s) - if [ "$three_hours_ago" -lt "$vm_creation_time" ]; then - echo "The VM ${vm_name} was created less then 3 hours ago and shouldn't be deleted yet. Skipping." + if [ "$one_hour_ago" -lt "$vm_creation_time" ]; then + echo "The VM ${vm_name} was created less then 1 hour ago and shouldn't be deleted yet. Skipping." elif test true = "$(if test ! -f .cli-authenticated; then ./gh-cli-auth-as-app.sh && >.cli-authenticated # only authenticate once From 3fb438cac6661f3b87dffef113f06c28336decdd Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sat, 14 Sep 2024 11:21:07 +0200 Subject: [PATCH 10/11] cleanup-self-hosted-runner: fix some white-space issue Signed-off-by: Johannes Schindelin --- .github/workflows/cleanup-self-hosted-runners.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cleanup-self-hosted-runners.yml b/.github/workflows/cleanup-self-hosted-runners.yml index 66acd4f4..d3551175 100644 --- a/.github/workflows/cleanup-self-hosted-runners.yml +++ b/.github/workflows/cleanup-self-hosted-runners.yml @@ -50,7 +50,7 @@ jobs: vm_creation_time_string="$(echo $active_vm | jq -r '.timeCreated | sub("\\.[0-9]+[+-][0-9]+:[0-9]+$"; "") | sub("T"; " ")')" vm_creation_time=$(TZ=UTC date -d "$vm_creation_time_string" +%s) - + if [ "$one_hour_ago" -lt "$vm_creation_time" ]; then echo "The VM ${vm_name} was created less then 1 hour ago and shouldn't be deleted yet. Skipping." elif test true = "$(if test ! -f .cli-authenticated; then From e71129b12521457d1e97aa87722ce0880dd3d2d0 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sat, 14 Sep 2024 13:05:37 +0200 Subject: [PATCH 11/11] cleanup-self-hosted-runner: use workflow commands for better mark-up When logging output that we're deleting a VM, it should be marked a bit more prominently. 
Let's mark it as a warning: https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-a-warning-message This marks it in red in the output. Conversely, we want to mention when we skipped the deletion, and to make those messages stick out a bit (but less than warnings), let's use: https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-a-notice-message Signed-off-by: Johannes Schindelin --- .github/workflows/cleanup-self-hosted-runners.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cleanup-self-hosted-runners.yml b/.github/workflows/cleanup-self-hosted-runners.yml index d3551175..4e16891c 100644 --- a/.github/workflows/cleanup-self-hosted-runners.yml +++ b/.github/workflows/cleanup-self-hosted-runners.yml @@ -52,16 +52,16 @@ jobs: vm_creation_time=$(TZ=UTC date -d "$vm_creation_time_string" +%s) if [ "$one_hour_ago" -lt "$vm_creation_time" ]; then - echo "The VM ${vm_name} was created less then 1 hour ago and shouldn't be deleted yet. Skipping." + echo "::notice::The VM ${vm_name} was created less than 1 hour ago and shouldn't be deleted yet. Skipping." elif test true = "$(if test ! -f .cli-authenticated; then ./gh-cli-auth-as-app.sh && >.cli-authenticated # only authenticate once fi && gh api repos/$GITHUB_REPOSITORY/actions/runners \ --jq '.runners[] | select(.name == "'$vm_name'") | .busy')"; then - echo "The VM ${vm_name} is still busy." + echo "::notice::The VM ${vm_name} is still busy." else - echo "The VM ${vm_name} was created more than 3 hours ago and wasn't deleted. Let's do that now." + echo "::warning::The VM ${vm_name} was created more than 1 hour ago and wasn't deleted. Let's do that now." 
az vm delete -n "$vm_name" -g ${{ secrets.AZURE_RESOURCE_GROUP }} --yes az network nsg delete -n "$vm_name"-nsg -g ${{ secrets.AZURE_RESOURCE_GROUP }} az network vnet delete -n "$vm_name"-vnet -g ${{ secrets.AZURE_RESOURCE_GROUP }}