-
Notifications
You must be signed in to change notification settings - Fork 31
245 lines (213 loc) · 8.45 KB
/
stackhpc.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
name: Test deployment and reimage on OpenStack
on:
workflow_dispatch:
push:
branches:
- main
paths:
- '**'
- '!dev/**'
- 'dev/setup-env.sh'
- '!docs/**'
- '!README.md'
- '!.gitignore'
- '!.github/workflows/'
- '.github/workflows/stackhpc'
pull_request:
paths:
- '**'
- '!dev/**'
- 'dev/setup-env.sh'
- '!docs/**'
- '!README.md'
- '!.gitignore'
- '!.github/workflows/'
- '.github/workflows/stackhpc'
jobs:
openstack:
name: openstack-ci
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }} # to branch/PR + OS
cancel-in-progress: true
runs-on: ubuntu-22.04
strategy:
fail-fast: false # allow other matrix jobs to continue even if one fails
matrix:
os_version:
- RL8
- RL9
env:
ANSIBLE_FORCE_COLOR: True
OS_CLOUD: openstack
TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }}
CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings
TF_VAR_os_version: ${{ matrix.os_version }}
steps:
- name: Find the latest release
run: |
echo LATEST_RELEASE_TAG=$(curl -s https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest | jq -r .tag_name) >> "$GITHUB_ENV"
- name: Checkout latest release
uses: actions/checkout@v4
with:
ref: ${{ env.LATEST_RELEASE_TAG }}
- name: Find stackhpc tofu/terraform directory
# changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541
run: |
echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d \( -name terraform -o -name tofu )) >> "$GITHUB_ENV"
- name: Override CI_CLOUD if PR label is present
if: ${{ github.event_name == 'pull_request' }}
run: |
# Iterate over the labels
labels=$(echo '${{ toJSON(github.event.pull_request.labels) }}' | jq -r '.[].name')
echo $labels
for label in $labels; do
if [[ $label == CI_CLOUD=* ]]; then
# Extract the value after 'CI_CLOUD='
CI_CLOUD_OVERRIDE=${label#CI_CLOUD=}
echo "CI_CLOUD=${CI_CLOUD_OVERRIDE}" >> $GITHUB_ENV
fi
done
- name: Record debug info
run: |
echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG
echo STACKHPC_TF_DIR: $STACKHPC_TF_DIR
echo CI_CLOUD: $CI_CLOUD
- name: Setup ssh
run: |
set -x
mkdir ~/.ssh
echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa
chmod 0600 ~/.ssh/id_rsa
shell: bash
- name: Add bastion's ssh key to known_hosts
run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
shell: bash
- name: Install ansible etc
run: dev/setup-env.sh
- name: Install OpenTofu
uses: opentofu/setup-opentofu@v1
with:
tofu_version: 1.6.2
- name: Initialise tofu
run: tofu init
working-directory: ${{ env.STACKHPC_TF_DIR }}
- name: Write clouds.yaml
run: |
mkdir -p ~/.config/openstack/
echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml
shell: bash
- name: Setup environment-specific inventory/tofu inputs
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook ansible/adhoc/generate-passwords.yml
echo vault_demo_user_password: "$DEMO_USER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml
env:
DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
- name: Provision nodes using latest release image
id: provision_servers
run: |
. venv/bin/activate
. environments/.stackhpc/activate
cd $STACKHPC_TF_DIR
tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
- name: Delete infrastructure if provisioning failed
run: |
. venv/bin/activate
. environments/.stackhpc/activate
cd $STACKHPC_TF_DIR
tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
if: failure() && steps.provision_servers.outcome == 'failure'
- name: Configure cluster at latest release
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible all -m wait_for_connection
ansible-playbook -v ansible/site.yml
ansible-playbook -v ansible/ci/check_slurm.yml
- name: Run MPI-based tests at latest release
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook -vv ansible/adhoc/hpctests.yml
# - name: Run EESSI tests
# run: |
# . venv/bin/activate
# . environments/.stackhpc/activate
# ansible-playbook -vv ansible/ci/check_eessi.yml
- name: Confirm Open Ondemand is up (via SOCKS proxy)
run: |
. venv/bin/activate
. environments/.stackhpc/activate
# load ansible variables into shell:
ansible-playbook ansible/ci/output_vars.yml \
-e output_vars_hosts=openondemand \
-e output_vars_path=$APPLIANCES_ENVIRONMENT_ROOT/vars.txt \
-e output_vars_items=bastion_ip,bastion_user,openondemand_servername
source $APPLIANCES_ENVIRONMENT_ROOT/vars.txt
# setup ssh proxying:
sudo apt-get --yes install proxychains
echo proxychains installed
ssh -v -fN -D 9050 ${bastion_user}@${bastion_ip}
echo port 9050 forwarded
# check OOD server returns 200:
statuscode=$(proxychains wget \
--quiet \
--spider \
--server-response \
--no-check-certificate \
--http-user=demo_user \
--http-password=${DEMO_USER_PASSWORD} https://${openondemand_servername} \
2>&1)
(echo $statuscode | grep "200 OK") || (echo $statuscode && exit 1)
env:
DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
- name: Checkout current branch
uses: actions/checkout@v4
- name: Find stackhpc tofu/terraform directory
# changed in https://github.com/stackhpc/ansible-slurm-appliance/pull/541
run: |
echo STACKHPC_TF_DIR=$(find environments/.stackhpc/ -type d -name tofu -o -type d -name terraform) >> "$GITHUB_ENV"
# something about GH actions parsing eats \( \) type expressions
- name: Reimage login and control nodes to image in current branch
id: reimage_non_compute
run: |
. venv/bin/activate
. environments/.stackhpc/activate
cd $STACKHPC_TF_DIR
tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" -var-file=cluster_image.latest.tfvars.json
- name: Configure cluster using current branch
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible all -m wait_for_connection
ansible-playbook -v ansible/site.yml
ansible-playbook -v ansible/ci/check_slurm.yml
- name: Reimage compute nodes to image in current branch using slurm - tests compute-init
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook -v ansible/adhoc/reboot_via_slurm.yml
ansible-playbook -v ansible/ci/check_slurm.yml
- name: Check sacct state survived reimage to current branch
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml
- name: Check MPI-based tests are shown in Grafana
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook -vv ansible/ci/check_grafana.yml
- name: Run MPI-based tests again in current branch
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook -vv ansible/adhoc/hpctests.yml
- name: Delete infrastructure
run: |
. venv/bin/activate
. environments/.stackhpc/activate
cd $STACKHPC_TF_DIR
tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
if: ${{ success() || cancelled() }}