Skip to content

Commit a286a11

Browse files
Robust CI Restarts (NOAA-EMC#2093)
Improved CI robustness for reverting back to **CI-Ready** from any given state. New Features: - Improved `scancel` routine (refactored into bash "subroutine") - Improved messaging (see below) whenever a user changes state - Any and all previous build scripts and running experiments are killed as a result of reset to **Ready** Resolves NOAA-EMC#2060
1 parent 73621e9 commit a286a11

File tree

5 files changed

+120
-45
lines changed

5 files changed

+120
-45
lines changed

Diff for: ci/platforms/config.orion

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
export GFS_CI_ROOT=/work2/noaa/stmp/GFS_CI_ROOT
44
export ICSDIR_ROOT=/work/noaa/global/glopara/data/ICSDIR
5-
export STMP="/work/noaa/stmp/${USER}"
5+
export STMP="/work2/noaa/stmp/${USER}"
66
export SLURM_ACCOUNT=nems
77
export max_concurrent_cases=5
88
export max_concurrent_pr=4

Diff for: ci/scripts/check_ci.sh

+8-7
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ case ${MACHINE_ID} in
3333
esac
3434
set +x
3535
source "${ROOT_DIR}/ush/module-setup.sh"
36+
source "${ROOT_DIR}/ci/scripts/utils/ci_utils.sh"
3637
module use "${ROOT_DIR}/modulefiles"
3738
module load "module_gwsetup.${MACHINE_ID}"
3839
module list
@@ -86,7 +87,7 @@ for pr in ${pr_list}; do
8687
if [[ -z $(ls -A "${pr_dir}/RUNTESTS/EXPDIR") ]] ; then
8788
"${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Running" --add-label "CI-${MACHINE_ID^}-Passed"
8889
sed -i "1 i\`\`\`" "${output_ci}"
89-
sed -i "1 i\All CI Test Cases Passed:" "${output_ci}"
90+
sed -i "1 i\All CI Test Cases Passed on ${MACHINE_ID^}:" "${output_ci}"
9091
"${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci}"
9192
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}"
9293
# Check to see if this PR that was opened by the weekly tests and if so close it if it passed on all platforms
@@ -131,8 +132,8 @@ for pr in ${pr_list}; do
131132
"${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Running" --add-label "CI-${MACHINE_ID^}-Failed"
132133
error_logs=$("${rocotostat}" -d "${db}" -w "${xml}" | grep -E 'FAIL|DEAD' | awk '{print "-c", $1, "-t", $2}' | xargs "${rocotocheck}" -d "${db}" -w "${xml}" | grep join | awk '{print $2}') || true
133134
{
134-
echo "Experiment ${pslot} Terminated: *** FAILED ***"
135-
echo "Experiment ${pslot} Terminated with ${num_failed} tasks failed at $(date)" || true
135+
echo "Experiment ${pslot} *** FAILED *** on ${MACHINE_ID^}"
136+
echo "Experiment ${pslot} with ${num_failed} tasks failed at $(date +'%D %r')" || true
136137
echo "Error logs:"
137138
echo "${error_logs}"
138139
} >> "${output_ci}"
@@ -141,7 +142,7 @@ for pr in ${pr_list}; do
141142
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}"
142143
for kill_cases in "${pr_dir}/RUNTESTS/"*; do
143144
pslot=$(basename "${kill_cases}")
144-
sacct --format=jobid,jobname%35,WorkDir%100,stat | grep "${pslot}" | grep "PR\/${pr}\/RUNTESTS" | awk '{print $1}' | xargs scancel || true
145+
cancel_slurm_jobs "${pslot}"
145146
done
146147
break
147148
fi
@@ -151,9 +152,9 @@ for pr in ${pr_list}; do
151152
rm -Rf "${pr_dir}/RUNTESTS/COMROT/${pslot}"
152153
rm -f "${output_ci_single}"
153154
# echo "\`\`\`" > "${output_ci_single}"
154-
DATE=$(date)
155-
echo "Experiment ${pslot} **SUCCESS** ${DATE}" >> "${output_ci_single}"
156-
echo "Experiment ${pslot} **SUCCESS** at ${DATE}" >> "${output_ci}"
155+
DATE=$(date +'%D %r')
156+
echo "Experiment ${pslot} **SUCCESS** on ${MACHINE_ID^} at ${DATE}" >> "${output_ci_single}"
157+
echo "Experiment ${pslot} *** SUCCESS *** at ${DATE}" >> "${output_ci}"
157158
"${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}"
158159

159160
fi

Diff for: ci/scripts/clone-build_ci.sh

+13-7
Original file line numberDiff line numberDiff line change
@@ -72,16 +72,17 @@ cd sorc || exit 1
7272
set +e
7373
./checkout.sh -c -g -u >> log.checkout 2>&1
7474
checkout_status=$?
75+
DATE=$(date +'%D %r')
7576
if [[ ${checkout_status} != 0 ]]; then
7677
{
7778
echo "Checkout: *** FAILED ***"
78-
echo "Checkout: Failed at $(date)" || true
79+
echo "Checkout: Failed at ${DATE}"
7980
echo "Checkout: see output at ${PWD}/log.checkout"
8081
} >> "${outfile}"
8182
exit "${checkout_status}"
8283
else
8384
{
84-
echo "Checkout: Completed at $(date)" || true
85+
echo "Checkout: Completed at ${DATE}"
8586
} >> "${outfile}"
8687
fi
8788

@@ -92,25 +93,30 @@ rm -rf log.build
9293
./build_all.sh >> log.build 2>&1
9394
build_status=$?
9495

96+
DATE=$(date +'%D %r')
9597
if [[ ${build_status} != 0 ]]; then
9698
{
9799
echo "Build: *** FAILED ***"
98-
echo "Build: Failed at $(date)" || true
99-
echo "Build: see output at ${PWD}/log.build"
100+
echo "Build: Failed at ${DATE}"
101+
cat "${PWD}/log.build"
100102
} >> "${outfile}"
101103
exit "${build_status}"
102104
else
103105
{
104-
echo "Build: Completed at $(date)" || true
106+
echo "Build: Completed at ${DATE}"
105107
} >> "${outfile}"
106108
fi
107109

108-
./link_workflow.sh
110+
LINK_LOGFILE_PATH=link_workflow.log
111+
rm -f "${LINK_LOGFILE_PATH}"
112+
./link_workflow.sh >> "${LINK_LOGFILE_PATH}" 2>&1
109113
link_status=$?
110114
if [[ ${link_status} != 0 ]]; then
115+
DATE=$(date +'%D %r')
111116
{
112117
echo "Link: *** FAILED ***"
113-
echo "Link: Failed at $(date)" || true
118+
echo "Link: Failed at ${DATE}"
119+
cat "${LINK_LOGFILE_PATH}"
114120
} >> "${outfile}"
115121
exit "${link_status}"
116122
fi

Diff for: ci/scripts/driver.sh

+74-30
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ export REPO_URL=${REPO_URL:-"https://github.com/NOAA-EMC/global-workflow.git"}
2525
################################################################
2626
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." >/dev/null 2>&1 && pwd )"
2727
scriptname=$(basename "${BASH_SOURCE[0]}")
28-
echo "Begin ${scriptname} at $(date -u)" || true
28+
echo "Begin ${scriptname} at $(date +'%D %r')" || true
2929
export PS4='+ $(basename ${BASH_SOURCE})[${LINENO}]'
3030

3131
#########################################################################
@@ -48,6 +48,7 @@ esac
4848
# setup runtime env for correct python install and git
4949
######################################################
5050
set +x
51+
source "${ROOT_DIR}/ci/scripts/utils/ci_utils.sh"
5152
source "${ROOT_DIR}/ush/module-setup.sh"
5253
module use "${ROOT_DIR}/modulefiles"
5354
module load "module_gwsetup.${MACHINE_ID}"
@@ -68,24 +69,57 @@ pr_list=$(${GH} pr list --repo "${REPO_URL}" --label "CI-${MACHINE_ID^}-Ready" -
6869
for pr in ${pr_list}; do
6970
pr_dir="${GFS_CI_ROOT}/PR/${pr}"
7071
db_list=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --add_pr "${pr}" --dbfile "${pr_list_dbfile}")
71-
pr_id=0
72+
output_ci_single="${GFS_CI_ROOT}/PR/${pr}/output_single.log"
7273
#############################################################
7374
# Check if a Ready labeled PR has changed back from once set
74-
# and in that case remove all previous jobs in scheduler and
75-
# and remove PR from filesystem to start clean
75+
# and in that case completely kill the previous driver.sh cron
76+
# job and all its descendants as well as removing all previous
77+
# jobs in scheduler and associated files in the PR
7678
#############################################################
7779
if [[ "${db_list}" == *"already is in list"* ]]; then
78-
pr_id=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --display "${pr}" | awk '{print $4}') || true
79-
pr_id=$((pr_id+1))
80-
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Ready "${pr_id}"
81-
for cases in "${pr_dir}/RUNTESTS/"*; do
82-
if [[ -z "${cases+x}" ]]; then
83-
break
80+
# Get the PID and HOST of the driver.sh cron job
81+
# that is stored in the CI database for this PR
82+
driver_ID=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --display "${pr}" | awk '{print $4}') || true
83+
driver_PID=$(echo "${driver_ID}" | cut -d":" -f1) || true
84+
driver_HOST=$(echo "${driver_ID}" | cut -d":" -f2) || true
85+
host_name=$(hostname -s)
86+
rm -f "${output_ci_single}"
87+
{
88+
echo "CI Update on ${MACHINE_ID^} at $(date +'%D %r')" || true
89+
echo "================================================="
90+
echo "PR:${pr} Reset to ${MACHINE_ID^}-Ready by user and is now restarting CI tests" || true
91+
} >> "${output_ci_single}"
92+
if [[ "${driver_PID}" -ne 0 ]]; then
93+
echo "Driver PID: ${driver_PID} no longer running this build having it killed"
94+
if [[ "${driver_HOST}" == "${host_name}" ]]; then
95+
# shellcheck disable=SC2312
96+
pstree -A -p "${driver_PID}" | grep -Pow "(?<=\()[0-9]+(?=\))" | xargs kill
97+
else
98+
# shellcheck disable=SC2312
99+
ssh "${driver_HOST}" 'pstree -A -p "${driver_PID}" | grep -Eow "[0-9]+" | xargs kill'
84100
fi
85-
pslot=$(basename "${cases}")
86-
sacct --format=jobid,jobname%35,WorkDir%100,stat | grep "${pslot}" | grep "PR\/${pr}\/RUNTESTS" | awk '{print $1}' | xargs scancel || true
87-
done
88-
rm -Rf "${pr_dir}"
101+
{
102+
echo "Driver PID: Requested termination of ${driver_PID} and children on ${driver_HOST}"
103+
echo "Driver PID: has restarted as $$ on ${host_name}"
104+
} >> "${output_ci_single}"
105+
fi
106+
107+
experiments=$(find "${pr_dir}/RUNTESTS/EXPDIR" -mindepth 1 -maxdepth 1 -type d) || true
108+
if [[ -z "${experiments}" ]]; then
109+
echo "No current experiments to cancel in PR: ${pr} on ${MACHINE_ID^}" >> "${output_ci_single}"
110+
else
111+
for case in ${experiments}; do
112+
case_name=$(basename "${case}")
113+
cancel_slurm_jobs "${case_name}"
114+
{
115+
echo "Canceled all jobs for experiment ${case_name} in PR:${pr} on ${MACHINE_ID^}"
116+
} >> "${output_ci_single}"
117+
done
118+
fi
119+
sed -i "1 i\`\`\`" "${output_ci_single}"
120+
"${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}"
121+
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}"
122+
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --add_pr "${pr}" --dbfile "${pr_list_dbfile}"
89123
fi
90124
done
91125

@@ -110,34 +144,44 @@ for pr in ${pr_list}; do
110144
if [[ -z "${pr_building+x}" ]]; then
111145
continue
112146
fi
113-
"${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Ready" --add-label "CI-${MACHINE_ID^}-Building"
114-
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Building
115-
echo "Processing Pull Request #${pr}"
147+
id=$("${GH}" pr view "${pr}" --repo "${REPO_URL}" --json id --jq '.id')
116148
pr_dir="${GFS_CI_ROOT}/PR/${pr}"
149+
output_ci="${pr_dir}/output_ci_${id}"
150+
output_ci_single="${GFS_CI_ROOT}/PR/${pr}/output_single.log"
151+
driver_build_PID=$$
152+
driver_build_HOST=$(hostname -s)
153+
"${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Ready" --add-label "CI-${MACHINE_ID^}-Building"
154+
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Building "${driver_build_PID}:${driver_build_HOST}"
117155
rm -Rf "${pr_dir}"
118156
mkdir -p "${pr_dir}"
119-
# call clone-build_ci to clone and build PR
120-
id=$("${GH}" pr view "${pr}" --repo "${REPO_URL}" --json id --jq '.id')
157+
{
158+
echo "CI Update on ${MACHINE_ID^} at $(date +'%D %r')" || true
159+
echo "============================================"
160+
echo "Cloning and Building global-workflow PR: ${pr}"
161+
echo "with PID: ${driver_build_PID} on host: ${driver_build_HOST}"
162+
echo ""
163+
} >> "${output_ci_single}"
164+
sed -i "1 i\`\`\`" "${output_ci_single}"
165+
"${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}"
121166
set +e
122-
output_ci="${pr_dir}/output_build_${id}"
123-
rm -f "${output_ci}"
124167
"${ROOT_DIR}/ci/scripts/clone-build_ci.sh" -p "${pr}" -d "${pr_dir}" -o "${output_ci}"
125-
#echo "SKIPPING: ${ROOT_DIR}/ci/scripts/clone-build_ci.sh"
126168
ci_status=$?
127169
##################################################################
128170
# Checking for special case when Ready label was updated
129-
# that cause a running driver exit fail because was currently
130-
# building so we force and exit 0 instead to does not get relabled
171+
# but a race condition caused the clone-build_ci.sh to start
172+
# and this instance fails before it was killed. In that case
173+
# we need to exit this instance of the driver script
131174
#################################################################
132175
if [[ ${ci_status} -ne 0 ]]; then
133-
pr_id_check=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --display "{pr}" --dbfile "${pr_list_dbfile}" | awk '{print $4}') || true
134-
if [[ "${pr_id}" -ne "${pr_id_check}" ]]; then
176+
build_PID_check=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --display "{pr}" --dbfile "${pr_list_dbfile}" | awk '{print $4}' | cut -d":" -f1) || true
177+
if [[ "${build_PID_check}" -ne "$$" ]]; then
178+
echo "Driver build PID: ${build_PID_check} no longer running this build ... exiting"
135179
exit 0
136180
fi
137181
fi
138182
set -e
139183
if [[ ${ci_status} -eq 0 ]]; then
140-
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Built
184+
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Built "0:0"
141185
#setup space to put an experiment
142186
# export RUNTESTS for yaml case files to pickup
143187
export RUNTESTS="${pr_dir}/RUNTESTS"
@@ -159,7 +203,7 @@ for pr in ${pr_list}; do
159203
set +e
160204
export LOGFILE_PATH="${HOMEgfs}/ci/scripts/create_experiment.log"
161205
rm -f "${LOGFILE_PATH}"
162-
"${HOMEgfs}/workflow/create_experiment.py" --yaml "${HOMEgfs}/ci/cases/pr/${case}.yaml" 2>&1 "${LOGFILE_PATH}"
206+
"${HOMEgfs}/workflow/create_experiment.py" --yaml "${HOMEgfs}/ci/cases/pr/${case}.yaml" > "${LOGFILE_PATH}" 2>&1
163207
ci_status=$?
164208
set -e
165209
if [[ ${ci_status} -eq 0 ]]; then
@@ -174,7 +218,7 @@ for pr in ${pr_list}; do
174218
} >> "${output_ci}"
175219
else
176220
{
177-
echo "*** Failed *** to create experiment: ${pslot}"
221+
echo "*** Failed *** to create experiment: ${pslot} on ${MACHINE_ID^}"
178222
echo ""
179223
cat "${LOGFILE_PATH}"
180224
} >> "${output_ci}"
@@ -186,7 +230,7 @@ for pr in ${pr_list}; do
186230
done
187231

188232
"${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Building" --add-label "CI-${MACHINE_ID^}-Running"
189-
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Running
233+
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Running "0:0"
190234
"${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci}"
191235

192236
else

Diff for: ci/scripts/utils/ci_utils.sh

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#!/bin/env bash
2+
3+
function cancel_slurm_jobs() {

  # Usage: cancel_slurm_jobs <substring>
  # Example: cancel_slurm_jobs "C48_ATM_3c4e7f74"
  #
  # Cancel all Slurm jobs currently listed by squeue for ${USER} whose
  # job name contains the given substring. So in the example, all jobs
  # with "C48_ATM_3c4e7f74" in their name will be canceled.
  #
  # Arguments: $1 - literal text to look for in the job name
  #                 (compared verbatim, NOT as a regular expression)
  # Outputs:   one "Canceling Slurm Job ..." line on stdout per job
  #            canceled; warnings on stderr
  # Returns:   1 (canceling nothing) when <substring> is empty,
  #            0 otherwise

  local substring=${1:-}
  local job_ids job_id job_name

  # An empty substring would match every job name and wipe out all of
  # the user's queued and running jobs, so refuse to continue.
  if [[ -z "${substring}" ]]; then
    echo "cancel_slurm_jobs: empty substring given, no jobs canceled" >&2
    return 1
  fi

  job_ids=$(squeue -u "${USER}" -h -o "%i")

  # job_ids is a whitespace separated list of ids, so the unquoted
  # expansion (word splitting) is intentional here.
  for job_id in ${job_ids}; do
    # sacct prints a header line and a separator line first; the third
    # line holds the job name, padded with blanks that sed strips.
    job_name=$(sacct -j "${job_id}" --format=JobName%100 | head -3 | tail -1 | sed -r 's/\s+//g') || true
    # Quoting the substring inside the pattern forces a literal match,
    # so characters such as '.' or '*' in a name are not wildcards.
    if [[ "${job_name}" == *"${substring}"* ]]; then
      echo "Canceling Slurm Job ${job_name} with: scancel ${job_id}"
      # Best effort: the job may have ended since squeue listed it, and
      # that must not abort a caller running under "set -e".
      scancel "${job_id}" || echo "WARNING: scancel ${job_id} failed" >&2
    fi
  done
  return 0
}

0 commit comments

Comments
 (0)