@@ -25,7 +25,7 @@ export REPO_URL=${REPO_URL:-"https://github.com/NOAA-EMC/global-workflow.git"}
25
25
# ###############################################################
26
26
ROOT_DIR=" $( cd " $( dirname " ${BASH_SOURCE[0]} " ) /../.." > /dev/null 2>&1 && pwd ) "
27
27
scriptname=$( basename " ${BASH_SOURCE[0]} " )
28
- echo " Begin ${scriptname} at $( date -u ) " || true
28
+ echo " Begin ${scriptname} at $( date + ' %D %r ' ) " || true
29
29
export PS4=' + $(basename ${BASH_SOURCE})[${LINENO}]'
30
30
31
31
# ########################################################################
48
48
# setup runtime env for correct python install and git
49
49
# #####################################################
50
50
set +x
51
+ source " ${ROOT_DIR} /ci/scripts/utils/ci_utils.sh"
51
52
source " ${ROOT_DIR} /ush/module-setup.sh"
52
53
module use " ${ROOT_DIR} /modulefiles"
53
54
module load " module_gwsetup.${MACHINE_ID} "
@@ -68,24 +69,57 @@ pr_list=$(${GH} pr list --repo "${REPO_URL}" --label "CI-${MACHINE_ID^}-Ready" -
68
69
for pr in ${pr_list} ; do
69
70
pr_dir=" ${GFS_CI_ROOT} /PR/${pr} "
70
71
db_list=$( " ${ROOT_DIR} /ci/scripts/pr_list_database.py" --add_pr " ${pr} " --dbfile " ${pr_list_dbfile} " )
71
- pr_id=0
72
+ output_ci_single= " ${GFS_CI_ROOT} /PR/ ${pr} /output_single.log "
72
73
# ############################################################
73
74
# Check if a Ready labeled PR has changed back from once set
74
- # and in that case remove all previous jobs in scheduler and
75
- # and remove PR from filesystem to start clean
75
+ # and in that case completely kill the previous driver.sh cron
76
+ # job and all its descendants as well as removing all previous
77
+ # jobs in scheduler and associated files in the PR
76
78
# ############################################################
77
79
if [[ " ${db_list} " == * " already is in list" * ]]; then
78
- pr_id=$( " ${ROOT_DIR} /ci/scripts/pr_list_database.py" --dbfile " ${pr_list_dbfile} " --display " ${pr} " | awk ' {print $4}' ) || true
79
- pr_id=$(( pr_id+ 1 ))
80
- " ${ROOT_DIR} /ci/scripts/pr_list_database.py" --dbfile " ${pr_list_dbfile} " --update_pr " ${pr} " Open Ready " ${pr_id} "
81
- for cases in " ${pr_dir} /RUNTESTS/" * ; do
82
- if [[ -z " ${cases+x} " ]]; then
83
- break
80
+ # Get the PID and HOST of the driver.sh cron job
81
+ # that is stored in the CI database for this PR
82
+ driver_ID=$( " ${ROOT_DIR} /ci/scripts/pr_list_database.py" --dbfile " ${pr_list_dbfile} " --display " ${pr} " | awk ' {print $4}' ) || true
83
+ driver_PID=$( echo " ${driver_ID} " | cut -d" :" -f1) || true
84
+ driver_HOST=$( echo " ${driver_ID} " | cut -d" :" -f2) || true
85
+ host_name=$( hostname -s)
86
+ rm -f " ${output_ci_single} "
87
+ {
88
+ echo " CI Update on ${MACHINE_ID^} at $( date +' %D %r' ) " || true
89
+ echo " ================================================="
90
+ echo " PR:${pr} Reset to ${MACHINE_ID^} -Ready by user and is now restarting CI tests" || true
91
+ } >> " ${output_ci_single} "
92
+ if [[ " ${driver_PID} " -ne 0 ]]; then
93
+ echo " Driver PID: ${driver_PID} no longer running this build having it killed"
94
+ if [[ " ${driver_HOST} " == " ${host_name} " ]]; then
95
+ # shellcheck disable=SC2312
96
+ pstree -A -p " ${driver_PID} " | grep -Pow " (?<=\()[0-9]+(?=\))" | xargs kill
97
+ else
98
+ # shellcheck disable=SC2312
99
+ ssh " ${driver_HOST} " ' pstree -A -p "${driver_PID}" | grep -Eow "[0-9]+" | xargs kill'
84
100
fi
85
- pslot=$( basename " ${cases} " )
86
- sacct --format=jobid,jobname%35,WorkDir%100,stat | grep " ${pslot} " | grep " PR\/${pr} \/RUNTESTS" | awk ' {print $1}' | xargs scancel || true
87
- done
88
- rm -Rf " ${pr_dir} "
101
+ {
102
+ echo " Driver PID: Requested termination of ${driver_PID} and children on ${driver_HOST} "
103
+ echo " Driver PID: has restarted as $$ on ${host_name} "
104
+ } >> " ${output_ci_single} "
105
+ fi
106
+
107
+ experiments=$( find " ${pr_dir} /RUNTESTS/EXPDIR" -mindepth 1 -maxdepth 1 -type d) || true
108
+ if [[ -z " ${experiments} " ]]; then
109
+ echo " No current experiments to cancel in PR: ${pr} on ${MACHINE_ID^} " >> " ${output_ci_single} "
110
+ else
111
+ for case in ${experiments} ; do
112
+ case_name=$( basename " ${case} " )
113
+ cancel_slurm_jobs " ${case_name} "
114
+ {
115
+ echo " Canceled all jobs for experiment ${case_name} in PR:${pr} on ${MACHINE_ID^} "
116
+ } >> " ${output_ci_single} "
117
+ done
118
+ fi
119
+ sed -i " 1 i\`\`\` " " ${output_ci_single} "
120
+ " ${GH} " pr comment " ${pr} " --repo " ${REPO_URL} " --body-file " ${output_ci_single} "
121
+ " ${ROOT_DIR} /ci/scripts/pr_list_database.py" --remove_pr " ${pr} " --dbfile " ${pr_list_dbfile} "
122
+ " ${ROOT_DIR} /ci/scripts/pr_list_database.py" --add_pr " ${pr} " --dbfile " ${pr_list_dbfile} "
89
123
fi
90
124
done
91
125
@@ -110,34 +144,44 @@ for pr in ${pr_list}; do
110
144
if [[ -z " ${pr_building+x} " ]]; then
111
145
continue
112
146
fi
113
- " ${GH} " pr edit --repo " ${REPO_URL} " " ${pr} " --remove-label " CI-${MACHINE_ID^} -Ready" --add-label " CI-${MACHINE_ID^} -Building"
114
- " ${ROOT_DIR} /ci/scripts/pr_list_database.py" --dbfile " ${pr_list_dbfile} " --update_pr " ${pr} " Open Building
115
- echo " Processing Pull Request #${pr} "
147
+ id=$( " ${GH} " pr view " ${pr} " --repo " ${REPO_URL} " --json id --jq ' .id' )
116
148
pr_dir=" ${GFS_CI_ROOT} /PR/${pr} "
149
+ output_ci=" ${pr_dir} /output_ci_${id} "
150
+ output_ci_single=" ${GFS_CI_ROOT} /PR/${pr} /output_single.log"
151
+ driver_build_PID=$$
152
+ driver_build_HOST=$( hostname -s)
153
+ " ${GH} " pr edit --repo " ${REPO_URL} " " ${pr} " --remove-label " CI-${MACHINE_ID^} -Ready" --add-label " CI-${MACHINE_ID^} -Building"
154
+ " ${ROOT_DIR} /ci/scripts/pr_list_database.py" --dbfile " ${pr_list_dbfile} " --update_pr " ${pr} " Open Building " ${driver_build_PID} :${driver_build_HOST} "
117
155
rm -Rf " ${pr_dir} "
118
156
mkdir -p " ${pr_dir} "
119
- # call clone-build_ci to clone and build PR
120
- id=$( " ${GH} " pr view " ${pr} " --repo " ${REPO_URL} " --json id --jq ' .id' )
157
+ {
158
+ echo " CI Update on ${MACHINE_ID^} at $( date +' %D %r' ) " || true
159
+ echo " ============================================"
160
+ echo " Cloning and Building global-workflow PR: ${pr} "
161
+ echo " with PID: ${driver_build_PID} on host: ${driver_build_HOST} "
162
+ echo " "
163
+ } >> " ${output_ci_single} "
164
+ sed -i " 1 i\`\`\` " " ${output_ci_single} "
165
+ " ${GH} " pr comment " ${pr} " --repo " ${REPO_URL} " --body-file " ${output_ci_single} "
121
166
set +e
122
- output_ci=" ${pr_dir} /output_build_${id} "
123
- rm -f " ${output_ci} "
124
167
" ${ROOT_DIR} /ci/scripts/clone-build_ci.sh" -p " ${pr} " -d " ${pr_dir} " -o " ${output_ci} "
125
- # echo "SKIPPING: ${ROOT_DIR}/ci/scripts/clone-build_ci.sh"
126
168
ci_status=$?
127
169
# #################################################################
128
170
# Checking for special case when Ready label was updated
129
- # that cause a running driver exit fail because was currently
130
- # building so we force and exit 0 instead to does not get relabled
171
+ # but a race condition caused the clone-build_ci.sh to start
172
+ # and this instance fails before it was killed. In that case,
173
+ # we need to exit this instance of the driver script
131
174
# ################################################################
132
175
if [[ ${ci_status} -ne 0 ]]; then
133
- pr_id_check=$( " ${ROOT_DIR} /ci/scripts/pr_list_database.py" --display " {pr}" --dbfile " ${pr_list_dbfile} " | awk ' {print $4}' ) || true
134
- if [[ " ${pr_id} " -ne " ${pr_id_check} " ]]; then
176
+ build_PID_check=$( " ${ROOT_DIR} /ci/scripts/pr_list_database.py" --display " {pr}" --dbfile " ${pr_list_dbfile} " | awk ' {print $4}' | cut -d" :" -f1) || true
177
+ if [[ " ${build_PID_check} " -ne " $$ " ]]; then
178
+ echo " Driver build PID: ${build_PID_check} no longer running this build ... exiting"
135
179
exit 0
136
180
fi
137
181
fi
138
182
set -e
139
183
if [[ ${ci_status} -eq 0 ]]; then
140
- " ${ROOT_DIR} /ci/scripts/pr_list_database.py" --dbfile " ${pr_list_dbfile} " --update_pr " ${pr} " Open Built
184
+ " ${ROOT_DIR} /ci/scripts/pr_list_database.py" --dbfile " ${pr_list_dbfile} " --update_pr " ${pr} " Open Built " 0:0 "
141
185
# setup space to put an experiment
142
186
# export RUNTESTS for yaml case files to pickup
143
187
export RUNTESTS=" ${pr_dir} /RUNTESTS"
@@ -159,7 +203,7 @@ for pr in ${pr_list}; do
159
203
set +e
160
204
export LOGFILE_PATH=" ${HOMEgfs} /ci/scripts/create_experiment.log"
161
205
rm -f " ${LOGFILE_PATH} "
162
- " ${HOMEgfs} /workflow/create_experiment.py" --yaml " ${HOMEgfs} /ci/cases/pr/${case} .yaml" 2>&1 " ${LOGFILE_PATH} "
206
+ " ${HOMEgfs} /workflow/create_experiment.py" --yaml " ${HOMEgfs} /ci/cases/pr/${case} .yaml" > " ${LOGFILE_PATH} " 2>&1
163
207
ci_status=$?
164
208
set -e
165
209
if [[ ${ci_status} -eq 0 ]]; then
@@ -174,7 +218,7 @@ for pr in ${pr_list}; do
174
218
} >> " ${output_ci} "
175
219
else
176
220
{
177
- echo " *** Failed *** to create experiment: ${pslot} "
221
+ echo " *** Failed *** to create experiment: ${pslot} on ${MACHINE_ID^} "
178
222
echo " "
179
223
cat " ${LOGFILE_PATH} "
180
224
} >> " ${output_ci} "
@@ -186,7 +230,7 @@ for pr in ${pr_list}; do
186
230
done
187
231
188
232
" ${GH} " pr edit --repo " ${REPO_URL} " " ${pr} " --remove-label " CI-${MACHINE_ID^} -Building" --add-label " CI-${MACHINE_ID^} -Running"
189
- " ${ROOT_DIR} /ci/scripts/pr_list_database.py" --dbfile " ${pr_list_dbfile} " --update_pr " ${pr} " Open Running
233
+ " ${ROOT_DIR} /ci/scripts/pr_list_database.py" --dbfile " ${pr_list_dbfile} " --update_pr " ${pr} " Open Running " 0:0 "
190
234
" ${GH} " pr comment " ${pr} " --repo " ${REPO_URL} " --body-file " ${output_ci} "
191
235
192
236
else
0 commit comments