-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtiming-driver.sh
executable file
·170 lines (147 loc) · 6.01 KB
/
timing-driver.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#!/bin/bash -l
# Timing driver: runs install-mirgecom.sh, executes each driver listed in
# driver_info.txt, and appends the measured timings under timing_data/.
#
# Shell options:
#   errexit  - abort on the first unhandled command failure
#   xtrace   - echo every command to the log as it runs
#   pipefail - a pipeline fails if any of its stages fails
set -o errexit
set -o xtrace
set -o pipefail
# Timestamp the start of the run in the log.
date
# For security, use a dedicated key when accessing git over ssh.
# If the TESTING_SSH_KEY environment variable is set, this snippet
# starts an ssh-agent, adds that pre-arranged key to it, and kills
# the agent again when the script exits.
# ===== To create a key:
# - Run ssh-keygen:
# $ ssh-keygen
# [enter a <keyname> when prompted]
# - Put the key(s) in a /secure/filesystem/location:
# $ mv <keyname>* /secure/filesystem/location
# - Add the key to GitHub:
# $ [browse to] https://github.com/illinois-ceesd/timing/settings/keys/new
# $ Choose (New SSH key)
# $ Paste in the contents of /secure/filesystem/location/<keyname>.pub
# - Set the ENV variable before using this script:
# $ export TESTING_SSH_KEY=/secure/filesystem/location/<keyname>
# Optional ssh-agent setup; only active when TESTING_SSH_KEY is non-empty.
if [[ -n "${TESTING_SSH_KEY}" ]]; then
  # Evaluate the agent's output as one quoted string so it is not word-split
  # or glob-expanded before eval sees it. SSH_AGENT_PID is used right below.
  eval "$(ssh-agent)"
  # The PID is deliberately expanded now, so the EXIT trap kills exactly the
  # agent started above.
  # shellcheck disable=SC2064
  trap "kill ${SSH_AGENT_PID}" EXIT
  # Quoted so a key path containing whitespace is passed as a single argument.
  ssh-add "${TESTING_SSH_KEY}"
fi
# -- Install mirgecom and activate its environment
# The directory the script is launched from is the root of the timing checkout.
# Assignment and export are split so a failing command is not masked by export.
TIMING_HOME=$(pwd)
export TIMING_HOME
export TIMING_BRANCH="y1-production"
export TIMING_ENV_NAME="nozzle.timing.env"
export MIRGE_BRANCH="y1-production"

# The lines below rely on this leaving an installation under
# ${TIMING_HOME}/emirge that contains config/activate_env.sh and a mirgecom
# git checkout.
./install-mirgecom.sh "${MIRGE_BRANCH}" "${TIMING_ENV_NAME}"

export EMIRGE_HOME="${TIMING_HOME}/emirge"
# Paths are quoted so a launch directory containing whitespace still works.
cd "${EMIRGE_HOME}"
source config/activate_env.sh
cd mirgecom

# Commit hashes of the selected mirgecom branch and of main, as known to the
# origin remote of the checkout.
BRANCH_HASH=$(git rev-parse "origin/${MIRGE_BRANCH}")
MAIN_HASH=$(git rev-parse origin/main)
# -- System information
# Timestamps first (human-readable and seconds since the epoch), then the
# host name, machine architecture and operating system name.
TIME_SINCE_EPOCH="$(date '+%s')"
TIMING_DATE="$(date '+%Y-%m-%d %H:%M')"
TIMING_HOST="$(hostname)"
TIMING_ARCH="$(uname -m)"
TIMING_PLATFORM="$(uname -s)"
# export TIMING_REPO="illinois-ceesd/timing.git"
# -- Time every driver listed in ${TIMING_HOME}/driver_info.txt
# The file holds whitespace-separated records, each made of five
# colon-separated fields:
#   <name>:<GitHub owner/repo>:<branch>:<driver script basename>:<timing data file>
# Reading the records into an array keeps the whitespace splitting of the
# entries but prevents glob expansion of their contents.
# read returns non-zero at end of input when no NUL delimiter is found, hence
# the "|| true"; the array is still filled.
driver_entries=()
IFS=$' \t\n' read -r -d '' -a driver_entries \
  < "${TIMING_HOME}/driver_info.txt" || true

# Each iteration ends inside <driver>/timing_run, so remember where the clones
# live and return here before handling the next entry. Without this, later
# drivers would be cloned inside the previous driver's timing_run directory.
DRIVER_ROOT=$(pwd)

for driver_info in "${driver_entries[@]}"; do
  cd "${DRIVER_ROOT}"

  # Split the record into its fields; "_" absorbs any unexpected extras.
  IFS=: read -r DRIVER_NAME DRIVER_REPO DRIVER_BRANCH DRIVER_KEY TIMING_FILE _ \
    <<< "${driver_info}"

  # -- Produce the driver to use for timing
  # --- Grab a fresh clone of the driver repo
  # ":?" aborts instead of running rm on an empty path.
  rm -Rf -- "${DRIVER_NAME:?empty driver name in driver_info.txt}"
  git clone -b "${DRIVER_BRANCH}" "https://github.com/${DRIVER_REPO}" "${DRIVER_NAME}"
  cd "${DRIVER_NAME}/timing_run"
  DRIVER_HASH=$(git rev-parse "origin/${DRIVER_BRANCH}")

  # Checksum for the driver_md5sum field, which was previously always empty.
  # NOTE(review): assumes the checksum is meant to cover the driver script that
  # is executed (./${DRIVER_KEY}.py) - confirm. Left empty when the script or
  # the md5sum tool is not available, so a missing file is still reported by
  # the run itself rather than aborting here.
  DRIVER_MD5SUM=""
  if [[ -f "./${DRIVER_KEY}.py" ]] && command -v md5sum > /dev/null 2>&1; then
    DRIVER_MD5SUM=$(md5sum "./${DRIVER_KEY}.py" | cut -d " " -f 1)
  fi

  # --- Input parameters for the timing run
  cat <<EOF > timing_params.yaml
nviz: 100
nrestart: 100
current_dt: 5e-8
t_final: 1.e-6
order: 1
alpha_sc: 0.5
s0_sc: -5.0
kappa_sc: 0.5
EOF

  # -- Run the case (platform-dependent)
  printf 'Running on Host: %s\n' "${TIMING_HOST}"
  date
  GPU_ARCH="Unknown"
  # The display name is kept separate from TIMING_HOST: overwriting
  # TIMING_HOST with "Lassen" would stop the "lassen*" pattern from matching
  # on the next iteration.
  RESOLVED_HOST="${TIMING_HOST}"
  case "${TIMING_HOST}" in
    # --- Run the timing test in a batch job on Lassen@LLC
    lassen*)
      echo "Resolved Host: Lassen"
      RESOLVED_HOST="Lassen"
      GPU_ARCH="GV100GL"
      rm -f timing_job.sh timing-run-done

      # ---- Generate a batch script for running the timing job
      # The heredoc is unquoted, so EMIRGE_HOME, USER and DRIVER_KEY are
      # filled in now. XDG_CACHE_HOME is escaped so that the job removes the
      # scratch cache it has just exported, not whatever the variable holds
      # (possibly nothing, possibly the real cache) in the submitting shell.
      cat <<EOF > timing_job.sh
#!/bin/bash
#BSUB -nnodes 1
#BSUB -G uiuc
#BSUB -W 30
#BSUB -q pdebug
printf "Running with EMIRGE_HOME=${EMIRGE_HOME}\n"
source "${EMIRGE_HOME}/config/activate_env.sh"
export PYOPENCL_CTX="port:tesla"
export XDG_CACHE_HOME="/tmp/$USER/xdg-scratch"
rm -rf "\${XDG_CACHE_HOME}"
rm -f timing-run-done
jsrun -g 1 -a 1 -n 1 python -O -u -m mpi4py ./${DRIVER_KEY}.py -i timing_params.yaml
touch timing-run-done
EOF
      chmod +x timing_job.sh

      # ---- Submit the batch script and wait for the job to finish
      bsub timing_job.sh
      # ---- Wait 5 minutes right off the bat (the job is at least 90 sec)
      sleep 300
      # Then poll every 10 s for the marker file the job touches at its end;
      # 360 polls = 1 hour.
      iwait=0
      while [[ ! -f ./timing-run-done ]]; do
        iwait=$((iwait + 1))
        if ((iwait > 360)); then
          printf 'Timed out waiting on batch job.\n'
          exit 1 # skip the rest of the script
        fi
        sleep 10
      done
      ;;

    # --- Run the timing test on an unknown/generic machine
    *)
      printf 'Host: Unknown\n'
      # NOTE(review): the Lassen job selects its device with PYOPENCL_CTX;
      # confirm PYOPENCL_TEST is the variable the driver honors here.
      PYOPENCL_TEST=port:pthread python -m mpi4py "./${DRIVER_KEY}.py" -i timing_params.yaml
      ;;
  esac
  date

  # -- Process the results of the timing run
  RUN_LOG_FILE="${DRIVER_KEY}-rank0.sqlite"
  if [[ -f "${RUN_LOG_FILE}" ]]; then
    timing_yaml_file="${DRIVER_KEY}_timings.yaml"
    rm -f -- "${timing_yaml_file}" log.sqlite

    # --- Pull the timings out of the sqlite files generated by logging
    # "grep -v INFO" drops lines containing INFO from the runalyzer output.
    runalyzer-gather log.sqlite "${RUN_LOG_FILE}"
    CL_DEVICE=$(sqlite3 log.sqlite 'SELECT cl_device_name FROM runs')
    STARTUP_TIME=$(runalyzer -m log.sqlite -c 'print(q("select $t_init.max").fetchall()[0][0])' | grep -v INFO)
    FIRST_STEP=$(runalyzer -m log.sqlite -c 'print(sum(p[0] for p in q("select $t_step.max").fetchall()[0:1]))' | grep -v INFO)
    FIRST_10_STEPS=$(runalyzer -m log.sqlite -c 'print(sum(p[0] for p in q("select $t_step.max").fetchall()[0:10]))' | grep -v INFO)
    # NOTE(review): the slice [11:21] skips row 10, so this may cover fewer
    # than ten steps; [10:20] looks like the intended range. Left unchanged to
    # keep new records comparable with already-recorded data - confirm.
    SECOND_10_STEPS=$(runalyzer -m log.sqlite -c 'print(sum(p[0] for p in q("select $t_step.max").fetchall()[11:21]))' | grep -v INFO)

    # --- Create a YAML-compatible text snippet with the timing info
    # Values are passed as printf arguments, never as part of the format, so
    # a "%" or backslash in a value cannot corrupt the record.
    # NOTE(review): mirge_version/y1_version previously referenced undefined
    # variables and were always empty. They now use the hashes of origin/main
    # and origin/${MIRGE_BRANCH} computed earlier in the script - confirm
    # this mapping.
    {
      printf 'run_date: %s\nrun_host: %s\n' "${TIMING_DATE}" "${RESOLVED_HOST}"
      printf 'cl_device: %s\n' "${CL_DEVICE}"
      printf 'run_epoch: %s\nrun_platform: %s\n' "${TIME_SINCE_EPOCH}" "${TIMING_PLATFORM}"
      printf 'run_arch: %s\ngpu_arch: %s\n' "${TIMING_ARCH}" "${GPU_ARCH}"
      printf 'mirge_version: %s\ny1_version: %s\n' "${MAIN_HASH}" "${BRANCH_HASH}"
      printf 'driver_version: %s\ndriver_md5sum: %s\n' "${DRIVER_HASH}" "${DRIVER_MD5SUM}"
      printf 'time_startup: %s\ntime_first_step: %s\n' "${STARTUP_TIME}" "${FIRST_STEP}"
      printf 'time_first_10: %s\ntime_second_10: %s\n---\n' "${FIRST_10_STEPS}" "${SECOND_10_STEPS}"
    } > "${timing_yaml_file}"

    # ---- Update the timing file with the current test data
    cat "${timing_yaml_file}" >> "${TIMING_HOME}/timing_data/${TIMING_FILE}"
  else
    printf 'Timing run did not produce the expected sqlite file: %s\n' "${RUN_LOG_FILE}"
  fi
done
# Timestamp the end of the run in the log.
date