#!/bin/bash
# This is a binding script that restricts each MPI rank to an appropriate set of CPU cores, GPUs, and NICs, in order to avoid contention.
# Usage: mpirun binder.sh <app>
# The exact binding per-rank will be printed at runtime.
# By setting env variables, this script can also bind more than 1 MPI rank per GPU, or restrict the number of cores and GPUs for certain experiments. The most useful ones:
# MPI_PER_GPU=2 mpirun -n 8 binder.sh <app> # bind 2 ranks per GPU
# LS_PCI=2 mpirun -n 4 binder.sh <app> # skip every other GPU (e.g. use GPUs 0,2,4,6 on an 8-GPU node). Can be useful to understand the impact of a different PCI-e topology.
# OMP_NUM_THREADS=2 mpirun -n 1 binder.sh <app> # will restrict to 2 cores per rank, using taskset
# The script will auto-enable MPS if multiple ranks are bound to the same GPU.
# The script works by calculating the number of MPI ranks per node and the desired number of ranks per GPU (env variable MPI_PER_GPU),
# then splitting resources equally and assigning them to MPI ranks in round robin.
# The script assumes a straightforward CPU-GPU topology (GPU 0 attached to NUMA node 0, etc.), and might fail on others, such as AMD EPYC CPUs (although some special-case code is included for them).
# Cores are bound using numactl (optionally taskset to prevent memory binding), GPUs using CUDA_VISIBLE_DEVICES, and NICs using UCX_NET_DEVICES.
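# As an illustration (hypothetical values), the per-rank report printed at startup looks like:
#   MPI Rank 0 host node001 GPU=0 cores=0,1,2,3 membind=0 UCX_NET_DEVICES=mlx5_0:1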
# LICENSE : MIT - Copyright Louis Stuber/NVIDIA
set -o pipefail
if [[ -z $1 ]]; then
echo "Usage: mpirun $0 <app>"
echo " will bind your <app> properly and report the binding for each rank (GPUs, CPUs, NICs)."
echo
echo "This script reads the following environment variables:"
echo " MPI_PER_GPU # default: 1. Desired number of ranks per GPU"
exit 0
fi
# Get MPI rank/size from OpenMPI, SLURM, or PMI (e.g. Intel MPI) env variables.
if [[ -z $OMPI_COMM_WORLD_LOCAL_RANK ]]; then
export OMPI_COMM_WORLD_RANK=$SLURM_PROCID
export OMPI_COMM_WORLD_LOCAL_RANK=$SLURM_LOCALID
export OMPI_COMM_WORLD_SIZE=$SLURM_NTASKS
export OMPI_COMM_WORLD_LOCAL_SIZE=$((OMPI_COMM_WORLD_SIZE/SLURM_NNODES))
fi
if [[ -z $OMPI_COMM_WORLD_LOCAL_RANK ]]; then
export OMPI_COMM_WORLD_RANK=$PMI_RANK
export OMPI_COMM_WORLD_LOCAL_RANK=$MPI_LOCALRANKID
export OMPI_COMM_WORLD_SIZE=$PMI_SIZE
export OMPI_COMM_WORLD_LOCAL_SIZE=$MPI_LOCALNRANKS
fi
if [[ -z $OMPI_COMM_WORLD_LOCAL_RANK ]]; then
echo "$0 Error: OMPI_COMM_WORLD_LOCAL_RANK not defined. This script only supports SLURM and/or OpenMPI."
exit 101
fi
if [[ -z $OMPI_COMM_WORLD_LOCAL_SIZE ]]; then
if [[ -n $LSUB_NTASKS_PER_NODE ]]; then
OMPI_COMM_WORLD_LOCAL_SIZE=$LSUB_NTASKS_PER_NODE
else
echo "$0 error: OMPI_COMM_WORLD_LOCAL_SIZE not defined."
exit 102
fi
fi
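# To debug the binding logic without a launcher, the same variables can be set by hand, e.g. (hypothetical single-rank run):
#   OMPI_COMM_WORLD_RANK=0 OMPI_COMM_WORLD_LOCAL_RANK=0 OMPI_COMM_WORLD_SIZE=1 OMPI_COMM_WORLD_LOCAL_SIZE=1 ./binder.sh nvidia-smi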
has_set_ngpus=true
# Detect hardware
# Number of GPUs on the node
if [[ -z $LS_NGPUS ]]; then
LS_NGPUS=$(nvidia-smi --query-gpu=count --format=csv -i 0 | head -2 | tail -1) || { echo "$0 Error: could not obtain number of GPUs"; exit 103; }
has_set_ngpus=false
fi
if [[ $LS_NGPUS -le 0 ]]; then
LS_NGPUS=$OMPI_COMM_WORLD_LOCAL_SIZE
fi
# Number of cores and sockets (NUMA nodes are used as "sockets" as they are more reliable)
lscpu="$(lscpu)" #cache
nphys_sockets=$(echo "$lscpu" | grep Sock | awk '{print $2}')
ncores=$(echo "$lscpu" | grep "Core(s)" | awk '{print $4}')
nnumas=$(echo "$lscpu" | grep "NUMA node" | head -1 | awk '{print $3}')
if [[ -n $(echo "$lscpu" | grep "NUMA node$nphys_sockets" | awk '{print $4}') ]]; then
#System with multiple NUMA nodes per socket (e.g. AMD EPYC)
nsockets=$nnumas
else
# 1 socket = 1 NUMA
nnumas=$nphys_sockets
nsockets=$nphys_sockets
fi
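# Illustration (assumed hardware): a dual-socket EPYC in NPS4 mode shows 2 sockets but 8 NUMA nodes in lscpu,
# so the branch above sets nsockets=8 and each NUMA node is treated as a "socket" from here on.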
ncores_avail_per_socket=$((ncores*nphys_sockets/nnumas))
if [[ $LS_HYPERTHREAD == true ]] || [[ $LS_HYPERTHREAD == 1 ]]; then
hyperthread_cores=2
nthreads_avail_per_socket=$((ncores_avail_per_socket*hyperthread_cores))
else
hyperthread_cores=1
nthreads_avail_per_socket=$ncores_avail_per_socket
fi
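# Worked example (assumed node): 2 sockets x 64 cores exposed as 8 NUMA nodes gives
# ncores_avail_per_socket = 64*2/8 = 16, i.e. 32 usable threads per NUMA node with LS_HYPERTHREAD=true.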
if [[ -n $NCPU_SOCKETS ]]; then
nsockets=$NCPU_SOCKETS
elif [[ -n $LS_FORCE_NSOCKETS ]]; then
nsockets=$LS_FORCE_NSOCKETS
fi
# List the CPUs, NICs on the system, sorted by affinity with GPUs.
isocket=
if ! [[ "$DISABLE_AMD_OPTI" == true ]] && echo "$lscpu" | grep -q "AMD EPYC 7402" && [[ $nsockets == 8 ]] && [[ $LS_NGPUS == 4 ]]; then
#Special optimization for JUWELS
GPUS=(0 1 2 3)
CPUS=(3 2 1 0 7 6 5 4)
NICS=(mlx5_0 mlx5_1 mlx5_2 mlx5_3)
if [[ $DISABLE_HALF_NUMAS == true ]]; then
CPUS=(3 1 7 5)
fi
if [[ -n $NICS_PER_NODE ]]; then
for igpu in $(seq 0 $((LS_NGPUS-1))); do
NICS[$igpu]=${NICS[$((igpu/(8/NICS_PER_NODE)*(8/NICS_PER_NODE)))]}
done
fi
elif ! [[ "$DISABLE_AMD_OPTI" == true ]] && echo "$lscpu" | grep -q "AMD EPYC 7742" && [[ $nsockets == 8 ]] ; then
#Special optimization for DGX A100
GPUS=(0 1 2 3 4 5 6 7)
CPUS=(3 2 1 0 7 6 5 4)
NICS=(mlx5_1 mlx5_0 mlx5_3 mlx5_2 mlx5_7 mlx5_6 mlx5_9 mlx5_8)
if [[ $DISABLE_HALF_NUMAS == true ]]; then
CPUS=(3 1 7 5)
fi
if [[ -n $NICS_PER_NODE ]]; then
for igpu in $(seq 0 $((LS_NGPUS-1))); do
NICS[$igpu]=${NICS[$((igpu/(8/NICS_PER_NODE)*(8/NICS_PER_NODE)))]}
done
fi
else
#generic system, sequential order assumed
# Use plain indexed arrays (keys are sequential integers); declare -A on an already-set variable would error out.
GPUS=()
CPUS=()
NICS=()
if ibstat_out="$(ibstat -l 2>/dev/null | sort -V)" ; then
mapfile -t ibdevs <<< "${ibstat_out}"
else
if [[ -z $NB_NICS ]]; then
NB_NICS=$(ls -l /dev/infiniband/uverbs* 2>/dev/null | wc -l)
fi
ibdevs=()
for inic in $(seq 0 $((NB_NICS-1))); do
ibdevs[$inic]=mlx5_$inic
done
fi
num_ibdevs="${#ibdevs[@]}"
for igpu in $(seq 0 $((LS_NGPUS-1))); do
GPUS[$igpu]=$igpu
NICS[$igpu]=${ibdevs[$((igpu*num_ibdevs/LS_NGPUS))]}
if [[ -n $NB_NICS_PER_NODE ]]; then
NICS[$igpu]=${ibdevs[$((igpu*num_ibdevs/LS_NGPUS/(num_ibdevs/NB_NICS_PER_NODE)*(num_ibdevs/NB_NICS_PER_NODE)))]}
fi
done
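# Illustration of the round-robin mapping above, assuming 8 GPUs and 4 NICs named mlx5_0..mlx5_3:
# igpu*num_ibdevs/LS_NGPUS assigns GPUs 0-1 to mlx5_0, 2-3 to mlx5_1, 4-5 to mlx5_2, 6-7 to mlx5_3.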
if ! [[ "$DISABLE_AMD_OPTI" == true ]] && echo "$lscpu" | grep -q "AMD EPYC 77" && [[ $nsockets -gt 1 ]]; then
unset CPUS
if [[ $nsockets -ge 8 ]]; then
CPUS=(3 2 1 0 7 6 5 4)
elif [[ $nsockets -ge 4 ]]; then
CPUS=(3 2 1 0)
elif [[ $nsockets -ge 2 ]]; then
CPUS=(1 0)
fi
else
if [[ $DISABLE_HALF_NUMAS != true ]]; then
for icpu in $(seq 0 $((nsockets-1))); do
CPUS[$icpu]=$((icpu))
done
else
for icpu in $(seq 0 2 $((nsockets-1))); do
CPUS[$((icpu/2))]=$((icpu))
done
fi
fi
fi
# BINDING
nGPUs=${#GPUS[@]}
nCPUs=${#CPUS[@]}
nNICs=${#NICS[@]}
if [[ $LS_DEBUG == 1 ]] && [[ $OMPI_COMM_WORLD_RANK == 0 ]]; then
if [[ "$nsockets" == "$nCPUs" ]]; then
nsockets_string="$nsockets"
else
nsockets_string="$nsockets ($nCPUs used)"
fi
echo "Debug Hardware found: nsockets=$nsockets_string GPUs: ${GPUS[*]} CPUS: ${CPUS[*]}"
fi
# Handle certain special cases
# Detect if multiple MPI ranks are bound to the same GPU and check that it is consistent with MPI_SIZE.
if [[ -z $MPI_PER_GPU ]]; then
if [[ $nGPUs -lt $OMPI_COMM_WORLD_LOCAL_SIZE ]]; then
if [[ $OMPI_COMM_WORLD_RANK == 0 ]] || [[ $LS_DEBUG == 1 ]]; then
echo "Error: there are $OMPI_COMM_WORLD_LOCAL_SIZE MPI ranks per node, but you seem to have only $nGPUs GPUs."
echo "To prevent mistakes, if you intended to use more than 1 MPI rank per GPU, this script requires you to set the env variable MPI_PER_GPU."
echo "Try to rerun with: export MPI_PER_GPU=$(((OMPI_COMM_WORLD_LOCAL_SIZE+nGPUs-1)/nGPUs))"
else
sleep 2
fi
[[ $OMPI_COMM_WORLD_RANK == 1 ]] && echo "$0 error"
exit 104
fi
MPI_PER_GPU=1
else
if [[ $((nGPUs*MPI_PER_GPU)) -lt $OMPI_COMM_WORLD_LOCAL_SIZE ]]; then
if [[ $OMPI_COMM_WORLD_RANK == 0 ]] || [[ $LS_DEBUG == 1 ]]; then
echo "Error: unsatisfiable value for MPI_PER_GPU. $OMPI_COMM_WORLD_LOCAL_SIZE ranks per node spawned, but only $nGPUs GPUs x $MPI_PER_GPU ranks requested."
echo "Try setting MPI_PER_GPU=$(((OMPI_COMM_WORLD_LOCAL_SIZE+nGPUs-1)/nGPUs))"
else
sleep 2
fi
[[ $OMPI_COMM_WORLD_RANK == 1 ]] && echo "$0 error"
exit 105
fi
fi
# LS_PCI is an experimental flag to skip GPUs. Default LS_PCI=1 uses all GPUs; LS_PCI=2 uses every other one (0,2,4,6 on an 8-GPU node).
if [[ -n $LS_PCI ]]; then
if [[ $((nGPUs*MPI_PER_GPU)) -lt $((OMPI_COMM_WORLD_LOCAL_SIZE*LS_PCI)) ]]; then
if [[ $OMPI_COMM_WORLD_RANK == 0 ]] || [[ $LS_DEBUG == 1 ]]; then
echo "Error: unsatisfiable value for LS_PCI. $OMPI_COMM_WORLD_LOCAL_SIZE ranks per node spawned, but only $((nGPUs/LS_PCI)) GPUs x $MPI_PER_GPU ranks requested."
else
sleep 2
fi
[[ $OMPI_COMM_WORLD_RANK == 1 ]] && echo "$0 error"
exit 106
fi
## keep only every LS_PCI-th GPU
for igpu in $(seq 0 $((nGPUs/LS_PCI-1))); do
GPUS[$igpu]=${GPUS[$((igpu*LS_PCI))]}
done
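# e.g. on an 8-GPU node with LS_PCI=2, the loop above rewrites the first nGPUs/LS_PCI slots to (0 2 4 6); the leftover slots are unset below.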
for igpu in $(seq $((nGPUs/LS_PCI)) $((nGPUs-1))); do
unset "GPUS[$igpu]"
done
nGPUs=$((nGPUs/LS_PCI))
if [[ $nGPUs != ${#GPUS[@]} ]]; then
echo "Internal $0 error. Please report this bug."
exit 107
fi
if [[ $LS_DEBUG == 1 ]] && [[ $OMPI_COMM_WORLD_RANK == 0 ]]; then
echo "Debug LS_PCI GPUs: ${GPUS[*]} CPUS: ${CPUS[*]}"
fi
LS_PCI_DEBUG_STRING=" (some GPUs are hidden due to LS_PCI)"
else
LS_PCI=1
LS_PCI_DEBUG_STRING=""
fi
## BINDING
my_gpu_id=$((OMPI_COMM_WORLD_LOCAL_RANK/MPI_PER_GPU))
my_cpu_id=$((OMPI_COMM_WORLD_LOCAL_RANK*nCPUs/nGPUs/MPI_PER_GPU))
my_nic_id=$((OMPI_COMM_WORLD_LOCAL_RANK*nNICs/nGPUs/MPI_PER_GPU))
nmpi_per_node=$((MPI_PER_GPU*nGPUs)) # Max number of ranks that could be spawned if the node were full. More reliable for calculations than OMPI_COMM_WORLD_LOCAL_SIZE, which can sometimes differ between nodes.
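# Illustration: with 8 GPUs, 8 NUMA nodes, 8 NICs and MPI_PER_GPU=1, local rank r gets GPU r, socket r and NIC r;
# with MPI_PER_GPU=2, local ranks 0 and 1 share GPU 0, ranks 2 and 3 share GPU 1, and so on.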
# GPU binding using CUDA_VISIBLE_DEVICES
export CUDA_VISIBLE_DEVICES=${GPUS[$my_gpu_id]}
## NIC binding
export OMPI_MCA_btl_openib_if_include=${NICS[$my_nic_id]}
export UCX_NET_DEVICES=$OMPI_MCA_btl_openib_if_include:1
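# e.g. if the selected NIC is mlx5_1, UCX is pinned to "mlx5_1:1" (the ":1" suffix selects port 1 of the device).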
## CPU binding
# This one is trickier, since two cases must be handled: multiple NUMA nodes per GPU, or multiple GPUs per NUMA node.
# Note: we assume here that all CPUs must be used and are distinct; if some NUMA nodes were disabled due to proximity, that happened in the earlier steps.
nsockets_per_mpi=1
nmpi_per_socket=$((nmpi_per_node/nCPUs))
if [[ $nmpi_per_socket == 0 ]]; then
#Weird case: multiple sockets are available per MPI rank
nsockets_per_mpi=$((nCPUs/nGPUs/MPI_PER_GPU))
nmpi_per_socket=1
fi
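# Worked example: 4 ranks per node on a machine exposing 8 NUMA nodes gives nmpi_per_socket = 4/8 = 0,
# so the branch above instead gives each rank nsockets_per_mpi = 8/4 = 2 NUMA nodes.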
requested_env_variable="SCRIPT_AUTODETECTION"
if [[ -z $NCORES_PER_MPI ]]; then
if [[ -z $OMP_NUM_THREADS ]]; then
nthreads_per_mpi=$((nthreads_avail_per_socket/nmpi_per_socket))
else
requested_env_variable="OMP_NUM_THREADS"
nthreads_per_mpi=$OMP_NUM_THREADS
fi
else
requested_env_variable="NCORES_PER_MPI"
nthreads_per_mpi=$NCORES_PER_MPI
fi
if [[ $nsockets_per_mpi -gt 1 ]]; then
nthreads_avail_per_mpi=$((nthreads_avail_per_socket*nsockets_per_mpi))
else
nthreads_avail_per_mpi=$((nthreads_avail_per_socket/nmpi_per_socket))
fi
if [[ $nthreads_avail_per_mpi -le 0 ]]; then
if ! $has_set_ngpus && [[ $OMPI_COMM_WORLD_SIZE -lt $nmpi_per_socket ]]; then
if [[ $OMPI_COMM_WORLD_RANK == 0 ]] || [[ $LS_DEBUG == 1 ]]; then
echo "$0 Error: the requested binding settings would cause overlap if all GPUs were used on the node. This is considered an error as it complicates the script logic. Please explicitly set the number of GPUs wou want to use with export LS_NGPUS=<number of GPUs>"
else
sleep 2
fi
exit 114
else
if [[ $OMPI_COMM_WORLD_RANK == 0 ]] || [[ $LS_DEBUG == 1 ]]; then
echo "$0 error. The number of MPI ranks per socket ($nmpi_per_socket) exceeds cores available ($nthreads_avail_per_socket). This is possibly the result of setting MPI_PER_GPU to a value that would cause overlap."
if [[ -z $LS_PCI ]] || [[ $LS_PCI == 1 ]]; then
echo "If you plan to use only a subset of the GPUs, please explicitly set the number of GPUs wou want to use with export LS_NGPUS=<number of GPUs> or use the LS_PCI variable."
fi
else
sleep 2
fi
exit 112
fi
fi
if [[ $nthreads_per_mpi -gt $nthreads_avail_per_mpi ]]; then
if [[ $OMPI_COMM_WORLD_RANK == 0 ]] || [[ $LS_DEBUG == 1 ]]; then
echo "$0 Error: oversubscription detected. $nthreads_per_mpi cores per rank requested through $requested_env_variable, but only $nthreads_avail_per_mpi available."
if [[ $LS_HYPERTHREAD == true ]] || [[ $LS_HYPERTHREAD == 1 ]]; then
echo "Please set $requested_env_variable=$nthreads_avail_per_mpi"
else
echo "If you intended to use hyperthreading, please export LS_HYPERTHREAD=true. Otherwise, set $requested_env_variable=$nthreads_avail_per_mpi"
fi
else
sleep 2
fi
exit 108
fi
if [[ $LS_DEBUG == 1 ]] && [[ $OMPI_COMM_WORLD_RANK == 0 ]]; then
echo "Debug nthreads_avail_per_socket=$nthreads_avail_per_socket nthreads_per_mpi=$nthreads_per_mpi nthreads_avail_per_mpi=$nthreads_avail_per_mpi nsockets_per_mpi=$nsockets_per_mpi nmpi_per_socket=$nmpi_per_socket"
fi
nthreads_avail_per_socket_per_mpi=$(((nthreads_avail_per_mpi+nsockets_per_mpi-1)/nsockets_per_mpi))
nthreads_per_socket_per_mpi=$(((nthreads_per_mpi+nsockets_per_mpi-1)/nsockets_per_mpi))
n=$nthreads_per_mpi
#finally do the socket/core binding
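# Sketch of the two orderings, assuming 16 cores per socket, 2 ranks per socket and 4 threads per rank:
# CORE_ORDER=sequential packs rank 0 on cores 0,1,2,3 and rank 1 on cores 4,5,6,7, while the default
# scaled mode spreads rank 0 over cores 0,2,4,6 and rank 1 over 8,10,12,14.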
cores=
for my_id in $(seq $my_cpu_id $((my_cpu_id+nsockets_per_mpi-1)) ); do
isocket=${CPUS[$my_id]}
#establish list of cores
intrasocket_rank=$((OMPI_COMM_WORLD_LOCAL_RANK % nmpi_per_socket))
hyperthread_start=$((ncores_avail_per_socket*nsockets))
ncores_per_mpi=$((nthreads_per_socket_per_mpi/hyperthread_cores))
if [[ $ncores_per_mpi == 0 ]]; then
ncores_per_mpi=1
fi
if [[ $CORE_ORDER == sequential ]]; then
first=$((isocket*ncores_avail_per_socket + intrasocket_rank*nthreads_per_socket_per_mpi))
ihyper=$(((isocket+1)*ncores_avail_per_socket))
for i in $(seq 0 $((nthreads_per_socket_per_mpi-1))); do
if [[ $((first+i)) -lt $ihyper ]]; then
if [[ $n -gt 0 ]]; then
cores=$cores$((first+i)),
n=$((n-1))
fi
else
if [[ $n -gt 0 ]]; then
#hyperthreading
cores=$cores$((hyperthread_start+i+first-ihyper+isocket*ncores_avail_per_socket)),
n=$((n-1))
fi
fi
done
else
#Scaled over the range of cores available
first=$((isocket*ncores_avail_per_socket + intrasocket_rank*nthreads_avail_per_socket_per_mpi))
ihyper=$(((isocket+1)*ncores_avail_per_socket))
for i in $(seq 0 $((nthreads_per_socket_per_mpi-1))); do
idest=$((first+i*nthreads_avail_per_socket_per_mpi/nthreads_per_socket_per_mpi))
if [[ $idest -lt $ihyper ]]; then
if [[ $n -gt 0 ]]; then
cores=$cores$idest,
n=$((n-1))
fi
else
if [[ $n -gt 0 ]]; then
#hyperthreading
cores=$cores$((hyperthread_start+idest-ihyper+isocket*ncores_avail_per_socket)),
n=$((n-1))
fi
fi
done
fi
done
#remove last comma
cores=${cores::-1}
if [[ -n $OMP_NUM_THREADS ]] && [[ -z $OMP_PROC_BIND ]]; then
export OMP_PROC_BIND=true
fi
if [[ -n "$FORCE_UCX_NET_DEVICES" ]]; then
if [[ "$FORCE_UCX_NET_DEVICES" == unset ]]; then
unset UCX_NET_DEVICES
else
export UCX_NET_DEVICES="$FORCE_UCX_NET_DEVICES"
fi
fi
if [[ $GPU_BINDING == unset ]]; then
unset CUDA_VISIBLE_DEVICES
fi
if [[ $GOMP_PIN == 1 ]]; then
export GOMP_CPU_AFFINITY=$cores
fi
if [[ -z $ENABLE_MPS ]]; then
if [[ -z $MPI_PER_GPU ]] || [[ $MPI_PER_GPU == 1 ]]; then
export ENABLE_MPS=false
else
export ENABLE_MPS=true
fi
fi
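# e.g. MPI_PER_GPU=2 mpirun -n 8 binder.sh <app> enables MPS automatically; export ENABLE_MPS=false to opt out.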
if [[ $ENABLE_MPS == true ]] || [[ $ENABLE_MPS == 1 ]]; then
if [[ $OMPI_COMM_WORLD_LOCAL_RANK == 0 ]];then
env -u CUDA_VISIBLE_DEVICES nvidia-cuda-mps-control -d && echo "Starting MPS on $(hostname)"
else
# wait until MPS is started
timeout 10 bash -c 'until pgrep nvidia-cuda-mps >/dev/null; do sleep 0.5; done' || echo "WARNING: $0 Timed out waiting for the MPS daemon to start. Consider running without MPS to reduce startup overhead (ENABLE_MPS=false)."
fi
fi
USE_NUMACTL=${USE_NUMACTL:-true}
USE_MEMORY_BINDING=${USE_MEMORY_BINDING:-true}
ADD_MEM_BINDING=""
MEMLIST="<off>"
if $USE_NUMACTL && $USE_MEMORY_BINDING; then
MEMLIST="${CPUS[$my_cpu_id]}"
for my_id in $(seq $((my_cpu_id+1)) $((my_cpu_id+nsockets_per_mpi-1)) ); do
MEMLIST="$MEMLIST,${CPUS[$my_id]}"
done
ADD_MEM_BINDING="--membind=$MEMLIST"
fi
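# e.g. a rank spanning NUMA nodes 2 and 3 ends up with ADD_MEM_BINDING="--membind=2,3".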
export HIP_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
echo "MPI Rank $OMPI_COMM_WORLD_RANK host $(hostname) GPU=$CUDA_VISIBLE_DEVICES cores=$cores membind=$MEMLIST UCX_NET_DEVICES=$UCX_NET_DEVICES"
# Note: exec replaces this shell, so nothing placed after it would ever run. When this rank started the
# MPS daemon, run the app as a child instead, so we can stop MPS once the app exits.
if { [[ $ENABLE_MPS == true ]] || [[ $ENABLE_MPS == 1 ]]; } && [[ $OMPI_COMM_WORLD_LOCAL_RANK == 0 ]]; then
if $USE_NUMACTL; then
numactl -a --physcpubind=$cores $ADD_MEM_BINDING "$@"
else
taskset -c $cores "$@"
fi
ret=$?
echo quit | nvidia-cuda-mps-control && echo "Stopped MPS"
exit $ret
fi
if $USE_NUMACTL; then
exec numactl -a --physcpubind=$cores $ADD_MEM_BINDING "$@"
else
exec taskset -c $cores "$@"
fi