-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsrun_launcher.sh.example
41 lines (33 loc) · 1.05 KB
/
srun_launcher.sh.example
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/bin/bash
# Requires: the PORT environment variable (exported below as MASTER_PORT).
#######################################
# Handle SIGUSR1 by forwarding it to the process whose ID is stored in PID.
# Globals:   PID (read) - process ID to signal; may still be unset/empty
# Outputs:   a timestamped notice on stdout; a warning on stderr when there
#            is no PID to signal
# Returns:   0 when there is nothing to signal, otherwise the status of kill
#######################################
handle_signal()
{
  echo "$(date) Signal received..."
  # The signal can arrive before PID has been assigned; calling kill without
  # a target would only print a usage error and fail.
  if [ -z "${PID:-}" ]; then
    echo "No child process to forward SIGUSR1 to yet" >&2
    return 0
  fi
  kill -s SIGUSR1 "$PID"
}
# Route SIGUSR1 through handle_signal so it reaches the process started below.
trap handle_signal SIGUSR1

# Environment
echo "Set up your environment here, for example, conda activate {env} or source {env}/bin/activate"

# PORT is copied into MASTER_PORT below; warn early instead of silently
# exporting an empty value.
if [ -z "${PORT:-}" ]; then
  echo "warning: PORT is not set; MASTER_PORT will be empty" >&2
fi

# Multi-GPU
# Derive the per-node task count from the totals when it is not provided.
if [ -z "${SLURM_NTASKS_PER_NODE:-}" ]; then
  SLURM_NTASKS_PER_NODE=$((SLURM_NTASKS / SLURM_NNODES))
fi
export WORLD_SIZE=$((SLURM_NTASKS_PER_NODE * SLURM_NNODES))
export LOCAL_WORLD_SIZE=$SLURM_NTASKS_PER_NODE
# The first host of the job's node list serves as the master address.
FIRSTNODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR=$FIRSTNODE
export MASTER_PORT=$PORT
export LOCAL_RANK=$SLURM_LOCALID
# Global rank = node index * tasks per node + index of this task on its node.
export RANK=$((SLURM_NODEID * SLURM_NTASKS_PER_NODE + SLURM_LOCALID))
export OMP_NUM_THREADS=8
echo "master $MASTER_ADDR, port $MASTER_PORT, world size $WORLD_SIZE, local world size $LOCAL_WORLD_SIZE, local rank $LOCAL_RANK, rank $RANK"

# Run in the background so this shell stays free to execute the SIGUSR1 trap.
# "$@" is quoted so every argument reaches python exactly as it was passed.
WANDB_PROJECT=lm-training WANDB_MODE=offline python run_clm.py "$@" &
PID="$!"
echo "$PID"

# `wait` returns early (status > 128) when a trapped signal is delivered,
# even though the child is still running. Keep waiting until the child is
# really gone so CODE holds the child's own exit status.
wait "$PID"
CODE=$?
while kill -0 "$PID" 2>/dev/null; do
  wait "$PID"
  CODE=$?
done
echo "$PID exit code $CODE"

# Reap any remaining background jobs before exiting.
wait
exit "$CODE"