Skip to content

Commit 4467e84

Browse files
committed
Updated accelerate scripts
1 parent 043dbc5 commit 4467e84

File tree

3 files changed

+19
-15
lines changed

3 files changed

+19
-15
lines changed

accelerate.sh

-4
This file was deleted.

run-accelerate-gpu4.sh

+2-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#SBATCH --partition=gputest
44
#SBATCH --ntasks=1
55
#SBATCH --cpus-per-task=40
6-
#SBATCH --mem=0
6+
#SBATCH --mem=320G
77
#SBATCH --time=15
88
#SBATCH --gres=gpu:v100:4
99

@@ -12,6 +12,6 @@ module load pytorch
1212

1313
#pip install --user accelerate
1414

15-
srun accelerate launch --multi_gpu --num_processes=4 --num_machines=1 \
15+
srun apptainer_wrapper exec accelerate launch --multi_gpu --num_processes=4 --num_machines=1 \
1616
--mixed_precision=bf16 --dynamo_backend=no \
1717
mnist_accelerate.py --epochs=100

run-accelerate-gpu8.sh

+17-9
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,27 @@
44
#SBATCH --nodes=2
55
#SBATCH --ntasks-per-node=1
66
#SBATCH --cpus-per-task=40
7-
#SBATCH --mem=0
7+
#SBATCH --mem=320G
88
#SBATCH --time=15
99
#SBATCH --gres=gpu:v100:4
1010

1111
module purge
1212
module load pytorch
1313

14-
#pip install --user accelerate
14+
GPUS_PER_NODE=4
15+
MASTER_ADDR=$(hostname -i)
16+
MASTER_PORT=12802
1517

16-
MASTER_IP=$(ip -4 -brief addr show | grep -E 'hsn0|ib0' | grep -oP '([\d]+.[\d.]+)')
17-
MASTER_PORT=29400
18-
19-
srun accelerate.sh --multi_gpu --num_processes=8 --num_machines=2 \
20-
--mixed_precision=no --dynamo_backend=no \
21-
--main_process_ip=$MASTER_IP --main_process_port=$MASTER_PORT \
22-
mnist_accelerate.py --epochs=100
18+
# Note: --machine_rank must be evaluated separately on each node, so \${SLURM_NODEID} is escaped inside LAUNCH_CMD and only expanded by the "bash -c" that srun starts on every node
19+
export LAUNCH_CMD="
20+
accelerate launch \
21+
--multi_gpu --mixed_precision no \
22+
--num_machines=${SLURM_NNODES} \
23+
--num_processes=$(expr ${SLURM_NNODES} \* ${GPUS_PER_NODE}) \
24+
--machine_rank=\${SLURM_NODEID} \
25+
--main_process_ip=${MASTER_ADDR} \
26+
--main_process_port=${MASTER_PORT} \
27+
mnist_accelerate.py --epochs=100 \
28+
"
29+
echo ${LAUNCH_CMD}
30+
srun singularity_wrapper exec bash -c "${LAUNCH_CMD}"

0 commit comments

Comments
 (0)