File tree Expand file tree Collapse file tree 3 files changed +19
-15
lines changed
Expand file tree Collapse file tree 3 files changed +19
-15
lines changed Load Diff This file was deleted.
Original file line number Diff line number Diff line change 33#SBATCH --partition=gputest
44#SBATCH --ntasks=1
55#SBATCH --cpus-per-task=40
6- #SBATCH --mem=0
6+ #SBATCH --mem=320G
77#SBATCH --time=15
88#SBATCH --gres=gpu:v100:4
99
@@ -12,6 +12,6 @@ module load pytorch
1212
1313# pip install --user accelerate
1414
15- srun accelerate launch --multi_gpu --num_processes=4 --num_machines=1 \
15+ srun apptainer_wrapper exec accelerate launch --multi_gpu --num_processes=4 --num_machines=1 \
1616 --mixed_precision=bf16 --dynamo_backend=no \
1717 mnist_accelerate.py --epochs=100
Original file line number Diff line number Diff line change 44#SBATCH --nodes=2
55#SBATCH --ntasks-per-node=1
66#SBATCH --cpus-per-task=40
7- #SBATCH --mem=0
7+ #SBATCH --mem=320G
88#SBATCH --time=15
99#SBATCH --gres=gpu:v100:4
1010
1111module purge
1212module load pytorch
1313
14- # pip install --user accelerate
14+ GPUS_PER_NODE=4
15+ MASTER_ADDR=$(hostname -i)
16+ MASTER_PORT=12802
1517
16- MASTER_IP=$( ip -4 -brief addr show | grep -E ' hsn0|ib0' | grep -oP ' ([\d]+.[\d.]+)' )
17- MASTER_PORT=29400
18-
19- srun accelerate.sh --multi_gpu --num_processes=8 --num_machines=2 \
20- --mixed_precision=no --dynamo_backend=no \
21- --main_process_ip=$MASTER_IP --main_process_port=$MASTER_PORT \
22- mnist_accelerate.py --epochs=100
18+ # Note: --machine_rank must be evaluated on each node, hence the LAUNCH_CMD setup
19+ export LAUNCH_CMD="
20+ accelerate launch \
21+ --multi_gpu --mixed_precision no \
22+ --num_machines=${SLURM_NNODES} \
23+ --num_processes=$( expr ${SLURM_NNODES} \* ${GPUS_PER_NODE} ) \
24+ --machine_rank=\${SLURM_NODEID} \
25+ --main_process_ip=${MASTER_ADDR} \
26+ --main_process_port=${MASTER_PORT} \
27+ mnist_accelerate.py --epochs=100 \
28+ "
29+ echo ${LAUNCH_CMD}
30+ srun singularity_wrapper exec bash -c "${LAUNCH_CMD}"
You can’t perform that action at this time.
0 commit comments