File tree 3 files changed +19
-15
lines changed
3 files changed +19
-15
lines changed Load Diff This file was deleted.
Original file line number Diff line number Diff line change 3
3
#SBATCH --partition=gputest
4
4
#SBATCH --ntasks=1
5
5
#SBATCH --cpus-per-task=40
6
- #SBATCH --mem=0
6
+ #SBATCH --mem=320G
7
7
#SBATCH --time=15
8
8
#SBATCH --gres=gpu:v100:4
9
9
@@ -12,6 +12,6 @@ module load pytorch
12
12
13
13
# pip install --user accelerate
14
14
15
- srun accelerate launch --multi_gpu --num_processes=4 --num_machines=1 \
15
+ srun apptainer_wrapper exec accelerate launch --multi_gpu --num_processes=4 --num_machines=1 \
16
16
--mixed_precision=bf16 --dynamo_backend=no \
17
17
mnist_accelerate.py --epochs=100
Original file line number Diff line number Diff line change 4
4
#SBATCH --nodes=2
5
5
#SBATCH --ntasks-per-node=1
6
6
#SBATCH --cpus-per-task=40
7
- #SBATCH --mem=0
7
+ #SBATCH --mem=320G
8
8
#SBATCH --time=15
9
9
#SBATCH --gres=gpu:v100:4
10
10
11
11
module purge
12
12
module load pytorch
13
13
14
- # pip install --user accelerate
14
+ GPUS_PER_NODE=4
15
+ MASTER_ADDR=$( hostname -i)
16
+ MASTER_PORT=12802
15
17
16
- MASTER_IP=$( ip -4 -brief addr show | grep -E ' hsn0|ib0' | grep -oP ' ([\d]+.[\d.]+)' )
17
- MASTER_PORT=29400
18
-
19
- srun accelerate.sh --multi_gpu --num_processes=8 --num_machines=2 \
20
- --mixed_precision=no --dynamo_backend=no \
21
- --main_process_ip=$MASTER_IP --main_process_port=$MASTER_PORT \
22
- mnist_accelerate.py --epochs=100
18
+ # Note: --machine_rank must be evaluated on each node, hence the LAUNCH_CMD setup
19
+ export LAUNCH_CMD="
20
+ accelerate launch \
21
+ --multi_gpu --mixed_precision no \
22
+ --num_machines=${SLURM_NNODES} \
23
+ --num_processes=$( expr ${SLURM_NNODES} \* ${GPUS_PER_NODE} ) \
24
+ --machine_rank=\${SLURM_NODEID} \
25
+ --main_process_ip=${MASTER_ADDR} \
26
+ --main_process_port=${MASTER_PORT} \
27
+ mnist_accelerate.py --epochs=100 \
28
+ "
29
+ echo ${LAUNCH_CMD}
30
+ srun singularity_wrapper exec bash -c " ${LAUNCH_CMD} "
You can’t perform that action at this time.
0 commit comments