We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
2 parents 6b97d4b + d4fdb31, commit cd548c3 (copy full SHA for cd548c3)
orchestration/slurm/launchers/accelerate-launcher.slurm
@@ -33,6 +33,7 @@ ACCELERATE_CONFIG_FILE=accelerate.yaml
33
# EDIT if it's not 8-gpus per node
34
GPUS_PER_NODE=8
35
NNODES=$SLURM_NNODES
36
+NUM_PROCESSES=$(($NNODES * $GPUS_PER_NODE))
37
38
# define the node 0 hostname:port
39
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
@@ -45,6 +46,8 @@ MASTER_PORT=6000
45
46
LAUNCHER="python -u -m accelerate.commands.launch \
47
--rdzv_conf "rdzv_backend=c10d,rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT" \
48
--config_file $ACCELERATE_CONFIG_FILE \
49
+ --num_processes $NUM_PROCESSES \
50
+ --num_machines $NNODES \
51
--main_process_ip $MASTER_ADDR \
52
--main_process_port $MASTER_PORT \
53
--machine_rank \$SLURM_PROCID \
0 commit comments