#!/usr/bin/env python
- #
- # This a `torch.distributed` diagnostics script that checks that all GPUs in the cluster (one or
- # many nodes) can talk to each other via nccl and allocate gpu memory.
- #
- # To run first adjust the number of processes and nodes:
- #
- # python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
- #
- # You may need to add --master_addr $MASTER_ADDR --master_port $MASTER_PORT if using a custom addr:port
- #
- # You can also use the rdzv API: --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT --rdzv_backend c10d
- #
- # use torch.distributed.launch instead of torch.distributed.run for torch < 1.9
- #
- # If you get a hanging in `barrier` calls you have some network issues, you may try to debug this with:
- #
- # NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
- #
- # which should tell you what's going on behind the scenes.
- #
- #
- # This script can be run via `srun` in the SLURM environment as well. Here is a SLURM script that
- # runs on 2 nodes of 4 gpus per node:
-
- # #!/bin/bash
- # #SBATCH --job-name=test-nodes # name
- # #SBATCH --nodes=2 # nodes
- # #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
- # #SBATCH --cpus-per-task=10 # number of cores per tasks
- # #SBATCH --gres=gpu:4 # number of gpus
- # #SBATCH --time 0:05:00 # maximum execution time (HH:MM:SS)
- # #SBATCH --output=%x-%j.out # output file name
- #
- # export GPUS_PER_NODE=4
- # export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
- # export MASTER_PORT=6000
- #
- # srun --jobid $SLURM_JOBID bash -c 'python -m torch.distributed.run \
- # --nproc_per_node $GPUS_PER_NODE --nnodes $SLURM_NNODES --node_rank $SLURM_PROCID \
- # --master_addr $MASTER_ADDR --master_port $MASTER_PORT \
- # torch-distributed-gpu-test.py'
- #
- # can also add this for automatic prefixing of all logs with [hostname:rank] (in addition to `--master_addr` etc)
- # --role `hostname -s`: --tee 3 \
- #
+ """
+
+ This is a `torch.distributed` diagnostics script that checks that all GPUs in the cluster (one or
+ many nodes) can talk to each other via nccl and allocate gpu memory. It also prints other useful information like NUMA affinities.
+
+ To run it you just need to adjust the number of processes and nodes according to your use case:
+
+ ```
+ python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
+ ```
+
+ You may need to add `--master_addr $MASTER_ADDR --master_port $MASTER_PORT` if using a custom addr:port.
+
+ You can also use the rdzv API: `--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT --rdzv_backend c10d`.
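+
+ For example (assuming the same single-node, 2-process setup as above), the full rdzv-based launch would look something like:
+
+ ```
+ python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 \
+     --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT --rdzv_backend c10d \
+     torch-distributed-gpu-test.py
+ ```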
+
+ If this script hangs in `barrier` calls, you have some network issues; you may try to debug them with:
+
+ ```
+ NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
+ ```
+
+ which should tell you what's going on behind the scenes.
+
+ This script can be run via `srun` in the SLURM environment as well. Here is a SLURM script that
+ runs on 2 nodes with 8 gpus per node:
+
+ ```
+ #!/bin/bash
+ #SBATCH --job-name=test-nodes # name
+ #SBATCH --nodes=2 # EDIT to the number of nodes
+ #SBATCH --ntasks-per-node=1 # crucial - only 1 task per node for this script
+ #SBATCH --cpus-per-task=10 # EDIT this to how many cpu cores the node has
+ #SBATCH --gres=gpu:8 # EDIT this if it's not an 8-GPU node setup
+ #SBATCH --partition=dev # EDIT to the desired partition name
+ #SBATCH --time 0:05:00 # 5 min should be enough
+ #SBATCH --output=%x-%j.out # output file name
+
+ export GPUS_PER_NODE=8
+ export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+ export MASTER_PORT=6000
+
+ srun --jobid $SLURM_JOBID bash -c 'python -m torch.distributed.run \
+ --nproc_per_node $GPUS_PER_NODE --nnodes $SLURM_NNODES --node_rank $SLURM_PROCID \
+ --master_addr $MASTER_ADDR --master_port $MASTER_PORT \
+ torch-distributed-gpu-test.py'
+ ```
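+
+ Assuming you saved the above as e.g. `test-nodes.slurm` (the filename here is arbitrary), you would submit it with `sbatch test-nodes.slurm`.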
+
+ You can also add this to the launcher for automatic prefixing of all logs with `[hostname:rank] ` (e.g. after `--master_addr`):
+
+ ```
+ --role `hostname -s`: --tee 3
+ ```
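+
+ For example, with those flags added the `srun` launcher above becomes:
+
+ ```
+ srun --jobid $SLURM_JOBID bash -c 'python -m torch.distributed.run \
+ --nproc_per_node $GPUS_PER_NODE --nnodes $SLURM_NNODES --node_rank $SLURM_PROCID \
+ --master_addr $MASTER_ADDR --master_port $MASTER_PORT \
+ --role `hostname -s`: --tee 3 \
+ torch-distributed-gpu-test.py'
+ ```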
+
+ """
import builtins
import fcntl
@@ -67,9 +76,12 @@ def print(*args, **kwargs):
device = torch.device("cuda", local_rank)
hostname = socket.gethostname()

- gpu = f"[{hostname}-{local_rank}]"
+ gpu = f"[{hostname}:{local_rank}]"

try:
+ # XXX: possibly change the dist timeout to something much shorter to get this script to fail
+ # fast if there is a problem and not wait for the default 30min
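+ # for example, `init_process_group` below accepts a `timeout` kwarg (a `datetime.timedelta`), so
+ # something like `timeout=timedelta(seconds=120)` (with `from datetime import timedelta`) could be
+ # passed to make it give up much sooner
+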
# test distributed
dist.init_process_group("nccl")
@@ -96,5 +108,5 @@ def print(*args, **kwargs):
print(f"pytorch compute capabilities={torch.cuda.get_arch_list()}")

except Exception:
- print(f"{gpu} is broken")
+ print(f"{gpu} is broken (but it could also mean that it failed because another gpu didn't respond)")
raise