
Commit b7ad52a

zhangxiaoli73 authored and pytorchmergebot committed
Use new group instead of split group on non-CUDA device (pytorch#141469)
Motivation: Currently, `split_group` only works for the NCCL backend (https://github.com/pytorch/pytorch/blob/main/torch/distributed/distributed_c10d.py#L4745), so we need to use `new_group` on non-CUDA devices instead.

Pull Request resolved: pytorch#141469
Approved by: https://github.com/kwen2501, https://github.com/gujinghui, https://github.com/albanD
1 parent 57c46af commit b7ad52a
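As a rough illustration of the dispatch described in the commit message, here is a minimal, hypothetical sketch (not the actual `DeviceMesh` code): `split_group` is only attempted when the default group is bound to a CUDA device, and everything else falls back to `new_group`. The helper name `make_subgroup` and its arguments are illustrative only.

```python
import torch
import torch.distributed as dist
from torch.distributed.distributed_c10d import split_group


def make_subgroup(default_group, subgroup_ranks):
    # `bound_device_id` is only set when the default group was initialized
    # eagerly with a concrete CUDA device (i.e. an NCCL group).
    bound_device_id = getattr(default_group, "bound_device_id", None)
    if bound_device_id is not None and torch.cuda.is_available():
        # Eager split: supported only by the NCCL backend today.
        return split_group(parent_pg=default_group, split_ranks=[subgroup_ranks])
    # Generic path: works for gloo and other non-CUDA backends.
    return dist.new_group(ranks=subgroup_ranks)
```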

File tree

2 files changed: +5 −3 lines changed


torch/distributed/device_mesh.py

Lines changed: 4 additions & 2 deletions
@@ -560,17 +560,19 @@ def _init_process_groups(self):
             # numbers of API calls are equal to the number of subgroups for each mesh dimension. In a 2 * 4
             # mesh, we need to make 2 + 4 = 6 API calls per ranks to create all the subgroups.
             dim_group = None
+            has_split_group = False
             if (
                 bound_device_id := getattr(
                     default_group, "bound_device_id", None
                 )
-            ) is not None:
+            ) is not None and torch.cuda.is_available():
                 dim_group = split_group(
                     parent_pg=default_group,
                     pg_options=pg_options,
                     split_ranks=pg_ranks_by_dim.tolist(),
                     group_desc=group_desc,
                 )
+                has_split_group = True
 
             # If the subgroup has been already created through `split_group`, we simply loop over `pg_ranks_by_dim`
             # and append the `(group_tag, subgroup_ranks, and group_name)` tuple to the `dim_group_infos` list when
@@ -583,7 +585,7 @@ def _init_process_groups(self):
                 # We temporarily revert the re-use subgroup, since it breaks two internal tests.
                 # Temporarily reverting to resolve test timeout while root-causing.
                 # TODO: Add two tests to cover internal tests scenarios and re-enable reuse subgroup if exists.
-                if bound_device_id is None:
+                if bound_device_id is None or not has_split_group:
                     dim_group = new_group(
                         ranks=subgroup_ranks,
                         backend=backend,
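For context, a minimal usage sketch that exercises the fallback path on a CPU-only backend, assuming a single-process gloo setup (the address and port below are placeholders): with gloo there is no bound CUDA device, so `DeviceMesh` now takes the `new_group` path rather than `split_group`.

```python
import os
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group(backend="gloo", rank=0, world_size=1)

# No bound CUDA device here, so subgroup creation goes through `new_group`.
mesh = init_device_mesh("cpu", (1,), mesh_dim_names=("dp",))
print(mesh)

dist.destroy_process_group()
```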

torch/distributed/tensor/parallel/fsdp.py

Lines changed: 1 addition & 1 deletion
@@ -186,7 +186,7 @@ def _chunk_tensor(
             inner_param,
             rank,
             world_size,
-            torch.cuda.device_count(),
+            torch.accelerator.device_count(),
             pg,
         )
 
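A minimal sketch of the device-agnostic counting this line switches to, assuming a PyTorch build that exposes `torch.accelerator`; `local_device_count` is a hypothetical helper that falls back to the CUDA-specific query on older builds.

```python
import torch


def local_device_count() -> int:
    # Prefer the backend-agnostic accelerator API when present (CUDA, XPU, ...).
    if hasattr(torch, "accelerator") and torch.accelerator.is_available():
        return torch.accelerator.device_count()
    # Older builds: fall back to the CUDA-specific query.
    return torch.cuda.device_count()


print(local_device_count())
```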

0 commit comments
