@@ -1806,22 +1806,13 @@ def _new_process_group_helper(
1806
1806
group_rank ,
1807
1807
group_size ,
1808
1808
)
1809
- backend_config = BackendConfig (backend )
1810
1809
# Set the default backend when only a single backend is passed in.
1811
1810
if "," not in str (backend ) and ":" not in str (backend ):
1812
1811
assert backend in Backend .backend_type_map , f"Unknown backend type { backend } "
1813
- if backend == Backend .UNDEFINED :
1814
- # Currently when backend is UNDEFINED, both ``gloo`` and ``nccl`` backends
1815
- # will be created, we use nccl(if cuda is available) or gloo as default
1816
- # backend so we can correctly call getDefaultBackend which in ProcessGroup.
1817
- if Backend .NCCL in backend_config .get_device_backend_map ().values ():
1818
- pg ._set_default_backend (Backend .backend_type_map [Backend .NCCL ])
1819
- else :
1820
- pg ._set_default_backend (Backend .backend_type_map [Backend .GLOO ])
1821
- else :
1822
- pg ._set_default_backend (Backend .backend_type_map [backend ])
1812
+ pg ._set_default_backend (Backend .backend_type_map [backend ])
1823
1813
if device_id :
1824
1814
pg .bound_device_id = device_id
1815
+ backend_config = BackendConfig (backend )
1825
1816
backend_class : torch ._C ._distributed_c10d .Backend
1826
1817
for device , backend_str in backend_config .get_device_backend_map ().items ():
1827
1818
# Use the group name as prefix in the default store, such that
@@ -2028,7 +2019,7 @@ def destroy_process_group(group: Optional[ProcessGroup] = None):
2028
2019
# alive until all works and hooks are done. The current implementation does the
2029
2020
# latter. Therefore, we explicitly call _wait_for_pending_works() here to wait
2030
2021
# for the pending hooks to finish.
2031
- if type ( pg ) == ProcessGroup and pg ._has_hooks ():
2022
+ if pg . name (). lower () == "nccl" and pg ._has_hooks ():
2032
2023
pg ._wait_for_pending_works ()
2033
2024
2034
2025
if group is None or group == GroupMember .WORLD :
@@ -2568,17 +2559,15 @@ def batch_isend_irecv(p2p_op_list):
2568
2559
"""
2569
2560
_check_p2p_op_list (p2p_op_list )
2570
2561
group = p2p_op_list [0 ].group
2571
- if group is None :
2572
- group = _get_default_group ()
2573
2562
device = p2p_op_list [0 ].tensor .device
2574
- if group . _get_backend ( device ). supports_coalescing :
2575
- # backend support coalescing
2563
+ if device . type == "cuda" :
2564
+ # NCCL style coalescing
2576
2565
with _coalescing_manager (group , device , async_ops = True ) as cm :
2577
2566
for p2p_op in p2p_op_list :
2578
2567
p2p_op .op (p2p_op .tensor , p2p_op .peer , p2p_op .group , p2p_op .tag )
2579
2568
return cm .works
2580
2569
else :
2581
- # backend not support coalescing
2570
+ # Backward support for Gloo
2582
2571
reqs = []
2583
2572
for p2p_op in p2p_op_list :
2584
2573
work = p2p_op .op (p2p_op .tensor , p2p_op .peer , p2p_op .group , p2p_op .tag )
0 commit comments