Commit a33bc50

pytorchmergebot authored and pobin6 committed
Revert "add supports_coalescing property in c10d::Backend to determine whether backend supports coalescing (pytorch#135338)"
This reverts commit e557444. Reverted pytorch#135338 on behalf of https://github.com/ZainRizvi due to Sorry but this is failing internally. Please see D65663382 for more details ([comment](pytorch#135338 (comment)))
1 parent 0a16f8d · commit a33bc50

File tree

5 files changed: +8 -33 lines

torch/csrc/distributed/c10d/Backend.hpp

-4
@@ -58,10 +58,6 @@ class TORCH_API Backend : public torch::CustomClassHolder {
     return false;
   }
 
-  virtual bool supportsCoalescing() const {
-    return false;
-  }
-
   virtual void startCoalescing() {
     TORCH_CHECK(
         false,
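
For context, the removed virtual is a standard capability flag: the base class conservatively answers false and a capable backend overrides it. A minimal Python sketch of the same pattern — not PyTorch code, with names mirroring the C++ above:

class Backend:
    def supports_coalescing(self) -> bool:
        # Base-class default: assume the backend cannot coalesce ops.
        return False

class ProcessGroupNCCL(Backend):
    def supports_coalescing(self) -> bool:
        # A capable backend opts in (cf. the ProcessGroupNCCL.hpp hunk below).
        return True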

torch/csrc/distributed/c10d/ProcessGroup.hpp

+2 -4
@@ -811,7 +811,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder {
     TORCH_CHECK(
         backendTypeToBackend_.find(backendType_) != backendTypeToBackend_.end(),
         "Could not find the default backend type ",
-        uint16_t(backendType_),
+        backendType_,
         " for Process Group with name ",
         getBackendName(),
         ".");
@@ -832,9 +832,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder {
     TORCH_CHECK(
         backendTypeToBackend_.find(backendType) != backendTypeToBackend_.end(),
         "Could not find backend type ",
-        uint16_t(backendType),
-        " for Process Group with name ",
-        backendTypeToString(backendType),
+        backendType,
         ".");
     return backendTypeToBackend_.at(backendType);
   }
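
Both hunks only reshape the error text: the revert drops the uint16_t cast and streams the backend type directly, and shortens the second message. An illustrative Python analogy for the numeric-versus-direct difference, using enum.Enum rather than the exact C++ semantics:

from enum import Enum

class BackendType(Enum):
    UNDEFINED = 0
    GLOO = 1
    NCCL = 2

bt = BackendType.NCCL
print(f"Could not find backend type {bt.value}.")  # numeric, like the uint16_t cast: "... type 2."
print(f"Could not find backend type {bt}.")        # the member itself: "... type BackendType.NCCL."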

torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp

-4
@@ -598,10 +598,6 @@ class TORCH_API ProcessGroupNCCL : public Backend {
     return true;
   }
 
-  bool supportsCoalescing() const override {
-    return true;
-  }
-
   void startCoalescing() override;
 
   c10::intrusive_ptr<Work> endCoalescing() override;

torch/csrc/distributed/c10d/init.cpp

-4
@@ -2321,10 +2321,6 @@ The hook must have the following signature:
           "supports_splitting",
           &::c10d::Backend::supportsSplitting,
           "(test whether the backend supports splitting)")
-      .def_property_readonly(
-          "supports_coalescing",
-          &::c10d::Backend::supportsCoalescing,
-          "(test whether the backend supports coalescing)")
       .def(
           "broadcast",
           &::c10d::Backend::broadcast,
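
Because the revert removes the supports_coalescing binding, Python code that consulted it needs to probe for the attribute. A hedged sketch, assuming an already-initialized process group; _get_backend is the private accessor that also appears in the distributed_c10d.py diff below, and the helper name here is ours:

import torch

def backend_supports_coalescing(group, device: torch.device):
    backend = group._get_backend(device)
    # True/False on builds where pytorch#135338 is applied; None on builds
    # (such as this revert) where the property no longer exists.
    return getattr(backend, "supports_coalescing", None)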

torch/distributed/distributed_c10d.py

+6 -17
@@ -1806,22 +1806,13 @@ def _new_process_group_helper(
             group_rank,
             group_size,
         )
-        backend_config = BackendConfig(backend)
         # Set the default backend when only single backend is passed in.
         if "," not in str(backend) and ":" not in str(backend):
             assert backend in Backend.backend_type_map, f"Unknown backend type {backend}"
-            if backend == Backend.UNDEFINED:
-                # Currently when backend is UNDEFINED, both ``gloo`` and ``nccl`` backends
-                # will be created, we use nccl(if cuda is available) or gloo as default
-                # backend so we can correctly call getDefaultBackend which in ProcessGroup.
-                if Backend.NCCL in backend_config.get_device_backend_map().values():
-                    pg._set_default_backend(Backend.backend_type_map[Backend.NCCL])
-                else:
-                    pg._set_default_backend(Backend.backend_type_map[Backend.GLOO])
-            else:
-                pg._set_default_backend(Backend.backend_type_map[backend])
+            pg._set_default_backend(Backend.backend_type_map[backend])
         if device_id:
             pg.bound_device_id = device_id
+        backend_config = BackendConfig(backend)
         backend_class: torch._C._distributed_c10d.Backend
         for device, backend_str in backend_config.get_device_backend_map().items():
             # Use the group name as prefix in the default store, such that
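
After this hunk, the single-backend branch maps the passed-in name straight to its type, the Backend.UNDEFINED special-casing (auto-selecting nccl or gloo) is gone, and BackendConfig is constructed later, right before the per-device backend loop. A minimal single-process sketch of the path it exercises; the address and port are placeholders:

import os
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")

# "gloo" contains neither "," nor ":", so _new_process_group_helper takes
# the single-backend branch above and sets it as the group's default.
dist.init_process_group(backend="gloo", rank=0, world_size=1)
print(dist.get_backend())  # gloo
dist.destroy_process_group()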
@@ -2028,7 +2019,7 @@ def destroy_process_group(group: Optional[ProcessGroup] = None):
     # alive until all works and hooks are done. The current implementation does the
     # latter. Therefore, we explicitly call _wait_for_pending_works() here to wait
     # for the pending hooks to finish.
-    if type(pg) == ProcessGroup and pg._has_hooks():
+    if pg.name().lower() == "nccl" and pg._has_hooks():
         pg._wait_for_pending_works()
 
     if group is None or group == GroupMember.WORLD:
@@ -2568,17 +2559,15 @@ def batch_isend_irecv(p2p_op_list):
     """
     _check_p2p_op_list(p2p_op_list)
     group = p2p_op_list[0].group
-    if group is None:
-        group = _get_default_group()
     device = p2p_op_list[0].tensor.device
-    if group._get_backend(device).supports_coalescing:
-        # backend support coalescing
+    if device.type == "cuda":
+        # NCCL style coalescing
         with _coalescing_manager(group, device, async_ops=True) as cm:
             for p2p_op in p2p_op_list:
                 p2p_op.op(p2p_op.tensor, p2p_op.peer, p2p_op.group, p2p_op.tag)
         return cm.works
     else:
-        # backend not support coalescing
+        # Backward support for Gloo
         reqs = []
         for p2p_op in p2p_op_list:
             work = p2p_op.op(p2p_op.tensor, p2p_op.peer, p2p_op.group, p2p_op.tag)
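
The public batch_isend_irecv API is unchanged by this hunk; only the dispatch reverts from the capability query back to a device-type check. A minimal two-rank usage sketch, assuming an NCCL build with one CUDA device per rank (launch with torchrun --nproc_per_node=2):

import torch
import torch.distributed as dist

dist.init_process_group("nccl")
rank = dist.get_rank()
world_size = dist.get_world_size()
device = torch.device("cuda", rank)
torch.cuda.set_device(device)

send_buf = torch.full((4,), float(rank), device=device)
recv_buf = torch.empty(4, device=device)
peer = (rank + 1) % world_size

ops = [
    dist.P2POp(dist.isend, send_buf, peer),
    dist.P2POp(dist.irecv, recv_buf, peer),
]
# CUDA tensors take the coalesced ("NCCL style") branch above; CPU/gloo
# tensors fall through to the per-op loop and return one request per op.
for work in dist.batch_isend_irecv(ops):
    work.wait()

dist.destroy_process_group()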
