
Commit 18955d3

mrshenli authored and facebook-github-bot committed
Raise warning when calling collectives on non-member group objects (pytorch#67639)
Summary: Pull Request resolved: pytorch#67639

Due to BC considerations, we cannot directly error out, as that might break existing applications. Raise warnings first to improve debuggability.

cc pietern mrshenli pritamdamania87 zhaojuanmao satgera rohan-varma gqchen aazzolini osalpekar jiayisuse SciPioneer H-Huang

Test Plan: Imported from OSS

Reviewed By: rohan-varma

Differential Revision: D32075151

Pulled By: mrshenli

fbshipit-source-id: 5680d420f5f6cd3f74a36616c03350e8a976b363
1 parent 54241a9 commit 18955d3
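For context (illustration only, not part of this commit): after this change, a process whose global rank is not a member of the ProcessGroup passed to a collective still returns early exactly as before, but it now also emits a UserWarning naming the op and the global rank. A minimal two-process sketch of the observable behavior; the script name, the gloo backend, and the torchrun launch are assumptions made for this example:

# warn_demo.py -- hypothetical example, not part of this commit
import torch
import torch.distributed as dist

def main():
    # Assumes the usual env:// variables set by a launcher such as
    # `torchrun --nproc_per_node=2 warn_demo.py`.
    dist.init_process_group(backend="gloo")
    group = dist.new_group(ranks=[0])  # only global rank 0 is a member

    t = torch.zeros(2)
    # On rank 1 this call is still a no-op that returns immediately, but after
    # this commit it also warns, e.g.:
    #   UserWarning: Running all_reduce on global rank 1 which does not
    #   belong to the given group.
    dist.all_reduce(t, group=group)

    dist.destroy_process_group()

if __name__ == "__main__":
    main()

The test `_test_warn_not_in_group` added below exercises the same split between member and non-member ranks.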

File tree

4 files changed: +87 / -0 lines changed


test/distributed/test_c10d_common.py

Lines changed: 30 additions & 0 deletions
@@ -656,6 +656,36 @@ def _test_sequence_num_set_new_group(self, backend):
         dist.all_gather_object(obj_list, subgroup_seq, group=subgroup)
         self.assertEqual(len(set(obj_list)), 1)
 
+    def _test_warn_not_in_group(self, backend):
+        store = dist.FileStore(self.file_name, self.world_size)
+        dist.init_process_group(
+            backend,
+            world_size=self.world_size,
+            rank=self.rank,
+            store=store,
+        )
+        in_group_ranks = list(filter(lambda x: x % 2 == 0, range(self.world_size)))
+        group = dist.new_group(in_group_ranks)
+
+        x = torch.zeros(2, 2).cuda(self.rank)
+        xs = [torch.zeros(2, 2).cuda(self.rank) for _ in range(len(in_group_ranks))]
+        if self.rank not in in_group_ranks:
+            msg = ".*{}.*does not belong to.*"
+            with self.assertWarnsOnceRegex(UserWarning, msg.format("all_gather")):
+                dist.all_gather(xs, x, group=group)
+            with self.assertWarnsOnceRegex(UserWarning, msg.format("all_reduce")):
+                dist.all_reduce(x, group=group)
+            with self.assertWarnsOnceRegex(UserWarning, msg.format("barrier")):
+                dist.barrier(group=group)
+            with self.assertWarnsOnceRegex(UserWarning, msg.format("broadcast")):
+                dist.broadcast(x, src=0, group=group)
+        else:
+            dist.all_gather(xs, x, group=group)
+            dist.all_reduce(x, group=group)
+            dist.barrier(group=group)
+            dist.broadcast(x, src=0, group=group)
+
+
 class CommTest(AbstractCommTest, MultiProcessTestCase):
     def setUp(self):
         super(CommTest, self).setUp()

test/distributed/test_c10d_gloo.py

Lines changed: 5 additions & 0 deletions
@@ -2320,6 +2320,11 @@ def test_gloo_barrier_device_ids(self):
         with self.assertRaisesRegex(RuntimeError, "device_ids not supported"):
             c10d.barrier(device_ids=[self.rank])
 
+    @skip_if_lt_x_gpu(2)
+    @requires_gloo()
+    def test_gloo_warn_not_in_group(self):
+        self._test_warn_not_in_group(backend="gloo")
+
 
 if __name__ == "__main__":
     assert (

test/distributed/test_c10d_nccl.py

Lines changed: 17 additions & 0 deletions
@@ -2629,6 +2629,23 @@ def test_nccl_barrier_device_ids_function_argument(self):
         with self.assertRaisesRegex(RuntimeError, "Invalid function argument"):
             c10d.barrier(device_ids=self.rank)
 
+    @requires_nccl()
+    @skip_if_lt_x_gpu(2)
+    @with_dist_debug_levels(levels=["DETAIL"])
+    def test_nccl_warn_not_in_group_debug_detail(self):
+        self._test_warn_not_in_group(backend="nccl")
+
+    @requires_nccl()
+    @skip_if_lt_x_gpu(2)
+    @with_dist_debug_levels(levels=["INFO"])
+    def test_nccl_warn_not_in_group_debug_info(self):
+        self._test_warn_not_in_group(backend="nccl")
+
+    @requires_nccl()
+    @skip_if_lt_x_gpu(2)
+    @with_dist_debug_levels(levels=["OFF"])
+    def test_nccl_warn_not_in_group_debug_off(self):
+        self._test_warn_not_in_group(backend="nccl")
 
 if __name__ == "__main__":
     assert (

torch/distributed/distributed_c10d.py

Lines changed: 35 additions & 0 deletions
@@ -271,6 +271,14 @@ def _rank_not_in_group(group: ProcessGroup):
     return group == GroupMember.NON_GROUP_MEMBER
 
 
+def _warn_not_in_group(op_name):
+    global_rank = -1 if GroupMember.WORLD is None else GroupMember.WORLD.rank()
+    warnings.warn(
+        f"Running {op_name} on global rank {global_rank} which does not "
+        "belong to the given group."
+    )
+
+
 def _get_group_rank(group: ProcessGroup, rank):
     """
     Helper that gets a given group's local rank in the group from a given global

@@ -879,6 +887,7 @@ def isend(tensor, dst, group=None, tag=0):
     """
     _check_single_tensor(tensor, "tensor")
     if _rank_not_in_group(group):
+        _warn_not_in_group("isend")
         return
 
     if group is None or group is GroupMember.WORLD:

@@ -908,6 +917,7 @@ def irecv(tensor, src=None, group=None, tag=0):
     """
     _check_single_tensor(tensor, "tensor")
     if _rank_not_in_group(group):
+        _warn_not_in_group("irecv")
         return
 
     if group is None or group is GroupMember.WORLD:

@@ -939,6 +949,7 @@ def send(tensor, dst, group=None, tag=0):
     """
     _check_single_tensor(tensor, "tensor")
     if _rank_not_in_group(group):
+        _warn_not_in_group("send")
         return
 
     if group is None or group is GroupMember.WORLD:

@@ -968,6 +979,7 @@ def recv(tensor, src=None, group=None, tag=0):
     """
     _check_single_tensor(tensor, "tensor")
     if _rank_not_in_group(group):
+        _warn_not_in_group("recv")
         return -1
 
     if group is None:

@@ -1119,6 +1131,7 @@ def broadcast_multigpu(tensor_list, src, group=None, async_op=False, src_tensor=0):
 
     """
     if _rank_not_in_group(group):
+        _warn_not_in_group("broadcast_multigpu")
         return
 
     opts = BroadcastOptions()

@@ -1160,6 +1173,7 @@ def broadcast(tensor, src, group=None, async_op=False):
     """
     _check_single_tensor(tensor, "tensor")
     if _rank_not_in_group(group):
+        _warn_not_in_group("broadcast")
         return
 
     opts = BroadcastOptions()

@@ -1283,6 +1297,7 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=None, async_op=False):
     """
     _check_single_tensor(tensor, "tensor")
     if _rank_not_in_group(group):
+        _warn_not_in_group("all_reduce")
         return
 
     if tensor.is_complex():

@@ -1339,6 +1354,7 @@ def all_reduce_coalesced(tensors, op=ReduceOp.SUM, group=None, async_op=False):
     """
     _check_tensor_list(tensors, "tensor")
     if _rank_not_in_group(group):
+        _warn_not_in_group("all_reduce_coalesced")
         return
 
     if any([t.is_complex() for t in tensors]) and not supports_complex(op):

@@ -1394,6 +1410,7 @@ def reduce_multigpu(
 
     """
     if _rank_not_in_group(group):
+        _warn_not_in_group("reduce_multigpu")
         return
 
     opts = ReduceOptions()

@@ -1439,6 +1456,7 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, async_op=False):
     """
     _check_single_tensor(tensor, "tensor")
     if _rank_not_in_group(group):
+        _warn_not_in_group("reduce")
         return
 
     opts = ReduceOptions()

@@ -1505,6 +1523,7 @@ def all_gather_multigpu(
 
     """
     if _rank_not_in_group(group):
+        _warn_not_in_group("all_gather_multigpu")
         return
 
     output_tensor_lists = [

@@ -1591,6 +1610,7 @@ def all_gather_object(object_list, obj, group=None):
         ['foo', 12, {1: 2}]
     """
     if _rank_not_in_group(group):
+        _warn_not_in_group("all_gather_object")
         return
 
     input_tensor, local_size = _object_to_tensor(obj)

@@ -1684,6 +1704,7 @@ def gather_object(obj, object_gather_list=None, dst=0, group=None):
         ['foo', 12, {1: 2}]
     """
     if _rank_not_in_group(group):
+        _warn_not_in_group("gather_object")
         return
 
     # Ensure object_gather_list is specified appopriately.

@@ -1792,6 +1813,7 @@ def broadcast_object_list(object_list, src=0, group=None, device=None):
         ['foo', 12, {1: 2}]
     """
     if _rank_not_in_group(group):
+        _warn_not_in_group("broadcast_object_list")
         return
 
     my_rank = get_rank()

@@ -1903,6 +1925,7 @@ def scatter_object_list(
         [{1: 2}]
     """
     if _rank_not_in_group(group):
+        _warn_not_in_group("scatter_object_list")
         return
 
     if (

@@ -2003,6 +2026,7 @@ def all_gather(tensor_list, tensor, group=None, async_op=False):
     _check_tensor_list(tensor_list, "tensor_list")
     _check_single_tensor(tensor, "tensor")
     if _rank_not_in_group(group):
+        _warn_not_in_group("all_gather")
         return
 
     tensor_list = [

@@ -2062,6 +2086,7 @@ def _all_gather_base(output_tensor, input_tensor, group=None, async_op=False):
     _check_single_tensor(input_tensor, "input_tensor")
     _check_single_tensor(output_tensor, "output_tensor")
     if _rank_not_in_group(group):
+        _warn_not_in_group("_all_gather_base")
         return
 
     output_tensor = (

@@ -2136,6 +2161,7 @@ def all_gather_coalesced(
     # We only check basic compatibility with C++ params here, C++ code will
     # do shape and type checking.
     if _rank_not_in_group(group):
+        _warn_not_in_group("all_gather_coalesced")
         return
     _check_tensor_list(input_tensor_list, "tensor_list")
     if not isinstance(output_tensor_lists, list):

@@ -2206,6 +2232,7 @@ def gather(tensor, gather_list=None, dst=0, group=None, async_op=False):
         gather_list = []
 
     if _rank_not_in_group(group):
+        _warn_not_in_group("gather")
         return
 
     my_rank = get_rank()

@@ -2262,6 +2289,7 @@ def scatter(tensor, scatter_list=None, src=0, group=None, async_op=False):
         scatter_list = []
 
     if _rank_not_in_group(group):
+        _warn_not_in_group("scatter")
         return
     scatter_list = [
         t if not t.is_complex() else torch.view_as_real(t) for t in scatter_list

@@ -2347,6 +2375,7 @@ def reduce_scatter_multigpu(
 
     """
     if _rank_not_in_group(group):
+        _warn_not_in_group("reduce_scatter_multigpu")
         return
 
     opts = ReduceScatterOptions()

@@ -2383,6 +2412,7 @@ def reduce_scatter(output, input_list, op=ReduceOp.SUM, group=None, async_op=False):
     _check_single_tensor(output, "output")
     _check_tensor_list(input_list, "input_list")
     if _rank_not_in_group(group):
+        _warn_not_in_group("reduce_scatter")
         return
 
     opts = ReduceScatterOptions()

@@ -2420,6 +2450,7 @@ def _reduce_scatter_base(output, input, op=ReduceOp.SUM, group=None, async_op=False):
     _check_single_tensor(input, "input")
 
     if _rank_not_in_group(group):
+        _warn_not_in_group("_reduce_scatter_base")
         return
 
     opts = ReduceScatterOptions()

@@ -2534,6 +2565,7 @@ def all_to_all_single(
         tensor([4+4j, 8+8j, 12+12j, 16+16j]) # Rank 3
     """
     if _rank_not_in_group(group):
+        _warn_not_in_group("all_to_all_single")
         return
 
     opts = AllToAllOptions()

@@ -2655,6 +2687,7 @@ def all_to_all(output_tensor_list, input_tensor_list, group=None, async_op=False):
 
     """
     if _rank_not_in_group(group):
+        _warn_not_in_group("all_to_all")
         return
 
     opts = AllToAllOptions()

@@ -2700,6 +2733,7 @@ def barrier(group=GroupMember.WORLD, async_op=False, device_ids=None):
         None, if not async_op or if not part of the group
     """
     if _rank_not_in_group(group):
+        _warn_not_in_group("barrier")
         return
 
     opts = BarrierOptions()

@@ -2780,6 +2814,7 @@ def monitored_barrier(group=GroupMember.WORLD, timeout=None, wait_all_ranks=False):
     # Need to call rank not in group before using the group, otherwise
     # "Invalid process group" error is raised.
     if _rank_not_in_group(group):
+        _warn_not_in_group("monitored_barrier")
         return
 
     if get_backend(group) != Backend.GLOO:
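As the summary notes, the change stops at a warning because raising immediately could break existing applications. A project that wants the stricter behavior can escalate these warnings to errors on its own side with Python's standard warnings filter; a small sketch (this snippet is an assumption about user code, not part of the commit, and the regex simply mirrors the message built in `_warn_not_in_group` above):

import warnings

# Turn the non-member-group warnings introduced by this commit into hard
# errors within your own application code.
warnings.filterwarnings(
    "error",
    message=r"Running .* on global rank .* which does not belong to the given group\.",
    category=UserWarning,
)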
