
Commit 2733555

zhaojuanmao authored and facebook-github-bot committed
replace all_gather with more efficient collective api _all_gather_base (pytorch#57769)
Summary:
Pull Request resolved: pytorch#57769

_all_gather_base avoids the copies that all_gather incurs, so it is more efficient.

Test Plan: unit test
Reviewed By: SciPioneer
Differential Revision: D28227193
fbshipit-source-id: ddd8590095a5b45676497a71ed792a457f9825c6
1 parent c58709b commit 2733555
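For context, a minimal sketch (not part of this commit) of where the savings come from: dist.all_gather fills a Python list of per-rank tensors that must then be stacked into one contiguous tensor, while dist._all_gather_base writes directly into a single preallocated flat buffer. Assumes an initialized process group; t is the local contribution.

    import torch
    import torch.distributed as dist

    def gather_stacked(t, world_size):
        # all_gather: one output tensor per rank, then an extra copy to stack them.
        out_list = [torch.empty_like(t) for _ in range(world_size)]
        dist.all_gather(out_list, t)
        return torch.stack(out_list, dim=0)

    def gather_flat(t, world_size):
        # _all_gather_base: the collective fills one flat buffer directly,
        # so no post-hoc stack/copy is needed.
        out = torch.empty(world_size * t.numel(), dtype=t.dtype, device=t.device)
        dist._all_gather_base(out, t)
        return out.view(world_size, *t.shape)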

File tree

3 files changed: +37 -11 lines


torch/csrc/distributed/c10d/init.cpp (+4)

@@ -1194,6 +1194,10 @@ that adds a prefix to each key inserted to the store.
           },
           py::arg("timeout") = ::c10d::kUnsetTimeout,
           py::arg("wait_all_ranks") = false,
+          py::call_guard<py::gil_scoped_release>())
+      .def(
+          "_get_backend_name",
+          &::c10d::ProcessGroup::getBackendName,
           py::call_guard<py::gil_scoped_release>());

   // base ProcessGroup::Options binding
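This binding exposes ProcessGroup::getBackendName to Python, which is what lets the SyncBatchNorm code below branch per backend. A hypothetical call site (_get_default_group is an internal accessor for the default group created by init_process_group):

    import torch.distributed as dist

    # Hypothetical usage sketch, assuming init_process_group has run.
    pg = dist.distributed_c10d._get_default_group()
    if pg._get_backend_name() == 'nccl':
        pass  # take the flat _all_gather_base path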

torch/nn/modules/_functions.py (+25 -9)

@@ -27,15 +27,31 @@ def forward(self, input, weight, bias, running_mean, running_var, eps, momentum,
         num_channels = input.shape[1]
         # C, C, 1 -> (2C + 1)
         combined = torch.cat([mean, invstd, count], dim=0)
-        # world_size * (2C + 1)
-        combined_list = [
-            torch.empty_like(combined) for k in range(world_size)
-        ]
-        # Use allgather instead of allreduce since I don't trust in-place operations ..
-        dist.all_gather(combined_list, combined, process_group, async_op=False)
-        combined = torch.stack(combined_list, dim=0)
-        # world_size * (2C + 1) -> world_size * C, world_size * C, world_size * 1
-        mean_all, invstd_all, count_all = torch.split(combined, num_channels, dim=1)
+        # Use allgather instead of allreduce because count could be different across
+        # ranks, and a simple allreduce op cannot give correct results.
+        # batch_norm_gather_stats_with_counts calculates global mean & invstd based on
+        # all gathered mean, invstd and count.
+        # For the NCCL backend, use the optimized version of all_gather.
+        if process_group._get_backend_name() == 'nccl':
+            # world_size * (2C + 1)
+            combined_size = combined.numel()
+            combined_flat = torch.empty(1,
+                                        combined_size * world_size,
+                                        dtype=combined.dtype,
+                                        device=combined.device)
+            dist._all_gather_base(combined_flat, combined, process_group, async_op=False)
+            combined = torch.reshape(combined_flat, (world_size, combined_size))
+            # world_size * (2C + 1) -> world_size * C, world_size * C, world_size * 1
+            mean_all, invstd_all, count_all = torch.split(combined, num_channels, dim=1)
+        else:
+            # world_size * (2C + 1)
+            combined_list = [
+                torch.empty_like(combined) for k in range(world_size)
+            ]
+            dist.all_gather(combined_list, combined, process_group, async_op=False)
+            combined = torch.stack(combined_list, dim=0)
+            # world_size * (2C + 1) -> world_size * C, world_size * C, world_size * 1
+            mean_all, invstd_all, count_all = torch.split(combined, num_channels, dim=1)

         # calculate global mean & invstd
         mean, invstd = torch.batch_norm_gather_stats_with_counts(
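To make the shape bookkeeping above concrete, a standalone sketch (made-up shapes, no process group) of the pack/gather/split sequence around the collective:

    import torch

    C, world_size = 4, 3           # assumed channel count and rank count
    mean = torch.randn(C)
    invstd = torch.randn(C)
    count = torch.tensor([32.0])   # per-rank count; may differ across ranks

    # C, C, 1 -> (2C + 1): one tensor handed to a single collective
    combined = torch.cat([mean, invstd, count], dim=0)

    # Stand-in for the gather: every rank contributes one (2C + 1) row
    gathered = combined.repeat(world_size, 1)    # (world_size, 2C + 1)

    # (world_size, 2C + 1) -> (world_size, C), (world_size, C), (world_size, 1)
    mean_all, invstd_all, count_all = torch.split(gathered, C, dim=1)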

torch/testing/_internal/distributed/distributed_test.py (+8 -2)

@@ -6456,7 +6456,10 @@ def test_ddp_sync_bn_training_vs_eval(self):

         # SyncBN allgathers stats across all ranks, so verify call to
         # all_gather in profiler.
-        all_gather_calls = get_profiling_event("all_gather", prof)
+        if BACKEND == 'nccl':
+            all_gather_calls = get_profiling_event("_all_gather_base", prof)
+        else:
+            all_gather_calls = get_profiling_event("all_gather", prof)
         self.assertNotEqual([], all_gather_calls)

         # Only do inference on one rank. If SyncBN did collective stats sync,

@@ -6472,7 +6475,10 @@ def test_ddp_sync_bn_training_vs_eval(self):
         loss.backward()

         # Ensure sync does not occur in eval() mode.
-        all_gather_calls = get_profiling_event("all_gather", prof)
+        if BACKEND == 'nccl':
+            all_gather_calls = get_profiling_event("_all_gather_base", prof)
+        else:
+            all_gather_calls = get_profiling_event("all_gather", prof)
         self.assertEqual([], all_gather_calls)

     @skip_if_lt_x_gpu(2)
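The test branches on BACKEND because the profiler records the collective under the name of the call actually issued. get_profiling_event is the test suite's own helper; an approximation of what such a lookup does:

    import torch

    def events_matching(name, prof):
        # Approximation of the test helper: keep profiler events whose
        # name contains the given substring.
        return [e for e in prof.function_events if name in e.name]

    # with torch.autograd.profiler.profile() as prof:
    #     model(inp)   # SyncBN forward issues the collective
    # calls = events_matching("_all_gather_base", prof)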
