
Commit 1dd4657

Add XPU support for barrier (#96)
* Fix a GPU hang when `barrier` is the first collective call.
* Usage: the XPU package (built with COMPUTE_BACKEND=dpcpp) uses the XPU barrier; the CPU package uses the CPU barrier.
Parent: 43f28a7

File tree

6 files changed (+93, -0)

src/CMakeLists.txt (1 addition, 0 deletions)

```diff
@@ -10,6 +10,7 @@ target_compile_options(oneccl_bindings_for_pytorch PUBLIC -Wall

 if(COMPUTE_BACKEND STREQUAL "dpcpp")
   add_subdirectory(./gpu)
+  add_definitions (-DUSE_GPU)
 endif()

 target_include_directories(oneccl_bindings_for_pytorch PUBLIC ./)
```

src/ProcessGroupCCL.cpp (13 additions, 0 deletions)

```diff
@@ -349,6 +349,19 @@ c10::intrusive_ptr<C10D_Work> recv_any_source_xpu_(
 TORCH_LIBRARY_IMPL(c10d, XPU, m) {
   m.impl("recv_any_source_", recv_any_source_xpu_);
 }
+
+c10::intrusive_ptr<Work> barrier_xpu(
+    at::Tensor /* unused */,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    const std::vector<int64_t>& device_ids,
+    int64_t timeout) {
+  return process_group->getBackend(c10::DeviceType::XPU)
+      ->barrier(BarrierOptions{device_ids, std::chrono::milliseconds(timeout)});
+}
+
+TORCH_LIBRARY_IMPL(c10d, XPU, m) {
+  m.impl("barrier", barrier_xpu);
+}
 } // namespace ops
```
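This registration gives the c10d dispatcher an XPU kernel for the `barrier` op: `barrier_xpu` ignores the dummy tensor argument and forwards `device_ids` and the timeout to the backend's `barrier`. A minimal sketch of how the op is exercised from Python, assuming an XPU build and a launcher that has set the rendezvous environment as in `tests/test_barrier.py`:

```python
import torch.distributed as dist
import oneccl_bindings_for_pytorch  # registers the "ccl" backend

# Assumes RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT are already set
# (tests/test_barrier.py derives them from the PMI_* variables).
dist.init_process_group("ccl")

# On an XPU build this should resolve through the c10d dispatcher
# to the barrier_xpu op registered above.
dist.barrier()
```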

src/dispatch_stub.cpp (6 additions, 0 deletions)

```diff
@@ -548,7 +548,13 @@ c10::intrusive_ptr<ProcessGroupCCL::AsyncWorkCCL> DispatchStub::recv(std::vector

 c10::intrusive_ptr<ProcessGroupCCL::AsyncWorkCCL> DispatchStub::barrier(const BarrierOptions& opts,
                                                                         ProcessGroupCCL& pg_ccl) {
+#ifdef USE_GPU
+  std::cout << "Barrier: using xpu" << std::endl;
+  c10::DeviceType dev_type = c10::DeviceType::XPU;
+#else
+  std::cout << "Barrier: using cpu" << std::endl;
   c10::DeviceType dev_type = c10::DeviceType::CPU;
+#endif
   return get_ccl_stub(dev_type)->barrier_(opts, pg_ccl);
 }
```
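Note that the choice of barrier path is made at compile time: `USE_GPU` is defined by `src/CMakeLists.txt` only when `COMPUTE_BACKEND=dpcpp`, so an XPU package always dispatches to the XPU barrier and a CPU package to the CPU barrier, matching the usage note in the commit message. The `std::cout` messages make it easy to confirm which path a given build takes.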

src/gpu/dpcpp_ccl.cpp (32 additions, 0 deletions)

```diff
@@ -401,6 +401,9 @@ class XPUCCLStubs final: public DispatchStub {
       int tag,
       ProcessGroupCCL& pg) override;

+  c10::intrusive_ptr<ProcessGroupCCL::AsyncWorkCCL> barrier_(const BarrierOptions& opts,
+                                                             ProcessGroupCCL& pg) override;
+
   void destroy();
   void reset() override {}
   void runLoop();
@@ -1116,6 +1119,35 @@ c10::intrusive_ptr<ProcessGroupCCL::AsyncWorkCCL> XPUCCLStubs::recv_(std::vector
   return work;
 }

+c10::intrusive_ptr<ProcessGroupCCL::AsyncWorkCCL> XPUCCLStubs::barrier_(const BarrierOptions& opts,
+                                                                        ProcessGroupCCL& pg) {
+
+  c10::intrusive_ptr<AsyncBarrierWork> work = c10::make_intrusive<AsyncBarrierWork>();
+
+  if (pg.ccl_member_->ccl_comms.size() == 0) {
+    std::vector<at::Device> xpu_devices{at::Device(at::kXPU)};
+    const auto key = get_key_from_devs(xpu_devices);
+    get_ccl_comms(pg, key, xpu_devices);
+  }
+
+  auto& comms_map = pg.ccl_member_->ccl_comms;
+  for (auto iter = comms_map.begin(); iter != comms_map.end(); iter++) {
+    for (size_t i = 0; i < iter->second->comms.size(); i++) {
+      work->getEvents().emplace_back(
+        call_with_lock(c10d::ProcessGroupCCL::globalMutex, [&]() {
+          if (i < iter->second->streams.size()) {
+            CCL_CHECK(return ccl::barrier(iter->second->comms[i],
+                                          iter->second->streams[i]););
+          } else {
+            CCL_CHECK(return ccl::barrier(iter->second->comms[i]););
+          }
+        })
+      );
+    }
+  }
+  return work;
+}
+
 RegisterXPUMethods xpu_register;

 }
```
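The `ccl_comms.size() == 0` branch is the actual fix for the hang described in the commit message: `ccl::barrier` needs a communicator, and none exists yet when `barrier` is the first collective issued on the group. The method now creates the XPU communicators on demand, then issues one `ccl::barrier` per communicator (on its stream when one is available). A minimal sketch of the previously hanging pattern, assuming a two-rank `mpirun` launch with the rendezvous environment set as in `tests/test_barrier.py`:

```python
import torch.distributed as dist
import oneccl_bindings_for_pytorch  # registers the "ccl" backend

# Assumes RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT are provided by the launcher.
dist.init_process_group("ccl")

# barrier() as the very first collective: before this commit no CCL
# communicator existed yet and the call could hang on GPU; the new
# XPUCCLStubs::barrier_ creates the communicators on demand instead.
dist.barrier()
```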

tests/README.md (13 additions, 0 deletions)

````diff
@@ -23,6 +23,19 @@ For cross-nodes p2p test, run:
 mpiexec -host nodeA,nodeB -np 24 -ppn 12 python -u test_p2p_crossnodes.py --dist_url $NODE_IP --world_size 24
 ```

+## functionality validation of barrier
+For the CPU barrier, run:
+
+```bash
+mpirun -np 2 python test_barrier.py
+```
+
+For the XPU barrier (built with `COMPUTE_BACKEND=dpcpp`), run:
+
+```bash
+mpirun -np 2 python test_barrier.py --device xpu
+```
+
 ## broadcast/allreduce profiling
 To start the test_allreduce.py test, run:
````
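When the test runs, `DispatchStub::barrier` prints `Barrier: using xpu` or `Barrier: using cpu` depending on how the package was built, and each rank prints `Finish` once the barrier completes.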

tests/test_barrier.py (new file, 28 additions)

```python
import torch
import intel_extension_for_pytorch
import oneccl_bindings_for_pytorch
import torch.distributed as dist
import os
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--device', '-dev', type=str, default='cpu',
                    help='Device type to use: cpu, xpu')
args = parser.parse_args()

# Map the MPI launcher's PMI_* variables onto the variables
# torch.distributed expects for rendezvous.
os.environ['RANK'] = str(os.environ.get('PMI_RANK', 0))
os.environ['WORLD_SIZE'] = str(os.environ.get('PMI_SIZE', 1))
os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '29500'

dist.init_process_group("ccl")
rank = dist.get_rank()
size = dist.get_world_size()

if args.device == 'xpu':
    device = "xpu:{}".format(rank)
else:
    device = 'cpu'

print("Barrier using device: ", args.device)
dist.barrier()
print("Finish")
```
