[c10d][ez] Add comments to CudaEventCache class (pytorch#134172)

fduwjj · pytorchmergebot · commit e7929809f31b · 2024-08-22T22:44:12.000Z
Pull Request resolved: pytorch#134172 Approved by: https://github.com/d4l3k, https://github.com/kwen2501
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -763,6 +763,10 @@ void ProcessGroupNCCL::WorkNCCL::abort() {
 
 ProcessGroupNCCL::CUDAEventCache::CUDAEventCache() {}
 
+// A CUDA event is used to record the start/end of one Work.
+// Instead of letting the CUDA event get destroyed, we now reuse it after the
+// Work has been erased from workMetaList_.
+// This is to avoid the potential deadlock caused by cudaEventDestroy.
 std::shared_ptr<at::cuda::CUDAEvent> ProcessGroupNCCL::CUDAEventCache::create(
     bool timing) {
   auto deleter = [this, timing](at::cuda::CUDAEvent* event) {