We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent b319fa3 commit e792980Copy full SHA for e792980
torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -763,6 +763,10 @@ void ProcessGroupNCCL::WorkNCCL::abort() {
763
764
// Nothing to set up explicitly: every member is default-initialized.
ProcessGroupNCCL::CUDAEventCache::CUDAEventCache() = default;
765
766
+// CUDA event is used to record the start/end of one Work.
767
+// Instead of letting the CUDA event get destroyed, we now reuse it after the Work
768
+// has been erased from workMetaList_.
769
+// This is to avoid the potential deadlock caused by cudaEventDestroy.
770
std::shared_ptr<at::cuda::CUDAEvent> ProcessGroupNCCL::CUDAEventCache::create(
771
bool timing) {
772
auto deleter = [this, timing](at::cuda::CUDAEvent* event) {
0 commit comments