Skip to content

Commit 38c18f3

Browse files
fduwjjpytorchmergebot
authored andcommitted
[c10d] Add a timeout check interval variable for timeout dump (pytorch#117093)
The current timeout check frequency is relied on monitoring thread's timeout thread which can be too long (even if we set it to 2mins) so let's use a separate timeout variable which users can configure it. And we only only let default PG to check TCPStore so even more frequent check should be fine. (Our stress test is performed on every half second). Pull Request resolved: pytorch#117093 Approved by: https://github.com/wconstab, https://github.com/kwen2501
1 parent 003c900 commit 38c18f3

File tree

2 files changed

+12
-1
lines changed

2 files changed

+12
-1
lines changed

torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp

+3-1
Original file line numberDiff line numberDiff line change
@@ -770,6 +770,7 @@ ProcessGroupNCCL::ProcessGroupNCCL(
770770
getCvarInt(TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC, 60 * 10 /*10 Mins*/);
771771
waitTimeoutDumpInMilSec_ =
772772
getCvarInt(TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC, 2000);
773+
coordCheckIntervalMilSec_ = getCvarInt(TORCH_NCCL_COORD_CHECK_MILSEC, 1000);
773774
ncclTraceBufferSize_ = getCvarInt(TORCH_NCCL_TRACE_BUFFER_SIZE, 0);
774775
enableCollecticeHashDebug_ = (dist_debug_level_ >= DebugLevel::Detail);
775776
// store_ usually is wrapped with PrefixStore and the prefix is different
@@ -859,6 +860,7 @@ ProcessGroupNCCL::ProcessGroupNCCL(
859860
<< monitorThreadEnabled_.load()
860861
<< ", TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: " << heartbeatTimeoutInSec_
861862
<< ", TORCH_NCCL_TRACE_BUFFER_SIZE: " << ncclTraceBufferSize_
863+
<< ", TORCH_NCCL_COORD_CHECK_MILSEC: " << coordCheckIntervalMilSec_
862864
<< ", ID=" << this->getID();
863865

864866
if (options_->global_ranks_in_group.empty()) {
@@ -1549,7 +1551,7 @@ void ProcessGroupNCCL::watchdogHandler() {
15491551
(currentTime - lastTimePollStore))
15501552
.count();
15511553
if (timeSinceLastWorkListUpdate >= kWatchdogThreadSleepMillis &&
1552-
timeSinceLastPollStore >= heartbeatTimeoutInSec_ * 1000) {
1554+
timeSinceLastPollStore >= coordCheckIntervalMilSec_) {
15531555
lastTimePollStore = currentTime;
15541556
if (globalStore_->check({std::string(TIMEOUT_DUMP)}) &&
15551557
!optAsyncDebugDump) {

torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp

+9
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,11 @@ static std::vector<std::string> TORCH_NCCL_TRACE_BUFFER_SIZE = {
8989
static std::vector<std::string> TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC = {
9090
"TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC"};
9191

92+
// Control the interval inside the watchdog thread to check the coordinated
93+
// signal from other ranks, e.g. to dump the debugging information.
94+
static std::vector<std::string> TORCH_NCCL_COORD_CHECK_MILSEC = {
95+
"TORCH_NCCL_COORD_CHECK_MILSEC"};
96+
9297
constexpr const char* NCCL_BACKEND_NAME = "nccl";
9398

9499
constexpr const char* TIMEOUT_DUMP = "timeout_dump";
@@ -853,6 +858,10 @@ class TORCH_API ProcessGroupNCCL : public Backend {
853858
// Extra time of sleep when waiting for timeout dump to finish.
854859
int waitTimeoutDumpInMilSec_;
855860

861+
// Interval of check coordinated signals in ProcessGroupNCCL from other ranks
862+
// e.g., trigger the dump of the debugging info for timeout when notified.
863+
int coordCheckIntervalMilSec_;
864+
856865
// Size of ring buffer where we store NCCL Traces for debugging.
857866
int ncclTraceBufferSize_;
858867

0 commit comments

Comments
 (0)