Skip to content

Commit 5cd25c3

Browse files
raballew and claude committed
fix: add inter-retry delay for UNAVAILABLE errors in StatusMonitor poll loop
Remove the `continue` statement from the UNAVAILABLE handler in _poll_loop so it falls through to the standard sleep block. Previously, UNAVAILABLE retries had no delay between attempts, so 10 retries could be exhausted in under 1ms -- far too fast to tolerate an exporter restart that takes several seconds. Now retries use the poll_interval sleep, making the 10-retry threshold span a meaningful duration.

Generated-By: Forge/20260416_202053_681470_11575359_i242
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 07aecfe commit 5cd25c3

2 files changed

Lines changed: 33 additions & 1 deletion

File tree

python/packages/jumpstarter/jumpstarter/client/status_monitor.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -406,7 +406,6 @@ async def _poll_loop(self): # noqa: C901
406406
logger.warning("GetStatus UNAVAILABLE %d times consecutively", unavailable_retries)
407407
else:
408408
logger.debug("GetStatus UNAVAILABLE (attempt %d), retrying...", unavailable_retries)
409-
continue
410409
elif e.code() == StatusCode.DEADLINE_EXCEEDED:
411410
# DEADLINE_EXCEEDED is a transient error (RPC timed out), not a
412411
# permanent connection loss. Keep polling - the shell's own timeout

python/packages/jumpstarter/jumpstarter/client/status_monitor_test.py

Lines changed: 33 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -810,6 +810,39 @@ async def test_wait_for_any_of_updates_status_message(self) -> None:
810810
assert monitor.status_message == "hook script exited with code 1"
811811

812812

813+
class TestStatusMonitorUnavailableRetryDelay:
814+
async def test_unavailable_retries_include_inter_retry_delay(self) -> None:
815+
"""Test that UNAVAILABLE retries sleep between attempts.
816+
817+
Without inter-retry delay, 10 UNAVAILABLE errors (which return
818+
near-instantly) would be exhausted in milliseconds, providing
819+
no time for an exporter to restart. The poll loop must sleep
820+
between UNAVAILABLE retries so the threshold spans a meaningful
821+
wall-clock duration.
822+
"""
823+
import time
824+
825+
retry_count = 10
826+
poll_interval = 0.05
827+
responses = [
828+
create_mock_rpc_error(StatusCode.UNAVAILABLE)
829+
for _ in range(retry_count)
830+
]
831+
stub = MockExporterStub(responses, repeat_last=False)
832+
monitor = StatusMonitor(stub, poll_interval=poll_interval)
833+
834+
start = time.monotonic()
835+
await monitor.start()
836+
elapsed = time.monotonic() - start
837+
838+
assert monitor.connection_lost
839+
minimum_expected = poll_interval * (retry_count - 1)
840+
assert elapsed >= minimum_expected, (
841+
f"UNAVAILABLE retries completed in {elapsed:.3f}s, "
842+
f"expected at least {minimum_expected:.3f}s with inter-retry delay"
843+
)
844+
845+
813846
class TestStatusMonitorPRIssues:
814847
"""Regression tests for issues reported during PR review of hooks feature."""
815848

0 commit comments

Comments
 (0)