Skip to content

Commit 990d7c4

Browse files
raballew and claude committed
fix: retry Dial on transient UNAVAILABLE in Lease.handle_async
When the exporter briefly restarts, the Dial RPC may fail with UNAVAILABLE. Instead of immediately giving up, retry with exponential backoff bounded by the existing dial_timeout parameter. This mirrors the existing FAILED_PRECONDITION retry logic.

Fixes #242

Generated-By: Forge/20260416_202053_681470_11575359_i242
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent b28a17b commit 990d7c4

2 files changed

Lines changed: 89 additions & 1 deletion

File tree

python/packages/jumpstarter/jumpstarter/client/lease.py

Lines changed: 16 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -336,7 +336,22 @@ async def handle_async(self, stream):
336336
await sleep(delay)
337337
attempt += 1
338338
continue
339-
# Exporter went offline or lease ended - log and exit gracefully
339+
if e.code() == grpc.StatusCode.UNAVAILABLE:
340+
remaining = deadline - time.monotonic()
341+
if remaining <= 0:
342+
logger.warning(
343+
"Exporter unavailable and dial timeout (%.1fs) exceeded after %d attempts",
344+
self.dial_timeout, attempt + 1
345+
)
346+
return
347+
delay = min(base_delay * (2 ** attempt), max_delay, remaining)
348+
logger.debug(
349+
"Exporter unavailable, retrying Dial in %.1fs (attempt %d, %.1fs remaining)",
350+
delay, attempt + 1, remaining
351+
)
352+
await sleep(delay)
353+
attempt += 1
354+
continue
340355
if "permission denied" in str(e.details()).lower():
341356
self.lease_transferred = True
342357
logger.warning(

python/packages/jumpstarter/jumpstarter/client/lease_test.py

Lines changed: 73 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,12 +4,28 @@
44
from datetime import datetime, timedelta, timezone
55
from unittest.mock import AsyncMock, Mock, patch
66

7+
import grpc
78
import pytest
9+
from grpc.aio import AioRpcError
810
from rich.console import Console
911

1012
from jumpstarter.client.lease import Lease, LeaseAcquisitionSpinner
1113

1214

15+
class MockAioRpcError(AioRpcError):
16+
"""Mock gRPC error for testing that properly inherits from AioRpcError."""
17+
18+
def __init__(self, status_code, message=""):
19+
self._status_code = status_code
20+
self._message = message
21+
22+
def code(self):
23+
return self._status_code
24+
25+
def details(self):
26+
return self._message
27+
28+
1329
class TestLeaseAcquisitionSpinner:
1430
"""Test cases for LeaseAcquisitionSpinner class."""
1531

@@ -522,3 +538,60 @@ async def get_then_fail():
522538
callback.assert_called()
523539
_, remain_arg = callback.call_args[0]
524540
assert remain_arg == timedelta(0)
541+
542+
543+
class TestHandleAsyncUnavailableRetry:
544+
"""Tests for Lease.handle_async UNAVAILABLE retry behavior."""
545+
546+
def _make_lease_for_handle(self):
547+
lease = object.__new__(Lease)
548+
lease.name = "test-lease"
549+
lease.dial_timeout = 5.0
550+
lease.lease_transferred = False
551+
lease.tls_config = Mock()
552+
lease.grpc_options = {}
553+
lease.controller = Mock()
554+
return lease
555+
556+
@pytest.mark.anyio
557+
async def test_handle_async_retries_unavailable_then_succeeds(self):
558+
"""Dial returns UNAVAILABLE once then succeeds on retry."""
559+
lease = self._make_lease_for_handle()
560+
dial_call_count = 0
561+
562+
async def mock_dial(request):
563+
nonlocal dial_call_count
564+
dial_call_count += 1
565+
if dial_call_count == 1:
566+
raise MockAioRpcError(grpc.StatusCode.UNAVAILABLE, "temporarily unavailable")
567+
return Mock(router_endpoint="endpoint", router_token="token")
568+
569+
lease.controller.Dial = mock_dial
570+
571+
with patch("jumpstarter.client.lease.connect_router_stream") as mock_connect:
572+
mock_connect.return_value.__aenter__ = AsyncMock()
573+
mock_connect.return_value.__aexit__ = AsyncMock(return_value=False)
574+
stream = Mock()
575+
576+
await lease.handle_async(stream)
577+
578+
assert dial_call_count == 2
579+
580+
@pytest.mark.anyio
581+
async def test_handle_async_unavailable_exceeds_dial_timeout(self):
582+
"""Dial returns UNAVAILABLE until dial_timeout is exceeded."""
583+
lease = self._make_lease_for_handle()
584+
lease.dial_timeout = 0.5
585+
dial_call_count = 0
586+
587+
async def mock_dial(request):
588+
nonlocal dial_call_count
589+
dial_call_count += 1
590+
raise MockAioRpcError(grpc.StatusCode.UNAVAILABLE, "permanently unavailable")
591+
592+
lease.controller.Dial = mock_dial
593+
stream = Mock()
594+
595+
await lease.handle_async(stream)
596+
597+
assert dial_call_count >= 2

0 commit comments

Comments
 (0)