Skip to content

Commit b635b01

Browse files
(PTFE-3162) Fix volume delete for scaleway (#712)
This pull request improves the reliability of deleting block storage volumes in the Scaleway backend by adding retry logic when volumes are temporarily stuck in the "in_use" state. It also adds comprehensive unit tests to ensure the new retry behavior works as expected in both success and failure scenarios. **Improvements to Scaleway block volume deletion:** * Added exponential backoff retry logic to the `delete` method in `scaleway.py` so that if a block storage volume is still "in_use" or marked as a "protected_resource", the system will retry deletion up to five times with increasing wait times between attempts. This handles cases where Scaleway takes time to detach volumes after server deletion. [[1]](diffhunk://#diff-ad3a8001c753a3db1894703fb801fbb1a9a010b659c4d24050ba74c194035cffR467-R471) [[2]](diffhunk://#diff-ad3a8001c753a3db1894703fb801fbb1a9a010b659c4d24050ba74c194035cffR480-R504) * Ensured that if all retries are exhausted and the volume still cannot be deleted, an appropriate warning is logged and the process continues gracefully. [[1]](diffhunk://#diff-ad3a8001c753a3db1894703fb801fbb1a9a010b659c4d24050ba74c194035cffR480-R504) [[2]](diffhunk://#diff-ad3a8001c753a3db1894703fb801fbb1a9a010b659c4d24050ba74c194035cffL492-L493) **Testing enhancements:** * Added `test_delete_volume_in_use_retry_succeeds` to verify that the retry logic successfully deletes a volume after temporary "in_use" errors. * Added `test_delete_volume_in_use_retry_exhausted` to verify that a warning is logged and the correct result is returned if the volume remains "in_use" after all retries.
2 parents a938a6c + f9abec7 commit b635b01

2 files changed

Lines changed: 156 additions & 36 deletions

File tree

runner_manager/backend/scaleway.py

Lines changed: 40 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
)
1919
from scaleway.instance.v1.custom_api import InstanceUtilsV1API
2020
from scaleway.marketplace.v2 import MarketplaceV2API
21+
from scaleway_core.api import ScalewayException
2122

2223
from runner_manager.backend.base import BaseBackend
2324
from runner_manager.models.backend import (
@@ -464,37 +465,51 @@ def delete(self, runner: Runner) -> int:
464465
for volume_id in volume_ids:
465466
try:
466467
# First, try to delete using Block Storage API (for sbs_volume)
467-
try:
468-
self.block_client.delete_volume(
469-
zone=self.config.zone,
470-
volume_id=volume_id,
471-
)
472-
log.info(
473-
f"Block storage volume {volume_id} deleted successfully"
474-
)
475-
except Exception as block_error:
476-
block_error_msg = str(block_error)
477-
# If volume not found in Block API, try Instance API (for l_ssd volumes)
478-
if (
479-
"404" in block_error_msg
480-
or "not_found" in block_error_msg.lower()
481-
):
482-
log.debug(
483-
f"Volume {volume_id} not found in Block API, trying Instance API"
484-
)
485-
self.client.delete_volume(
468+
# Retry with exponential backoff because SBS volumes can remain in
469+
# "in_use" status briefly after server deletion while Scaleway detaches them.
470+
max_attempts = 5
471+
deleted = False
472+
for attempt in range(max_attempts):
473+
try:
474+
self.block_client.delete_volume(
486475
zone=self.config.zone,
487476
volume_id=volume_id,
488477
)
489478
log.info(
490-
f"Instance volume {volume_id} deleted successfully"
479+
f"Block storage volume {volume_id} deleted successfully"
491480
)
492-
else:
493-
raise block_error
494-
except Exception as vol_error:
495-
error_msg = str(vol_error)
481+
deleted = True
482+
break
483+
except ScalewayException as block_error:
484+
if block_error.status_code == 404:
485+
# Not a block storage volume, fall through to Instance API
486+
break
487+
if (
488+
block_error.status_code == 412
489+
and attempt + 1 < max_attempts
490+
):
491+
wait = 2**attempt
492+
log.debug(
493+
f"Volume {volume_id} still in_use, retrying in {wait}s "
494+
f"(attempt {attempt + 1}/{max_attempts})"
495+
)
496+
time.sleep(wait)
497+
else:
498+
raise block_error
499+
500+
if not deleted:
501+
# Volume not found in Block API, try Instance API (for l_ssd volumes)
502+
log.debug(
503+
f"Volume {volume_id} not found in Block API, trying Instance API"
504+
)
505+
self.client.delete_volume(
506+
zone=self.config.zone,
507+
volume_id=volume_id,
508+
)
509+
log.info(f"Instance volume {volume_id} deleted successfully")
510+
except ScalewayException as vol_error:
496511
# Volume may already be deleted automatically (especially l_ssd volumes)
497-
if "404" in error_msg or "not_found" in error_msg.lower():
512+
if vol_error.status_code == 404:
498513
log.info(
499514
f"Volume {volume_id} not found - may have been auto-deleted with server or already cleaned up"
500515
)

tests/unit/backend/test_scaleway.py

Lines changed: 116 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import pytest
55
from redis_om import NotFoundError
66
from scaleway.instance.v1 import ServerState
7+
from scaleway_core.api import ScalewayException
78

89
from runner_manager.backend.scaleway import ScalewayBackend
910
from runner_manager.models.backend import (
@@ -15,6 +16,14 @@
1516
from runner_manager.models.runner_group import RunnerGroup
1617

1718

19+
def make_scaleway_error(status_code: int, text: str = "") -> ScalewayException:
20+
"""Create a ScalewayException with a given HTTP status code for use in tests."""
21+
response = MagicMock()
22+
response.status_code = status_code
23+
response.text = text
24+
return ScalewayException(response=response)
25+
26+
1827
@pytest.fixture()
1928
def scaleway_group(settings) -> RunnerGroup:
2029
"""Create a runner group with Scaleway backend configuration."""
@@ -78,8 +87,8 @@ def fake_scaleway_group(scaleway_group, monkeypatch):
7887
# By default, simulate that volumes are not found in Block API (like l_ssd volumes)
7988
# This will trigger fallback to Instance API, maintaining compatibility with existing tests
8089
mock_block_client = MagicMock()
81-
mock_block_client.delete_volume.side_effect = Exception(
82-
"404 not_found: Volume not in Block API"
90+
mock_block_client.delete_volume.side_effect = make_scaleway_error(
91+
404, "not_found: Volume not in Block API"
8392
)
8493

8594
# Patch the client property
@@ -406,7 +415,9 @@ def test_delete_with_volume_error(
406415

407416
mock_client = backend.client
408417
mock_client.get_server.return_value = MagicMock(server=mock_server)
409-
mock_client.delete_volume.side_effect = Exception("Volume deletion failed")
418+
mock_client.delete_volume.side_effect = make_scaleway_error(
419+
500, "Volume deletion failed"
420+
)
410421

411422
# Restore wait_for_server_state mock
412423
def mock_wait(self, server_id, target_state, timeout=300):
@@ -439,7 +450,7 @@ def test_delete_with_volume_not_found_404(
439450

440451
mock_client = backend.client
441452
mock_client.get_server.return_value = MagicMock(server=mock_server)
442-
mock_client.delete_volume.side_effect = Exception("Error 404: Volume not found")
453+
mock_client.delete_volume.side_effect = make_scaleway_error(404, "Volume not found")
443454

444455
# Restore wait_for_server_state mock
445456
def mock_wait(self, server_id, target_state, timeout=300):
@@ -573,8 +584,8 @@ def test_delete_with_volume_fallback_to_instance_api(
573584

574585
# Mock Block Storage API client to return 404 (volume not found in Block API)
575586
mock_block_client = MagicMock()
576-
mock_block_client.delete_volume.side_effect = Exception(
577-
"404 not_found: Volume not found in Block API"
587+
mock_block_client.delete_volume.side_effect = make_scaleway_error(
588+
404, "not_found: Volume not found in Block API"
578589
)
579590

580591
# Patch both clients
@@ -974,12 +985,14 @@ def test_delete_with_block_api_non_404_error(
974985
mock_client.delete_server.return_value = None
975986
mock_client.server_action.return_value = None
976987
# Instance API also fails with permission error
977-
mock_client.delete_volume.side_effect = Exception("Permission denied")
988+
mock_client.delete_volume.side_effect = make_scaleway_error(
989+
403, "Permission denied"
990+
)
978991

979992
# Mock Block Storage API client - fails with permission error (not 404)
980993
mock_block_client = MagicMock()
981-
mock_block_client.delete_volume.side_effect = Exception(
982-
"Permission denied: insufficient permissions"
994+
mock_block_client.delete_volume.side_effect = make_scaleway_error(
995+
403, "Permission denied: insufficient permissions"
983996
)
984997

985998
# Patch both clients
@@ -1027,8 +1040,8 @@ def test_delete_with_block_api_no_fallback_on_permission_error(
10271040

10281041
# Mock Block Storage API client - fails with permission error (not 404)
10291042
mock_block_client = MagicMock()
1030-
mock_block_client.delete_volume.side_effect = Exception(
1031-
"403 Permission denied: insufficient permissions"
1043+
mock_block_client.delete_volume.side_effect = make_scaleway_error(
1044+
403, "Permission denied: insufficient permissions"
10321045
)
10331046

10341047
# Patch both clients
@@ -1053,3 +1066,95 @@ def mock_wait(self, server_id, target_state, timeout=300):
10531066
assert "Failed to delete volume test-volume-id" in caplog.text
10541067
assert "Permission denied" in caplog.text
10551068
assert result == 1
1069+
1070+
1071+
def test_delete_volume_in_use_retry_succeeds(
1072+
scaleway_runner, fake_scaleway_group, caplog, monkeypatch
1073+
):
1074+
"""Test that volume deletion retries when volume is in_use, and succeeds eventually."""
1075+
backend = fake_scaleway_group.backend
1076+
scaleway_runner.instance_id = "test-server-id"
1077+
scaleway_runner.save()
1078+
1079+
mock_volume = MagicMock()
1080+
mock_volume.id = "test-block-volume-id"
1081+
mock_server = MagicMock()
1082+
mock_server.id = "test-server-id"
1083+
mock_server.state = ServerState.STOPPED
1084+
mock_server.volumes = {"0": mock_volume}
1085+
1086+
mock_client = MagicMock()
1087+
mock_client.get_server.return_value = MagicMock(server=mock_server)
1088+
mock_client.delete_server.return_value = None
1089+
mock_client.server_action.return_value = None
1090+
1091+
# Fail twice with 412 (in_use), then succeed on 3rd attempt
1092+
in_use_error = make_scaleway_error(
1093+
412, "precondition is not respected: protected_resource"
1094+
)
1095+
mock_block_client = MagicMock()
1096+
mock_block_client.delete_volume.side_effect = [in_use_error, in_use_error, None]
1097+
1098+
monkeypatch.setattr(ScalewayBackend, "client", property(lambda self: mock_client))
1099+
monkeypatch.setattr(
1100+
ScalewayBackend, "block_client", property(lambda self: mock_block_client)
1101+
)
1102+
1103+
def mock_wait(self, server_id, target_state, timeout=300):
1104+
return mock_server
1105+
1106+
monkeypatch.setattr(ScalewayBackend, "wait_for_server_state", mock_wait)
1107+
monkeypatch.setattr("runner_manager.backend.scaleway.time.sleep", lambda _: None)
1108+
1109+
result = backend.delete(scaleway_runner)
1110+
1111+
assert mock_block_client.delete_volume.call_count == 3
1112+
assert (
1113+
"Block storage volume test-block-volume-id deleted successfully" in caplog.text
1114+
)
1115+
assert "Failed to delete volume" not in caplog.text
1116+
assert result == 1
1117+
1118+
1119+
def test_delete_volume_in_use_retry_exhausted(
1120+
scaleway_runner, fake_scaleway_group, caplog, monkeypatch
1121+
):
1122+
"""Test that a warning is logged when volume stays in_use after all retries."""
1123+
backend = fake_scaleway_group.backend
1124+
scaleway_runner.instance_id = "test-server-id"
1125+
scaleway_runner.save()
1126+
1127+
mock_volume = MagicMock()
1128+
mock_volume.id = "test-block-volume-id"
1129+
mock_server = MagicMock()
1130+
mock_server.id = "test-server-id"
1131+
mock_server.state = ServerState.STOPPED
1132+
mock_server.volumes = {"0": mock_volume}
1133+
1134+
mock_client = MagicMock()
1135+
mock_client.get_server.return_value = MagicMock(server=mock_server)
1136+
mock_client.delete_server.return_value = None
1137+
mock_client.server_action.return_value = None
1138+
1139+
in_use_error = make_scaleway_error(
1140+
412, "precondition is not respected: protected_resource"
1141+
)
1142+
mock_block_client = MagicMock()
1143+
mock_block_client.delete_volume.side_effect = in_use_error
1144+
1145+
monkeypatch.setattr(ScalewayBackend, "client", property(lambda self: mock_client))
1146+
monkeypatch.setattr(
1147+
ScalewayBackend, "block_client", property(lambda self: mock_block_client)
1148+
)
1149+
1150+
def mock_wait(self, server_id, target_state, timeout=300):
1151+
return mock_server
1152+
1153+
monkeypatch.setattr(ScalewayBackend, "wait_for_server_state", mock_wait)
1154+
monkeypatch.setattr("runner_manager.backend.scaleway.time.sleep", lambda _: None)
1155+
1156+
result = backend.delete(scaleway_runner)
1157+
1158+
assert mock_block_client.delete_volume.call_count == 5 # max_attempts
1159+
assert "Failed to delete volume test-block-volume-id" in caplog.text
1160+
assert result == 1

0 commit comments

Comments
 (0)