Skip to content

Commit b881dd2

Browse files
JohnGarbuttrloo
authored andcommitted
Ironic: retry when node not available
After a baremetal instance is deleted, and its allocation is removed in placement, the ironic node might start cleaning. Eventually nova will notice and update the inventory to be reserved. During this window, a new instance may have already picked this ironic node. When that race happens today the build fails with an error: "Failed to reserve node ..." This change tries to ensure the remaining alternative hosts are attempted before aborting the build. Clearly the race is still there, but this makes it less painful. Related-Bug: #1974070 Change-Id: Ie5cdc17219c86927ab3769605808cb9d9fa9fa4d (cherry picked from commit 8a47606) (cherry picked from commit d71e9f6)
1 parent 19828f2 commit b881dd2

File tree

4 files changed

+70
-3
lines changed

4 files changed

+70
-3
lines changed

nova/compute/manager.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2708,7 +2708,8 @@ def _build_resources(self, context, instance, requested_networks,
27082708
block_device_mapping)
27092709
resources['block_device_info'] = block_device_info
27102710
except (exception.InstanceNotFound,
2711-
exception.UnexpectedDeletingTaskStateError):
2711+
exception.UnexpectedDeletingTaskStateError,
2712+
exception.ComputeResourcesUnavailable):
27122713
with excutils.save_and_reraise_exception():
27132714
self._build_resources_cleanup(instance, network_info)
27142715
except (exception.UnexpectedTaskStateError,

nova/tests/unit/compute/test_compute_mgr.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7667,6 +7667,42 @@ def test_failed_bdm_prep_from_delete_raises_unexpected(self, mock_clean,
76677667
mock_prepspawn.assert_called_once_with(self.instance)
76687668
mock_failedspawn.assert_called_once_with(self.instance)
76697669

7670+
@mock.patch.object(virt_driver.ComputeDriver, 'failed_spawn_cleanup')
7671+
@mock.patch.object(virt_driver.ComputeDriver, 'prepare_for_spawn')
7672+
@mock.patch.object(virt_driver.ComputeDriver,
7673+
'prepare_networks_before_block_device_mapping')
7674+
@mock.patch.object(virt_driver.ComputeDriver,
7675+
'clean_networks_preparation')
7676+
def test_failed_prepare_for_spawn(self, mock_clean, mock_prepnet,
7677+
mock_prepspawn, mock_failedspawn):
7678+
mock_prepspawn.side_effect = exception.ComputeResourcesUnavailable(
7679+
reason="asdf")
7680+
with mock.patch.object(self.compute,
7681+
'_build_networks_for_instance',
7682+
return_value=self.network_info
7683+
) as _build_networks_for_instance:
7684+
7685+
try:
7686+
with self.compute._build_resources(self.context, self.instance,
7687+
self.requested_networks, self.security_groups,
7688+
self.image, self.block_device_mapping,
7689+
self.resource_provider_mapping, self.accel_uuids):
7690+
pass
7691+
except Exception as e:
7692+
self.assertIsInstance(e,
7693+
exception.ComputeResourcesUnavailable)
7694+
7695+
_build_networks_for_instance.assert_has_calls(
7696+
[mock.call(self.context, self.instance,
7697+
self.requested_networks, self.security_groups,
7698+
self.resource_provider_mapping,
7699+
self.network_arqs)])
7700+
7701+
mock_prepnet.assert_not_called()
7702+
mock_clean.assert_called_once_with(self.instance, self.network_info)
7703+
mock_prepspawn.assert_called_once_with(self.instance)
7704+
mock_failedspawn.assert_called_once_with(self.instance)
7705+
76707706
@mock.patch.object(virt_driver.ComputeDriver, 'failed_spawn_cleanup')
76717707
@mock.patch.object(virt_driver.ComputeDriver, 'prepare_for_spawn')
76727708
@mock.patch.object(manager.ComputeManager, '_build_networks_for_instance')

nova/tests/unit/virt/ironic/test_driver.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2499,7 +2499,10 @@ def test_ironicclient_bad_response(self, mock_error):
24992499

25002500
@mock.patch.object(cw.IronicClientWrapper, 'call')
25012501
def test_prepare_for_spawn(self, mock_call):
2502-
node = ironic_utils.get_test_node(driver='fake')
2502+
node = ironic_utils.get_test_node(
2503+
driver='fake', instance_uuid=None,
2504+
provision_state=ironic_states.AVAILABLE,
2505+
power_state=ironic_states.POWER_OFF)
25032506
self.mock_conn.get_node.return_value = node
25042507
instance = fake_instance.fake_instance_obj(self.ctx,
25052508
node=node.uuid)
@@ -2531,14 +2534,29 @@ def test_prepare_for_spawn_invalid_instance(self):
25312534
instance)
25322535

25332536
def test_prepare_for_spawn_conflict(self):
2534-
node = ironic_utils.get_test_node(driver='fake')
2537+
node = ironic_utils.get_test_node(
2538+
driver='fake', instance_uuid=None,
2539+
provision_state=ironic_states.AVAILABLE,
2540+
power_state=ironic_states.POWER_OFF)
25352541
self.mock_conn.get_node.return_value = node
25362542
self.mock_conn.update_node.side_effect = sdk_exc.ConflictException
25372543
instance = fake_instance.fake_instance_obj(self.ctx, node=node.id)
25382544
self.assertRaises(exception.InstanceDeployFailure,
25392545
self.driver.prepare_for_spawn,
25402546
instance)
25412547

2548+
def test_prepare_for_spawn_not_available(self):
2549+
node = ironic_utils.get_test_node(
2550+
driver='fake', instance_uuid=None,
2551+
provision_state=ironic_states.CLEANWAIT,
2552+
power_state=ironic_states.POWER_OFF)
2553+
self.mock_conn.get_node.return_value = node
2554+
self.mock_conn.update_node.side_effect = sdk_exc.ConflictException
2555+
instance = fake_instance.fake_instance_obj(self.ctx, node=node.id)
2556+
self.assertRaises(exception.ComputeResourcesUnavailable,
2557+
self.driver.prepare_for_spawn,
2558+
instance)
2559+
25422560
@mock.patch.object(ironic_driver.IronicDriver, '_cleanup_deploy')
25432561
def test_failed_spawn_cleanup(self, mock_cleanup):
25442562
node = ironic_utils.get_test_node(driver='fake')

nova/virt/ironic/driver.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,18 @@ def prepare_for_spawn(self, instance):
397397
_("Ironic node uuid not supplied to "
398398
"driver for instance %s.") % instance.uuid)
399399
node = self._get_node(node_uuid)
400+
401+
# Its possible this node has just moved from deleting
402+
# to cleaning. Placement will update the inventory
403+
# as all reserved, but this instance might have got here
404+
# before that happened, but after the previous allocation
405+
# got deleted. We trigger a re-schedule to another node.
406+
if (self._node_resources_used(node) or
407+
self._node_resources_unavailable(node)):
408+
msg = "Chosen ironic node %s is not available" % node_uuid
409+
LOG.info(msg, instance=instance)
410+
raise exception.ComputeResourcesUnavailable(reason=msg)
411+
400412
self._set_instance_id(node, instance)
401413

402414
def failed_spawn_cleanup(self, instance):

0 commit comments

Comments
 (0)