Commit de928ad

Zuul authored and openstack-gerrit committed
Merge "Fix failed count for anti-affinity check" into stable/yoga
2 parents dee89dd + cd0403d commit de928ad

File tree: 5 files changed, +265 −14 lines changed


nova/compute/build_results.py

Lines changed: 8 additions & 0 deletions
@@ -24,3 +24,11 @@
 ACTIVE = 'active'  # Instance is running
 FAILED = 'failed'  # Instance failed to build and was not rescheduled
 RESCHEDULED = 'rescheduled'  # Instance failed to build, but was rescheduled
+# Instance failed by policy violation (such as affinity or anti-affinity)
+# and was not rescheduled. In this case, the node's failed count won't be
+# increased.
+FAILED_BY_POLICY = 'failed_by_policy'
+# Instance failed by policy violation (such as affinity or anti-affinity)
+# but was rescheduled. In this case, the node's failed count won't be
+# increased.
+RESCHEDULED_BY_POLICY = 'rescheduled_by_policy'
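
The point of the two new constants is that only genuine build failures should feed the compute node's failed_builds statistic. A minimal sketch of that branching, assuming a manager object with the existing _build_failed/_build_succeeded helpers (illustrative only, not the patch itself):

from nova.compute import build_results

def record_build_outcome(manager, node, result):
    # Policy violations (affinity/anti-affinity) are not the node's fault,
    # so they must not bump the failed_builds counter.
    if result in (build_results.FAILED_BY_POLICY,
                  build_results.RESCHEDULED_BY_POLICY):
        return
    if result in (build_results.FAILED, build_results.RESCHEDULED):
        manager._build_failed(node)
    else:
        manager._build_succeeded(node)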

nova/compute/manager.py

Lines changed: 24 additions & 9 deletions
@@ -1804,11 +1804,8 @@ def _do_validation(context, instance, group):
                 else:
                     max_server = 1
                 if len(members_on_host) >= max_server:
-                    msg = _("Anti-affinity instance group policy "
-                            "was violated.")
-                    raise exception.RescheduledException(
-                        instance_uuid=instance.uuid,
-                        reason=msg)
+                    raise exception.GroupAffinityViolation(
+                        instance_uuid=instance.uuid, policy='Anti-affinity')

             # NOTE(ganso): The check for affinity below does not work and it
             # can easily be violated because the lock happens in different
@@ -1818,10 +1815,8 @@ def _do_validation(context, instance, group):
             elif group.policy and 'affinity' == group.policy:
                 group_hosts = group.get_hosts(exclude=[instance.uuid])
                 if group_hosts and self.host not in group_hosts:
-                    msg = _("Affinity instance group policy was violated.")
-                    raise exception.RescheduledException(
-                        instance_uuid=instance.uuid,
-                        reason=msg)
+                    raise exception.GroupAffinityViolation(
+                        instance_uuid=instance.uuid, policy='Affinity')

         _do_validation(context, instance, group)

@@ -2256,6 +2251,9 @@ def _locked_do_build_and_run_instance(*args, **kwargs):
                     self.reportclient.delete_allocation_for_instance(
                         context, instance.uuid, force=True)

+                if result in (build_results.FAILED_BY_POLICY,
+                              build_results.RESCHEDULED_BY_POLICY):
+                    return
                 if result in (build_results.FAILED,
                               build_results.RESCHEDULED):
                     self._build_failed(node)
@@ -2354,6 +2352,8 @@ def _do_build_and_run_instance(self, context, instance, image,
                 self._nil_out_instance_obj_host_and_node(instance)
                 self._set_instance_obj_error_state(instance,
                                                    clean_task_state=True)
+                if isinstance(e, exception.RescheduledByPolicyException):
+                    return build_results.FAILED_BY_POLICY
                 return build_results.FAILED
             LOG.debug(e.format_message(), instance=instance)
             # This will be used for logging the exception
@@ -2380,6 +2380,10 @@ def _do_build_and_run_instance(self, context, instance, image,
                     injected_files, requested_networks, security_groups,
                     block_device_mapping, request_spec=request_spec,
                     host_lists=[host_list])
+
+            if isinstance(e, exception.RescheduledByPolicyException):
+                return build_results.RESCHEDULED_BY_POLICY
+
             return build_results.RESCHEDULED
         except (exception.InstanceNotFound,
                 exception.UnexpectedDeletingTaskStateError):
@@ -2597,6 +2601,17 @@ def _build_and_run_instance(self, context, instance, image, injected_files,
                 bdms=block_device_mapping)
             raise exception.BuildAbortException(instance_uuid=instance.uuid,
                                                 reason=e.format_message())
+        except exception.GroupAffinityViolation as e:
+            LOG.exception('Failed to build and run instance',
+                          instance=instance)
+            self._notify_about_instance_usage(context, instance,
+                                              'create.error', fault=e)
+            compute_utils.notify_about_instance_create(
+                context, instance, self.host,
+                phase=fields.NotificationPhase.ERROR, exception=e,
+                bdms=block_device_mapping)
+            raise exception.RescheduledByPolicyException(
+                instance_uuid=instance.uuid, reason=str(e))
         except Exception as e:
             LOG.exception('Failed to build and run instance',
                           instance=instance)
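
Read together, the manager changes form a chain: the late group-policy check raises GroupAffinityViolation, _build_and_run_instance converts it into RescheduledByPolicyException, _do_build_and_run_instance maps that onto the new *_BY_POLICY results, and _locked_do_build_and_run_instance then returns before _build_failed() runs. A small self-contained sketch of that flow (plain Python, not Nova code):

FAILED, RESCHEDULED = 'failed', 'rescheduled'
FAILED_BY_POLICY = 'failed_by_policy'
RESCHEDULED_BY_POLICY = 'rescheduled_by_policy'

class GroupAffinityViolation(Exception):
    pass

def do_build_and_run(violates_policy, can_reschedule):
    # Stands in for _do_build_and_run_instance: a policy violation becomes a
    # *_BY_POLICY result instead of plain FAILED/RESCHEDULED.
    try:
        if violates_policy:
            raise GroupAffinityViolation()
        return 'active'
    except GroupAffinityViolation:
        return RESCHEDULED_BY_POLICY if can_reschedule else FAILED_BY_POLICY

failed_builds = 0
result = do_build_and_run(violates_policy=True, can_reschedule=False)
# Stands in for _locked_do_build_and_run_instance: policy results return early.
if result not in (FAILED_BY_POLICY, RESCHEDULED_BY_POLICY):
    if result in (FAILED, RESCHEDULED):
        failed_builds += 1
assert failed_builds == 0  # the host is not penalized for the violation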

nova/exception.py

Lines changed: 9 additions & 0 deletions
@@ -1477,6 +1477,15 @@ class RescheduledException(NovaException):
                 "%(reason)s")


+class RescheduledByPolicyException(RescheduledException):
+    msg_fmt = _("Build of instance %(instance_uuid)s was re-scheduled: "
+                "%(reason)s")
+
+
+class GroupAffinityViolation(NovaException):
+    msg_fmt = _("%(policy)s instance group policy was violated")
+
+
 class InstanceFaultRollback(NovaException):
     def __init__(self, inner_exception=None):
         message = _("Instance rollback performed due to: %s")
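
Like other NovaException subclasses, both new classes build their message by interpolating the keyword arguments into msg_fmt, and RescheduledByPolicyException keeps the RescheduledException lineage so existing reschedule handlers still catch it. An illustrative use, assuming the patched nova.exception module is importable (the UUID is a placeholder):

from nova import exception

violation = exception.GroupAffinityViolation(
    instance_uuid='11111111-2222-3333-4444-555555555555',
    policy='Anti-affinity')
# format_message() interpolates the kwargs into msg_fmt:
# "Anti-affinity instance group policy was violated"
print(violation.format_message())

# _build_and_run_instance wraps the violation so callers can tell a
# policy-driven reschedule apart from an ordinary RescheduledException.
wrapped = exception.RescheduledByPolicyException(
    instance_uuid='11111111-2222-3333-4444-555555555555',
    reason=str(violation))
assert isinstance(wrapped, exception.RescheduledException)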

nova/tests/functional/test_server_group.py

Lines changed: 80 additions & 0 deletions
@@ -19,6 +19,7 @@
 from nova.compute import instance_actions
 from nova import context
 from nova.db.main import api as db
+from nova import objects
 from nova import test
 from nova.tests import fixtures as nova_fixtures
 from nova.tests.functional.api import client
@@ -494,6 +495,85 @@ def test_soft_affinity_not_supported(self):
         self.assertIn('Invalid input', ex.response.text)
         self.assertIn('soft-affinity', ex.response.text)

+    @mock.patch('nova.scheduler.filters.affinity_filter.'
+                'ServerGroupAffinityFilter.host_passes', return_value=True)
+    def test_failed_count_with_affinity_violation(self, mock_host_passes):
+        """Check failed count not incremented after violation of the late
+        affinity check. https://bugs.launchpad.net/nova/+bug/1996732
+        """
+
+        created_group = self.api.post_server_groups(self.affinity)
+        flavor = self.api.get_flavors()[2]
+
+        # Ensure the first instance is on compute1
+        with utils.temporary_mutation(self.admin_api, microversion='2.53'):
+            compute2_service_id = self.admin_api.get_services(
+                host=self.compute2.host, binary='nova-compute')[0]['id']
+            self.admin_api.put_service(compute2_service_id,
+                                       {'status': 'disabled'})
+
+        self._boot_a_server_to_group(created_group, flavor=flavor)
+
+        # Ensure the second instance is on compute2
+        with utils.temporary_mutation(self.admin_api, microversion='2.53'):
+            self.admin_api.put_service(compute2_service_id,
+                                       {'status': 'enabled'})
+            compute1_service_id = self.admin_api.get_services(
+                host=self.compute.host, binary='nova-compute')[0]['id']
+            self.admin_api.put_service(compute1_service_id,
+                                       {'status': 'disabled'})
+
+        # Expects GroupAffinityViolation exception
+        failed_server = self._boot_a_server_to_group(created_group,
+                                                     flavor=flavor,
+                                                     expected_status='ERROR')
+
+        self.assertEqual('Exceeded maximum number of retries. Exhausted all '
+                         'hosts available for retrying build failures for '
+                         'instance %s.' % failed_server['id'],
+                         failed_server['fault']['message'])
+
+        ctxt = context.get_admin_context()
+        computes = objects.ComputeNodeList.get_all(ctxt)
+
+        for node in computes:
+            self.assertEqual(node.stats.get('failed_builds'), '0')
+
+    @mock.patch('nova.scheduler.filters.affinity_filter.'
+                'ServerGroupAntiAffinityFilter.host_passes', return_value=True)
+    def test_failed_count_with_anti_affinity_violation(self, mock_host_passes):
+        """Check failed count after violation of the late affinity check.
+        https://bugs.launchpad.net/nova/+bug/1996732
+        """
+
+        created_group = self.api.post_server_groups(self.anti_affinity)
+        flavor = self.api.get_flavors()[2]
+
+        # Ensure two instances are scheduled on the same host
+        with utils.temporary_mutation(self.admin_api, microversion='2.53'):
+            compute2_service_id = self.admin_api.get_services(
+                host=self.compute2.host, binary='nova-compute')[0]['id']
+            self.admin_api.put_service(compute2_service_id,
+                                       {'status': 'disabled'})
+
+        self._boot_a_server_to_group(created_group, flavor=flavor)
+
+        # Expects GroupAffinityViolation exception
+        failed_server = self._boot_a_server_to_group(created_group,
+                                                     flavor=flavor,
+                                                     expected_status='ERROR')
+
+        self.assertEqual('Exceeded maximum number of retries. Exhausted all '
+                         'hosts available for retrying build failures for '
+                         'instance %s.' % failed_server['id'],
+                         failed_server['fault']['message'])
+
+        ctxt = context.get_admin_context()
+        computes = objects.ComputeNodeList.get_all(ctxt)
+
+        for node in computes:
+            self.assertEqual(node.stats.get('failed_builds'), '0')
+

 class ServerGroupAffinityConfTest(ServerGroupTestBase):
     api_major_version = 'v2.1'
