From e8999f0264a20bac56108fbc56e4a8a9fac61ae3 Mon Sep 17 00:00:00 2001 From: Matthew N Heler Date: Wed, 11 Sep 2024 12:28:15 -0500 Subject: [PATCH] Fix regression with live migration on shared storage The commit c1ccc1a3165ec1556c605b3b036274e992b0a09d introduced a regression when NUMA live migration was done on shared storage The live migration support for the power mgmt feature means we need to call driver.cleanup() for all NUMA instances to potentially offline pcpus that are not used any more after the instance is migrated away. However this change exposed an issue with the disk cleanup logic. Nova should never delete the instance directory if that directory is on shared storage (e.g. the nova instances path is backed by NFS). This patch will fix that behavior so live migration will function 2023.x only: Conflict resolved in compute/manager.py due to bluperint libvirt-mdev-live-migrate is not in 2023.x and the related unit test is removed. Closes-Bug: #2080436 Change-Id: Ia2bbb5b4ac728563a8aabd857ed0503449991df1 (cherry picked from commit 035b8404fce878b0a88c4741bea46135b6af51e8) (cherry picked from commit 57e037b507e5c31c9057fe463aec9cadaf2e6c63) (cherry picked from commit 0e4406a33e6907d3b335455a3f3c3ff80045ea0f) --- nova/compute/manager.py | 4 +++- nova/tests/unit/compute/test_compute_mgr.py | 4 ++-- nova/tests/unit/virt/libvirt/test_driver.py | 22 ++++++++++++++++++- nova/virt/libvirt/driver.py | 10 ++++----- .../notes/bug-2080436-568b03b5b5ba5760.yaml | 8 +++++++ 5 files changed, 39 insertions(+), 9 deletions(-) create mode 100644 releasenotes/notes/bug-2080436-568b03b5b5ba5760.yaml diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 6dad43eddcc..49d4d434d74 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -9105,7 +9105,9 @@ def _live_migration_cleanup_flags(self, migrate_data, migr_ctxt=None): # vpmem must be cleaned do_cleanup = (not migrate_data.is_shared_instance_path or has_vpmem or power_management_possible) - destroy_disks = not migrate_data.is_shared_block_storage + destroy_disks = not ( + migrate_data.is_shared_block_storage or + migrate_data.is_shared_instance_path) elif isinstance(migrate_data, migrate_data_obj.HyperVLiveMigrateData): # NOTE(claudiub): We need to cleanup any zombie Planned VM. do_cleanup = True diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index ec743172f4e..b98517721a8 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -11201,7 +11201,7 @@ def test_live_migration_cleanup_flags_shared_path_and_vpmem_libvirt(self): do_cleanup, destroy_disks = self.compute._live_migration_cleanup_flags( migrate_data, migr_ctxt) self.assertTrue(do_cleanup) - self.assertTrue(destroy_disks) + self.assertFalse(destroy_disks) def test_live_migration_cleanup_flags_block_migrate_libvirt(self): migrate_data = objects.LibvirtLiveMigrateData( @@ -11228,7 +11228,7 @@ def test_live_migration_cleanup_flags_shared_path_libvirt(self): do_cleanup, destroy_disks = self.compute._live_migration_cleanup_flags( migrate_data) self.assertFalse(do_cleanup) - self.assertTrue(destroy_disks) + self.assertFalse(destroy_disks) def test_live_migration_cleanup_flags_shared_libvirt(self): migrate_data = objects.LibvirtLiveMigrateData( diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index 0c12e57a3d7..5720a7005fa 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -20448,7 +20448,8 @@ def test_cleanup_migrate_data_shared_block_storage(self, # is_shared_block_storage=True and destroy_disks=False. instance = objects.Instance(self.context, **self.test_instance) migrate_data = objects.LibvirtLiveMigrateData( - is_shared_block_storage=True) + is_shared_block_storage=True, + is_shared_instance_path=False) drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI()) drvr.cleanup( self.context, instance, network_info={}, destroy_disks=False, @@ -20458,6 +20459,25 @@ def test_cleanup_migrate_data_shared_block_storage(self, self.assertTrue(instance.cleaned) save.assert_called_once_with() + @mock.patch.object(libvirt_driver.LibvirtDriver, 'delete_instance_files', + return_value=True) + @mock.patch.object(objects.Instance, 'save') + @mock.patch.object(libvirt_driver.LibvirtDriver, '_undefine_domain') + def test_cleanup_migrate_data_block_storage_and_share_instance_dir( + self, _undefine_domain, save, delete_instance_files + ): + # Test the case when the instance directory is on shared storage + # (e.g. NFS) and the instance is booted form volume. + instance = objects.Instance(self.context, **self.test_instance) + migrate_data = objects.LibvirtLiveMigrateData( + is_shared_block_storage=True, + is_shared_instance_path=True) + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI()) + drvr.cleanup( + self.context, instance, network_info={}, destroy_disks=False, + migrate_data=migrate_data, destroy_vifs=False) + delete_instance_files.assert_not_called() + @mock.patch.object(libvirt_driver.LibvirtDriver, 'delete_instance_files', return_value=True) @mock.patch.object(objects.Instance, 'save') diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index 96e8ec9b276..7306a1cec0f 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -1614,12 +1614,12 @@ def cleanup(self, context, instance, network_info, block_device_info=None, cleanup_instance_dir = True cleanup_instance_disks = True else: - # NOTE(mdbooth): I think the theory here was that if this is a - # migration with shared block storage then we need to delete the - # instance directory because that's not shared. I'm pretty sure - # this is wrong. + # NOTE(mheler): For shared block storage we only need to clean up + # the instance directory when it's not on a shared path. if migrate_data and 'is_shared_block_storage' in migrate_data: - cleanup_instance_dir = migrate_data.is_shared_block_storage + cleanup_instance_dir = ( + migrate_data.is_shared_block_storage and + not migrate_data.is_shared_instance_path) # NOTE(lyarwood): The following workaround allows operators to # ensure that non-shared instance directories are removed after an diff --git a/releasenotes/notes/bug-2080436-568b03b5b5ba5760.yaml b/releasenotes/notes/bug-2080436-568b03b5b5ba5760.yaml new file mode 100644 index 00000000000..fad6e75b002 --- /dev/null +++ b/releasenotes/notes/bug-2080436-568b03b5b5ba5760.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + Fixes a regression for live migration on shared storage that + was removing the backing disk and instance folder during the + cleanup of a virtual machine post live migration. + `bug 2080436 + `__ for details.