From 8823da84e9d07309b860c3ce3ad4c9ebd3652f86 Mon Sep 17 00:00:00 2001 From: melanie witt Date: Fri, 10 Mar 2023 00:49:14 +0000 Subject: [PATCH 1/5] testing: Fix and robustify archive_deleted_rows test The regexes in test_archive_deleted_rows for multiple cells were incorrect in that they were not isolating the search pattern and rather could match with other rows in the result table as well, resulting in a false positive. This fixes the regexes and also adds one more server to the test scenario in order to make sure archive_deleted_rows iterates at least once to expose bugs that may be present in its internal iteration. This patch is in preparation for a future patch that will change the logic in archive_deleted_rows. Making this test more robust will more thoroughly test for regression. Change-Id: If39f6afb6359c67aa38cf315ec90ffa386d5c142 (cherry picked from commit f6620d48c86fb1c5034c09da6411ea46b4d9c2ed) --- nova/tests/functional/test_nova_manage.py | 94 +++++++++++++---------- 1 file changed, 54 insertions(+), 40 deletions(-) diff --git a/nova/tests/functional/test_nova_manage.py b/nova/tests/functional/test_nova_manage.py index 888b43cea09..31996df2d72 100644 --- a/nova/tests/functional/test_nova_manage.py +++ b/nova/tests/functional/test_nova_manage.py @@ -2238,57 +2238,71 @@ def setUp(self): def test_archive_deleted_rows(self): admin_context = context.get_admin_context(read_deleted='yes') - # Boot a server to cell1 - server_ids = {} - server = self._build_server(az='nova:host1') - created_server = self.api.post_server({'server': server}) - self._wait_for_state_change(created_server, 'ACTIVE') - server_ids['cell1'] = created_server['id'] - # Boot a server to cell2 - server = self._build_server(az='nova:host2') - created_server = self.api.post_server({'server': server}) - self._wait_for_state_change(created_server, 'ACTIVE') - server_ids['cell2'] = created_server['id'] - # Boot a server to cell0 (cause ERROR state prior to schedule) - server = self._build_server() - # Flavor m1.xlarge cannot be fulfilled - server['flavorRef'] = 'http://fake.server/5' - created_server = self.api.post_server({'server': server}) - self._wait_for_state_change(created_server, 'ERROR') - server_ids['cell0'] = created_server['id'] + server_ids_by_cell = collections.defaultdict(list) + # Create two servers per cell to make sure archive for table iterates + # at least once. 
+ for i in range(2): + # Boot a server to cell1 + server = self._build_server(az='nova:host1') + created_server = self.api.post_server({'server': server}) + self._wait_for_state_change(created_server, 'ACTIVE') + server_ids_by_cell['cell1'].append(created_server['id']) + # Boot a server to cell2 + server = self._build_server(az='nova:host2') + created_server = self.api.post_server({'server': server}) + self._wait_for_state_change(created_server, 'ACTIVE') + server_ids_by_cell['cell2'].append(created_server['id']) + # Boot a server to cell0 (cause ERROR state prior to schedule) + server = self._build_server() + # Flavor m1.xlarge cannot be fulfilled + server['flavorRef'] = 'http://fake.server/5' + created_server = self.api.post_server({'server': server}) + self._wait_for_state_change(created_server, 'ERROR') + server_ids_by_cell['cell0'].append(created_server['id']) + # Verify all the servers are in the databases - for cell_name, server_id in server_ids.items(): - with context.target_cell(admin_context, - self.cell_mappings[cell_name]) as cctxt: - objects.Instance.get_by_uuid(cctxt, server_id) + for cell_name, server_ids in server_ids_by_cell.items(): + for server_id in server_ids: + with context.target_cell( + admin_context, + self.cell_mappings[cell_name] + ) as cctxt: + objects.Instance.get_by_uuid(cctxt, server_id) # Delete the servers - for cell_name in server_ids.keys(): - self.api.delete_server(server_ids[cell_name]) + for cell_name, server_ids in server_ids_by_cell.items(): + for server_id in server_ids: + self.api.delete_server(server_id) # Verify all the servers are in the databases still (as soft deleted) - for cell_name, server_id in server_ids.items(): - with context.target_cell(admin_context, - self.cell_mappings[cell_name]) as cctxt: - objects.Instance.get_by_uuid(cctxt, server_id) + for cell_name, server_ids in server_ids_by_cell.items(): + for server_id in server_ids: + with context.target_cell( + admin_context, + self.cell_mappings[cell_name] + ) as cctxt: + objects.Instance.get_by_uuid(cctxt, server_id) # Archive the deleted rows self.cli.archive_deleted_rows(verbose=True, all_cells=True) - # Three instances should have been archived (cell0, cell1, cell2) + # 6 instances should have been archived (cell0, cell1, cell2) self.assertRegex(self.output.getvalue(), - r"| cell0\.instances.*\| 1.*") + r"\| cell0\.instances\s+\| 2") self.assertRegex(self.output.getvalue(), - r"| cell1\.instances.*\| 1.*") + r"\| cell1\.instances\s+\| 2") self.assertRegex(self.output.getvalue(), - r"| cell2\.instances.*\| 1.*") + r"\| cell2\.instances\s+\| 2") self.assertRegex(self.output.getvalue(), - r"| API_DB\.instance_mappings.*\| 3.*") + r"\| API_DB\.instance_mappings\s+\| 6") self.assertRegex(self.output.getvalue(), - r"| API_DB\.request_specs.*\| 3.*") + r"\| API_DB\.request_specs\s+\| 6") # Verify all the servers are gone from the cell databases - for cell_name, server_id in server_ids.items(): - with context.target_cell(admin_context, - self.cell_mappings[cell_name]) as cctxt: - self.assertRaises(exception.InstanceNotFound, - objects.Instance.get_by_uuid, - cctxt, server_id) + for cell_name, server_ids in server_ids_by_cell.items(): + for server_id in server_ids: + with context.target_cell( + admin_context, + self.cell_mappings[cell_name] + ) as cctxt: + self.assertRaises(exception.InstanceNotFound, + objects.Instance.get_by_uuid, + cctxt, server_id) class TestDBArchiveDeletedRowsMultiCellTaskLog( From 75e4c86d90ae0229069fc2eb06bfb41436be7319 Mon Sep 17 00:00:00 2001 From: melanie witt 
Date: Wed, 22 Feb 2023 23:58:33 +0000 Subject: [PATCH 2/5] database: Archive parent and child rows "trees" one at a time Previously, we archived deleted rows in batches of max_rows parents + their child rows in a single database transaction. Doing it that way limited how high a value of max_rows could be specified by the caller because of the size of the database transaction it could generate. For example, in a large scale deployment with hundreds of thousands of deleted rows and constant server creation and deletion activity, a value of max_rows=1000 might exceed the database's configured maximum packet size or timeout due to a database deadlock, forcing the operator to use a much lower max_rows value like 100 or 50. And when the operator has e.g. 500,000 deleted instances rows (and millions of deleted rows total) they are trying to archive, being forced to use a max_rows value several orders of magnitude lower than the number of rows they need to archive was a poor user experience. This changes the logic to archive one parent row and its foreign key related child rows one at a time in a single database transaction while limiting the total number of rows per table as soon as it reaches >= max_rows. Doing this will allow operators to choose more predictable values for max_rows and get more progress per invocation of archive_deleted_rows. Closes-Bug: #2024258 Change-Id: I2209bf1b3320901cf603ec39163cf923b25b0359 (cherry picked from commit 697fa3c000696da559e52b664c04cbd8d261c037) --- nova/cmd/manage.py | 35 +++-- nova/db/main/api.py | 131 +++++++++++++----- nova/tests/functional/db/test_archive.py | 44 ++++++ nova/tests/unit/cmd/test_manage.py | 20 +++ ...formance-degradation-3fdabc43398149b1.yaml | 21 +++ 5 files changed, 203 insertions(+), 48 deletions(-) create mode 100644 releasenotes/notes/db-archive-performance-degradation-3fdabc43398149b1.yaml diff --git a/nova/cmd/manage.py b/nova/cmd/manage.py index 45ae678ab4c..dda640d6767 100644 --- a/nova/cmd/manage.py +++ b/nova/cmd/manage.py @@ -229,10 +229,10 @@ def version(self): print(migration.db_version()) @args('--max_rows', type=int, metavar='', dest='max_rows', - help='Maximum number of deleted rows to archive. Defaults to 1000. ' - 'Note that this number does not include the corresponding ' - 'rows, if any, that are removed from the API database for ' - 'deleted instances.') + help='Maximum number of deleted rows to archive per table. Defaults ' + 'to 1000. Note that this number is a soft limit and does not ' + 'include the corresponding rows, if any, that are removed ' + 'from the API database for deleted instances.') @args('--before', metavar='', help=('Archive rows that have been deleted before this date. ' 'Accepts date strings in the default format output by the ' @@ -404,7 +404,10 @@ def _do_archive( 'cell1.instances': 5} :param cctxt: Cell-targeted nova.context.RequestContext if archiving across all cells - :param max_rows: Maximum number of deleted rows to archive + :param max_rows: Maximum number of deleted rows to archive per table. + Note that this number is a soft limit and does not include the + corresponding rows, if any, that are removed from the API database + for deleted instances. 
:param until_complete: Whether to run continuously until all deleted rows are archived :param verbose: Whether to print how many rows were archived per table @@ -417,15 +420,26 @@ def _do_archive( """ ctxt = context.get_admin_context() while True: - run, deleted_instance_uuids, total_rows_archived = \ + # table_to_rows = {table_name: number_of_rows_archived} + # deleted_instance_uuids = ['uuid1', 'uuid2', ...] + table_to_rows, deleted_instance_uuids, total_rows_archived = \ db.archive_deleted_rows( cctxt, max_rows, before=before_date, task_log=task_log) - for table_name, rows_archived in run.items(): + + for table_name, rows_archived in table_to_rows.items(): if cell_name: table_name = cell_name + '.' + table_name table_to_rows_archived.setdefault(table_name, 0) table_to_rows_archived[table_name] += rows_archived - if deleted_instance_uuids: + + # deleted_instance_uuids does not necessarily mean that any + # instances rows were archived because it is obtained by a query + # separate from the archive queries. For example, if a + # DBReferenceError was raised while processing the instances table, + # we would have skipped the table and had 0 rows archived even + # though deleted instances rows were found. + instances_archived = table_to_rows.get('instances', 0) + if deleted_instance_uuids and instances_archived: table_to_rows_archived.setdefault( 'API_DB.instance_mappings', 0) table_to_rows_archived.setdefault( @@ -448,8 +462,9 @@ def _do_archive( # If we're not archiving until there is nothing more to archive, we # have reached max_rows in this cell DB or there was nothing to - # archive. - if not until_complete or not run: + # archive. We check the values() in case we get something like + # table_to_rows = {'instances': 0} back somehow. + if not until_complete or not any(table_to_rows.values()): break if verbose: sys.stdout.write('.') diff --git a/nova/db/main/api.py b/nova/db/main/api.py index 7d24f974f91..ae6533f49cd 100644 --- a/nova/db/main/api.py +++ b/nova/db/main/api.py @@ -4407,6 +4407,9 @@ def _archive_deleted_rows_for_table( select = select.order_by(column).limit(max_rows) with conn.begin(): rows = conn.execute(select).fetchall() + + # This is a list of IDs of rows that should be archived from this table, + # limited to a length of max_rows. records = [r[0] for r in rows] # We will archive deleted rows for this table and also generate insert and @@ -4419,51 +4422,103 @@ def _archive_deleted_rows_for_table( # Keep track of any extra tablenames to number of rows that we archive by # following FK relationships. - # {tablename: extra_rows_archived} + # + # extras = {tablename: number_of_extra_rows_archived} extras = collections.defaultdict(int) - if records: - insert = shadow_table.insert().from_select( - columns, sql.select(table).where(column.in_(records)) - ).inline() - delete = table.delete().where(column.in_(records)) + + if not records: + # Nothing to archive, so return. + return rows_archived, deleted_instance_uuids, extras + + # Keep track of how many rows we accumulate for the insert+delete database + # transaction and cap it as soon as it is >= max_rows. Because we will + # archive all child rows of a parent row along with the parent at the same + # time, we end up with extra rows to archive in addition to len(records). + num_rows_in_batch = 0 + # The sequence of query statements we will execute in a batch. These are + # ordered: [child1, child1, parent1, child2, child2, child2, parent2, ...] 
+ # Parent + child "trees" are kept together to avoid FK constraint
+ # violations.
+ statements_in_batch = []
+ # The list of records in the batch. This is used for collecting deleted
+ # instance UUIDs in the case of the 'instances' table.
+ records_in_batch = []
+
+ # NOTE(melwitt): We will gather rows related by foreign key relationship for
+ # each deleted row, one at a time. We do it this way to keep track of and
+ # limit the total number of rows that will be archived in a single database
+ # transaction. In a large scale database with potentially hundreds of
+ # thousands of deleted rows, if we don't limit the size of the transaction
+ # based on max_rows, we can get into a situation where we get stuck not
+ # able to make much progress. The value of max_rows has to be 1) small
+ # enough to not exceed the database's max packet size limit or timeout with
+ # a deadlock but 2) large enough to make progress in an environment with a
+ # constant high volume of create and delete traffic. By archiving each
+ # parent + child rows tree one at a time, we can ensure meaningful progress
+ # can be made while allowing the caller to predictably control the size of
+ # the database transaction with max_rows.
+ for record in records:
 # Walk FK relationships and add insert/delete statements for rows that
 # refer to this table via FK constraints. fk_inserts and fk_deletes
 # will be prepended to by _get_fk_stmts if referring rows are found by
 # FK constraints.
 fk_inserts, fk_deletes = _get_fk_stmts(
- metadata, conn, table, column, records)
-
- # NOTE(tssurya): In order to facilitate the deletion of records from
- # instance_mappings, request_specs and instance_group_member tables in
- # the nova_api DB, the rows of deleted instances from the instances
- # table are stored prior to their deletion. Basically the uuids of the
- # archived instances are queried and returned.
- if tablename == "instances":
- query_select = sql.select(table.c.uuid).where(
- table.c.id.in_(records)
- )
- with conn.begin():
- rows = conn.execute(query_select).fetchall()
- deleted_instance_uuids = [r[0] for r in rows]
+ metadata, conn, table, column, [record])
+ statements_in_batch.extend(fk_inserts + fk_deletes)
+ # statement to add parent row to shadow table
+ insert = shadow_table.insert().from_select(
+ columns, sql.select(table).where(column.in_([record]))).inline()
+ statements_in_batch.append(insert)
+ # statement to remove parent row from main table
+ delete = table.delete().where(column.in_([record]))
+ statements_in_batch.append(delete)
- try:
- # Group the insert and delete in a transaction.
- with conn.begin():
- for fk_insert in fk_inserts:
- conn.execute(fk_insert)
- for fk_delete in fk_deletes:
- result_fk_delete = conn.execute(fk_delete)
- extras[fk_delete.table.name] += result_fk_delete.rowcount
- conn.execute(insert)
- result_delete = conn.execute(delete)
- rows_archived += result_delete.rowcount
- except db_exc.DBReferenceError as ex:
- # A foreign key constraint keeps us from deleting some of
- # these rows until we clean up a dependent table. Just
- # skip this table for now; we'll come back to it later.
- LOG.warning("IntegrityError detected when archiving table "
- "%(tablename)s: %(error)s",
- {'tablename': tablename, 'error': str(ex)})
+ records_in_batch.append(record)
+
+ # Check whether we have a full batch >= max_rows. Rows are counted as
+ # the number of rows that will be moved in the database transaction.
+ # So each insert+delete pair represents one row that will be moved.
+ # 1 parent + its fks
+ num_rows_in_batch += 1 + len(fk_inserts)
+
+ if max_rows is not None and num_rows_in_batch >= max_rows:
+ break
+
+ # NOTE(tssurya): In order to facilitate the deletion of records from
+ # instance_mappings, request_specs and instance_group_member tables in the
+ # nova_api DB, the rows of deleted instances from the instances table are
+ # stored prior to their deletion. Basically the uuids of the archived
+ # instances are queried and returned.
+ if tablename == "instances":
+ query_select = sql.select(table.c.uuid).where(
+ table.c.id.in_(records_in_batch))
+ with conn.begin():
+ rows = conn.execute(query_select).fetchall()
+ # deleted_instance_uuids = ['uuid1', 'uuid2', ...]
+ deleted_instance_uuids = [r[0] for r in rows]
+
+ try:
+ # Group the insert and delete in a transaction.
+ with conn.begin():
+ for statement in statements_in_batch:
+ result = conn.execute(statement)
+ result_tablename = statement.table.name
+ # Add to archived row counts if not a shadow table.
+ if not result_tablename.startswith(_SHADOW_TABLE_PREFIX):
+ if result_tablename == tablename:
+ # Number of tablename (parent) rows archived.
+ rows_archived += result.rowcount
+ else:
+ # Number(s) of child rows archived.
+ extras[result_tablename] += result.rowcount
+
+ except db_exc.DBReferenceError as ex:
+ # A foreign key constraint keeps us from deleting some of these rows
+ # until we clean up a dependent table. Just skip this table for now;
+ # we'll come back to it later.
+ LOG.warning("IntegrityError detected when archiving table "
+ "%(tablename)s: %(error)s",
+ {'tablename': tablename, 'error': str(ex)})
 conn.close()
diff --git a/nova/tests/functional/db/test_archive.py b/nova/tests/functional/db/test_archive.py
index 4a813d10a7c..1bcce9f3ee0 100644
--- a/nova/tests/functional/db/test_archive.py
+++ b/nova/tests/functional/db/test_archive.py
@@ -174,6 +174,50 @@ def test_archive_deleted_rows_incomplete(self):
 break
 self.assertFalse(exceptions)

+ def test_archive_deleted_rows_parent_child_trees_one_at_time(self):
+ """Test that we are archiving parent + child rows "trees" one at a time
+
+ Previously, we archived deleted rows in batches of max_rows parents +
+ their child rows in a single database transaction. Doing it that way
+ limited how high a value of max_rows could be specified by the caller
+ because of the size of the database transaction it could generate.
+
+ For example, in a large scale deployment with hundreds of thousands of
+ deleted rows and constant server creation and deletion activity, a
+ value of max_rows=1000 might exceed the database's configured maximum
+ packet size or timeout due to a database deadlock, forcing the operator
+ to use a much lower max_rows value like 100 or 50.
+
+ And when the operator has e.g. 500,000 deleted instances rows (and
+ millions of deleted rows total) they are trying to archive, being
+ forced to use a max_rows value several orders of magnitude lower than
+ the number of rows they need to archive was a poor user experience.
+
+ This tests that we are archiving each parent plus their child rows one
+ at a time while limiting the total number of rows per table in a batch
+ as soon as the total number of archived rows is >= max_rows.
+ """
+ # Boot two servers and delete them, then try to archive rows.
+ for i in range(2):
+ server = self._create_server()
+ self._delete_server(server)
+
+ # Use max_rows=5 to limit the archive to one complete parent +
+ # child rows tree.
+ table_to_rows, _, _ = db.archive_deleted_rows(max_rows=5) + + # We should have archived only one instance because the parent + + # child tree will have >= max_rows of 5. + self.assertEqual(1, table_to_rows['instances']) + + # Archive once more to archive the other instance (and its child rows). + table_to_rows, _, _ = db.archive_deleted_rows(max_rows=5) + self.assertEqual(1, table_to_rows['instances']) + + # There should be nothing else to archive. + _, _, total_rows_archived = db.archive_deleted_rows(max_rows=100) + self.assertEqual(0, total_rows_archived) + def _get_table_counts(self): engine = db.get_engine() conn = engine.connect() diff --git a/nova/tests/unit/cmd/test_manage.py b/nova/tests/unit/cmd/test_manage.py index 10c1a77c948..0bd4db284e7 100644 --- a/nova/tests/unit/cmd/test_manage.py +++ b/nova/tests/unit/cmd/test_manage.py @@ -501,6 +501,26 @@ def test_archive_deleted_rows_verbose_no_results(self, mock_get_all, self.assertIn('Nothing was archived.', output) self.assertEqual(0, result) + @mock.patch.object(db, 'archive_deleted_rows') + @mock.patch.object(objects.CellMappingList, 'get_all') + def test_archive_deleted_rows_deleted_instances_no_rows(self, mock_get_all, + mock_db_archive): + # Simulate a scenario where we didn't archive any rows (example: + # DBReferenceError was raised while processing the instances table) but + # the separate query for deleted_instance_uuids found rows. + mock_db_archive.return_value = ( + {}, [uuidsentinel.instance1, uuidsentinel.instance2], 0) + + result = self.commands.archive_deleted_rows(20, verbose=True) + + mock_db_archive.assert_called_once_with( + test.MatchType(context.RequestContext), 20, before=None, + task_log=False) + output = self.output.getvalue() + # If nothing was archived, there should be no purge messages + self.assertIn('Nothing was archived.', output) + self.assertEqual(0, result) + @mock.patch.object(db, 'archive_deleted_rows') @mock.patch.object(objects.RequestSpec, 'destroy_bulk') @mock.patch.object(objects.InstanceGroup, 'destroy_members_bulk') diff --git a/releasenotes/notes/db-archive-performance-degradation-3fdabc43398149b1.yaml b/releasenotes/notes/db-archive-performance-degradation-3fdabc43398149b1.yaml new file mode 100644 index 00000000000..537d67b3838 --- /dev/null +++ b/releasenotes/notes/db-archive-performance-degradation-3fdabc43398149b1.yaml @@ -0,0 +1,21 @@ +--- +fixes: + - | + `Bug #2024258`_: Fixes an issue with performance degradation archiving + databases with large numbers of foreign key related records. + + Previously, deleted rows were archived in batches of max_rows parents + + their child rows in a single database transaction. It limited how high + a value of max_rows could be specified by the user because of the size of + the database transaction it could generate. Symptoms of the behavior were + exceeding the maximum configured packet size of the database or timing out + due to a deadlock. + + The behavior has been changed to archive batches of complete parent + + child rows trees while limiting each batch when it has reached >= max_rows + records. This allows the size of the database transaction to be controlled + by the user and enables more rows to be archived per invocation of + ``nova-manage db archive_deleted_rows`` when there are a large number of + foreign key related records. + + .. 
_Bug #2024258: https://bugs.launchpad.net/nova/+bug/2024258

From 68b9934baa6badc6fa35af0500c2b79220099428 Mon Sep 17 00:00:00 2001
From: Sean Mooney
Date: Tue, 21 May 2024 17:53:07 +0100
Subject: [PATCH 3/5] add functional reproducer for bug 2065927

Today, if the write sys call to offline a cpu when deleting an instance
fails due to an OSError or ValueError, the instance delete fails and the
instance goes to ERROR.

As reported in bug #2065927, this can happen as a result of
OSError: [Errno 16] Device or resource busy
if the VM is deleted shortly after it is started.

Related-Bug: #2065927
Change-Id: I1352a3a1e28cfe14ec8f32042ed35cb25e70338e
(cherry picked from commit ee581a5c9d1c0b7c0d8830a08f55fe8bc2fbcd0f)
(cherry picked from commit f1c46802b109db0b8e62f461ef0b432fa7c0984e)
(cherry picked from commit 254ea7bbfa697a717d8c1512d9c33e67b1f57ac9)
---
 .../functional/libvirt/test_power_manage.py | 33 +++++++++++++++++--
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/nova/tests/functional/libvirt/test_power_manage.py b/nova/tests/functional/libvirt/test_power_manage.py
index f6e4b1ba5b4..d707168778b 100644
--- a/nova/tests/functional/libvirt/test_power_manage.py
+++ b/nova/tests/functional/libvirt/test_power_manage.py
@@ -10,15 +10,16 @@
 # License for the specific language governing permissions and limitations
 # under the License.

-from unittest import mock
-
 import fixtures
+from unittest import mock
+
 from nova import context as nova_context
 from nova import exception
 from nova import objects
 from nova.tests import fixtures as nova_fixtures
 from nova.tests.fixtures import libvirt as fakelibvirt
+from nova.tests.functional.api import client
 from nova.tests.functional.libvirt import base
 from nova.virt import hardware
 from nova.virt.libvirt.cpu import api as cpu_api
@@ -240,7 +241,6 @@ def setUp(self):
 cpu_cores=5, cpu_threads=2)
 self.compute1 = self.start_compute(host_info=self.host_info,
 hostname='compute1')
-
 # All cores are shutdown at startup, let's check.
 cpu_dedicated_set = hardware.get_cpu_dedicated_set()
 self._assert_cpu_set_state(cpu_dedicated_set, expected='offline')
@@ -266,6 +266,33 @@ def test_create_server(self):
 cpu_dedicated_set = hardware.get_cpu_dedicated_set()
 unused_cpus = cpu_dedicated_set - instance_pcpus
 self._assert_cpu_set_state(unused_cpus, expected='offline')
+ return server
+
+ def test_delete_server(self):
+ server = self.test_create_server()
+ self._delete_server(server)
+ # Let's verify that the pinned CPUs are now offline
+ cpu_dedicated_set = hardware.get_cpu_dedicated_set()
+ self._assert_cpu_set_state(cpu_dedicated_set, expected='offline')
+
+ def test_delete_server_device_busy(self):
+ server = self.test_create_server()
+
+ inst = objects.Instance.get_by_uuid(self.ctxt, server['id'])
+ instance_pcpus = inst.numa_topology.cpu_pinning
+ self._assert_cpu_set_state(instance_pcpus, expected='online')
+ with mock.patch(
+ 'nova.filesystem.write_sys',
+ side_effect=exception.FileNotFound(file_path='fake')):
+ # This is bug 2065927
+ self.assertRaises(
+ client.OpenStackApiException, self._delete_server, server)
+ cpu_dedicated_set = hardware.get_cpu_dedicated_set()
+ # Verify that the unused CPUs are still offline
+ unused_cpus = cpu_dedicated_set - instance_pcpus
+ self._assert_cpu_set_state(unused_cpus, expected='offline')
+ # but the instance cpus are still online
+ self._assert_cpu_set_state(instance_pcpus, expected='online')

 def test_create_server_with_emulator_threads_isolate(self):
 server = self._create_server(

From 4c02c2d7b30f3463de508e1e711b7dbbd3d39f5f Mon Sep 17 00:00:00 2001
From: Sean Mooney
Date: Wed, 22 May 2024 18:59:02 +0100
Subject: [PATCH 4/5] retry write_sys call on device busy

This change adds a retry_if_busy decorator to the read_sys and
write_sys functions in the filesystem module that will retry reads and
writes up to 5 times with a linear backoff.

This allows nova to tolerate short periods of time where sysfs
returns device busy.

If the retries are exhausted and offlining a core fails, a warning
is logged and the failure is ignored. Onlining a core is always treated
as a hard error if retries are exhausted.
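
For illustration only, the retry behaviour described above amounts to
roughly the following standalone sketch; this is not the code added by
the patch, DeviceBusy stands in for nova.exception.DeviceBusy, and
RETRY_LIMIT mirrors the module-level constant added to nova/filesystem.py:

    import functools
    import time

    RETRY_LIMIT = 5

    class DeviceBusy(Exception):
        # Stand-in for nova.exception.DeviceBusy (raised on EBUSY).
        pass

    def retry_if_busy(func):
        # Retry up to RETRY_LIMIT attempts, sleeping 1s, 2s, 3s, ... between
        # attempts (linear backoff), re-raising once retries are exhausted.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, RETRY_LIMIT + 1):
                try:
                    return func(*args, **kwargs)
                except DeviceBusy:
                    if attempt == RETRY_LIMIT:
                        raise
                    time.sleep(attempt)
        return wrapper

The actual decorator in the diff below additionally depends on read_sys
and write_sys translating EBUSY OSErrors into DeviceBusy before retrying.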
Closes-Bug: #2065927 Change-Id: I2a6a9f243cb403167620405e167a8dd2bbf3fa79 (cherry picked from commit 44c1b48b3121682cf959c90b3adaf2a3f92e318c) (cherry picked from commit 1581f6695f00c6b4fb694b6e946a4ed204c1680f) (cherry picked from commit 6a475acb480a375702886a97a3a3f7d5755b4da3) --- nova/exception.py | 4 ++ nova/filesystem.py | 55 ++++++++++++++++++- .../functional/libvirt/test_power_manage.py | 18 +++--- nova/tests/unit/test_filesystem.py | 47 ++++++++++++++++ nova/virt/libvirt/cpu/core.py | 12 +++- 5 files changed, 123 insertions(+), 13 deletions(-) diff --git a/nova/exception.py b/nova/exception.py index c3ada86c1e2..3ba9ed64cfe 100644 --- a/nova/exception.py +++ b/nova/exception.py @@ -1131,6 +1131,10 @@ class FileNotFound(NotFound): msg_fmt = _("File %(file_path)s could not be found.") +class DeviceBusy(NovaException): + msg_fmt = _("device %(file_path)s is busy.") + + class ClassNotFound(NotFound): msg_fmt = _("Class %(class_name)s could not be found: %(exception)s") diff --git a/nova/filesystem.py b/nova/filesystem.py index 5394d2d8351..2db3093236e 100644 --- a/nova/filesystem.py +++ b/nova/filesystem.py @@ -12,37 +12,79 @@ """Functions to address filesystem calls, particularly sysfs.""" +import functools import os +import time from oslo_log import log as logging from nova import exception LOG = logging.getLogger(__name__) +SYS = '/sys' +RETRY_LIMIT = 5 -SYS = '/sys' +# a retry decorator to handle EBUSY +def retry_if_busy(func): + """Decorator to retry a function if it raises DeviceBusy. + + This decorator will retry the function RETRY_LIMIT=5 times if it raises + DeviceBusy. It will sleep for 1 second on the first retry, 2 seconds on + the second retry, and so on, up to RETRY_LIMIT seconds. If the function + still raises DeviceBusy after RETRY_LIMIT retries, the exception will be + raised. + """ + @functools.wraps(func) + def wrapper(*args, **kwargs): + for i in range(RETRY_LIMIT): + try: + return func(*args, **kwargs) + except exception.DeviceBusy as e: + # if we have retried RETRY_LIMIT times, raise the exception + # otherwise, sleep and retry, i is 0-based so we need + # to add 1 to it + count = i + 1 + if count < RETRY_LIMIT: + LOG.debug( + f"File {e.kwargs['file_path']} is busy, " + f"sleeping {count} seconds before retrying") + time.sleep(count) + continue + raise + return wrapper # NOTE(bauzas): this method is deliberately not wrapped in a privsep entrypoint + + +@retry_if_busy def read_sys(path: str) -> str: """Reads the content of a file in the sys filesystem. :param path: relative or absolute. If relative, will be prefixed by /sys. :returns: contents of that file. :raises: nova.exception.FileNotFound if we can't read that file. + :raises: nova.exception.DeviceBusy if the file is busy. """ try: # The path can be absolute with a /sys prefix but that's fine. with open(os.path.join(SYS, path), mode='r') as data: return data.read() - except (OSError, ValueError) as exc: + except OSError as exc: + # errno 16 is EBUSY, which means the file is busy. + if exc.errno == 16: + raise exception.DeviceBusy(file_path=path) from exc + # convert permission denied to file not found + raise exception.FileNotFound(file_path=path) from exc + except ValueError as exc: raise exception.FileNotFound(file_path=path) from exc # NOTE(bauzas): this method is deliberately not wrapped in a privsep entrypoint # In order to correctly use it, you need to decorate the caller with a specific # privsep entrypoint. 
+@retry_if_busy def write_sys(path: str, data: str) -> None: """Writes the content of a file in the sys filesystem with data. @@ -50,10 +92,17 @@ def write_sys(path: str, data: str) -> None: :param data: the data to write. :returns: contents of that file. :raises: nova.exception.FileNotFound if we can't write that file. + :raises: nova.exception.DeviceBusy if the file is busy. """ try: # The path can be absolute with a /sys prefix but that's fine. with open(os.path.join(SYS, path), mode='w') as fd: fd.write(data) - except (OSError, ValueError) as exc: + except OSError as exc: + # errno 16 is EBUSY, which means the file is busy. + if exc.errno == 16: + raise exception.DeviceBusy(file_path=path) from exc + # convert permission denied to file not found + raise exception.FileNotFound(file_path=path) from exc + except ValueError as exc: raise exception.FileNotFound(file_path=path) from exc diff --git a/nova/tests/functional/libvirt/test_power_manage.py b/nova/tests/functional/libvirt/test_power_manage.py index d707168778b..3839c798317 100644 --- a/nova/tests/functional/libvirt/test_power_manage.py +++ b/nova/tests/functional/libvirt/test_power_manage.py @@ -19,7 +19,6 @@ from nova import objects from nova.tests import fixtures as nova_fixtures from nova.tests.fixtures import libvirt as fakelibvirt -from nova.tests.functional.api import client from nova.tests.functional.libvirt import base from nova.virt import hardware from nova.virt.libvirt.cpu import api as cpu_api @@ -276,23 +275,24 @@ def test_delete_server(self): self._assert_cpu_set_state(cpu_dedicated_set, expected='offline') def test_delete_server_device_busy(self): + # This test verifies bug 2065927 is resolved. server = self.test_create_server() - inst = objects.Instance.get_by_uuid(self.ctxt, server['id']) instance_pcpus = inst.numa_topology.cpu_pinning self._assert_cpu_set_state(instance_pcpus, expected='online') with mock.patch( - 'nova.filesystem.write_sys', - side_effect=exception.FileNotFound(file_path='fake')): - # This is bug 2065927 - self.assertRaises( - client.OpenStackApiException, self._delete_server, server) + 'nova.filesystem.write_sys.__wrapped__', + side_effect=[ + exception.DeviceBusy(file_path='fake'), + None]): + + self._delete_server(server) cpu_dedicated_set = hardware.get_cpu_dedicated_set() # Verify that the unused CPUs are still offline unused_cpus = cpu_dedicated_set - instance_pcpus self._assert_cpu_set_state(unused_cpus, expected='offline') - # but the instance cpus are still online - self._assert_cpu_set_state(instance_pcpus, expected='online') + # and the pinned CPUs are offline + self._assert_cpu_set_state(instance_pcpus, expected='offline') def test_create_server_with_emulator_threads_isolate(self): server = self._create_server( diff --git a/nova/tests/unit/test_filesystem.py b/nova/tests/unit/test_filesystem.py index 85f16157eeb..5c3107472d8 100644 --- a/nova/tests/unit/test_filesystem.py +++ b/nova/tests/unit/test_filesystem.py @@ -10,6 +10,7 @@ # License for the specific language governing permissions and limitations # under the License. 
+import io import os from unittest import mock @@ -35,6 +36,29 @@ def test_read_sys_error(self): expected_path = os.path.join(filesystem.SYS, 'foo') m_open.assert_called_once_with(expected_path, mode='r') + def test_read_sys_retry(self): + open_mock = mock.mock_open() + with mock.patch('builtins.open', open_mock) as m_open: + m_open.side_effect = [ + OSError(16, 'Device or resource busy'), + io.StringIO('bar'), + ] + self.assertEqual('bar', filesystem.read_sys('foo')) + expected_path = os.path.join(filesystem.SYS, 'foo') + m_open.assert_has_calls([ + mock.call(expected_path, mode='r'), + mock.call(expected_path, mode='r'), + ]) + + def test_read_sys_retry_limit(self): + open_mock = mock.mock_open() + with mock.patch('builtins.open', open_mock) as m_open: + m_open.side_effect = ( + [OSError(16, 'Device or resource busy')] * + (filesystem.RETRY_LIMIT + 1)) + self.assertRaises( + exception.DeviceBusy, filesystem.read_sys, 'foo') + def test_write_sys(self): open_mock = mock.mock_open() with mock.patch('builtins.open', open_mock) as m_open: @@ -50,3 +74,26 @@ def test_write_sys_error(self): filesystem.write_sys, 'foo', 'bar') expected_path = os.path.join(filesystem.SYS, 'foo') m_open.assert_called_once_with(expected_path, mode='w') + + def test_write_sys_retry(self): + open_mock = mock.mock_open() + with mock.patch('builtins.open', open_mock) as m_open: + m_open.side_effect = [ + OSError(16, 'Device or resource busy'), + io.StringIO(), + ] + self.assertIsNone(filesystem.write_sys('foo', 'bar')) + expected_path = os.path.join(filesystem.SYS, 'foo') + m_open.assert_has_calls([ + mock.call(expected_path, mode='w'), + mock.call(expected_path, mode='w'), + ]) + + def test_write_sys_retry_limit(self): + open_mock = mock.mock_open() + with mock.patch('builtins.open', open_mock) as m_open: + m_open.side_effect = ( + [OSError(16, 'Device or resource busy')] * + (filesystem.RETRY_LIMIT + 1)) + self.assertRaises( + exception.DeviceBusy, filesystem.write_sys, 'foo', 'bar') diff --git a/nova/virt/libvirt/cpu/core.py b/nova/virt/libvirt/cpu/core.py index 8274236850c..2d71bd60e40 100644 --- a/nova/virt/libvirt/cpu/core.py +++ b/nova/virt/libvirt/cpu/core.py @@ -56,13 +56,23 @@ def get_online(core: int) -> bool: @nova.privsep.sys_admin_pctxt.entrypoint def set_online(core: int) -> bool: + # failure to online a core should be considered a failure + # so we don't catch any exception here. filesystem.write_sys(os.path.join(gen_cpu_path(core), 'online'), data='1') return get_online(core) @nova.privsep.sys_admin_pctxt.entrypoint def set_offline(core: int) -> bool: - filesystem.write_sys(os.path.join(gen_cpu_path(core), 'online'), data='0') + try: + filesystem.write_sys(os.path.join( + gen_cpu_path(core), 'online'), data='0') + except exception.DeviceBusy: + # if nova is not able to offline a core it should not break anything + # so we just log a warning and return False to indicate that the core + # is not offline. 
+ LOG.warning('Unable to offline CPU: %s', core) + return False return not get_online(core) From 1d788f11890385658f9485a281c1beeede94a830 Mon Sep 17 00:00:00 2001 From: Balazs Gibizer Date: Fri, 4 Oct 2024 13:46:44 +0200 Subject: [PATCH 5/5] [doc]Fix the device_spec config doc about placement Change-Id: Id08761918ccc3477a39d9c778f3ac92679c22511 (cherry picked from commit fca941adb73b49ec272850126c1c98c6c8ef4b01) (cherry picked from commit ea0f46f80d645cf2f51c9e06df2d5e45a77f54cf) (cherry picked from commit 2c98f91826b71843b988b77d7f26659c1f466adc) (cherry picked from commit ed7371f4f9f1fd69ff6d1594f65c21a89cbfdd41) --- nova/conf/pci.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nova/conf/pci.py b/nova/conf/pci.py index 533bf52eadd..64cb89a787b 100644 --- a/nova/conf/pci.py +++ b/nova/conf/pci.py @@ -175,8 +175,9 @@ its corresponding PF), otherwise they will be ignored and not available for allocation. - ``resource_class`` - optional Placement resource class name to be used - to track the matching PCI devices in Placement when [pci]device_spec is - True. It can be a standard resource class from the + to track the matching PCI devices in Placement when + [pci]report_in_placement is True. + It can be a standard resource class from the ``os-resource-classes`` lib. Or can be any string. In that case Nova will normalize it to a proper Placement resource class by making it upper case, replacing any consecutive character outside of ``[A-Z0-9_]`` with a