Commit 75e4c86

database: Archive parent and child rows "trees" one at a time
Previously, we archived deleted rows in batches of max_rows parents + their
child rows in a single database transaction. Doing it that way limited how
high a value of max_rows could be specified by the caller because of the size
of the database transaction it could generate.

For example, in a large scale deployment with hundreds of thousands of deleted
rows and constant server creation and deletion activity, a value of
max_rows=1000 might exceed the database's configured maximum packet size or
time out due to a database deadlock, forcing the operator to use a much lower
max_rows value like 100 or 50. And when the operator has e.g. 500,000 deleted
instances rows (and millions of deleted rows total) they are trying to archive,
being forced to use a max_rows value several orders of magnitude lower than the
number of rows they need to archive was a poor user experience.

This changes the logic to archive one parent row and its foreign key related
child rows one at a time in a single database transaction, while limiting the
total number of rows per table as soon as it reaches >= max_rows. Doing this
will allow operators to choose more predictable values for max_rows and get
more progress per invocation of archive_deleted_rows.

Closes-Bug: #2024258
Change-Id: I2209bf1b3320901cf603ec39163cf923b25b0359
(cherry picked from commit 697fa3c)
1 parent 8823da8 commit 75e4c86
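
In rough terms, the new batching strategy works as sketched below. This is a simplified, standalone Python illustration of the per-tree accumulation described in the commit message, not the actual SQLAlchemy-based code in nova/db/main/api.py; build_batch and get_child_rows are hypothetical stand-ins for the real statement building and for the FK walk done by _get_fk_stmts.

# Simplified sketch of the new batching strategy: accumulate one parent row
# plus all of its child rows ("tree") at a time, and stop adding trees once
# the running row count reaches max_rows. get_child_rows() is a hypothetical
# stand-in for the FK walk done by _get_fk_stmts() in the real code.

def build_batch(parent_ids, get_child_rows, max_rows):
    """Return (statements, parents_in_batch, row_count); max_rows is a soft limit."""
    statements = []          # ordered: child rows first, then their parent
    parents_in_batch = []
    num_rows_in_batch = 0

    for parent_id in parent_ids:
        child_rows = get_child_rows(parent_id)
        # Keep each parent together with its children so deleting the parent
        # cannot violate FK constraints.
        statements.extend(child_rows)
        statements.append(parent_id)
        parents_in_batch.append(parent_id)

        # One parent plus its children counts as 1 + len(child_rows) moved rows.
        num_rows_in_batch += 1 + len(child_rows)
        if num_rows_in_batch >= max_rows:
            # Soft limit: the last tree is kept whole even if it overshoots.
            break

    return statements, parents_in_batch, num_rows_in_batch


# Example: 3 deleted parents with 4, 4 and 0 child rows, max_rows=5.
children = {1: ['1.a', '1.b', '1.c', '1.d'], 2: ['2.a', '2.b', '2.c', '2.d'], 3: []}
stmts, parents, total = build_batch([1, 2, 3], lambda p: children[p], max_rows=5)
print(parents, total)   # [1] 5 -- only the first tree fits into this batch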

File tree

5 files changed: +203 / -48 lines


nova/cmd/manage.py

Lines changed: 25 additions & 10 deletions
@@ -229,10 +229,10 @@ def version(self):
         print(migration.db_version())

     @args('--max_rows', type=int, metavar='<number>', dest='max_rows',
-          help='Maximum number of deleted rows to archive. Defaults to 1000. '
-               'Note that this number does not include the corresponding '
-               'rows, if any, that are removed from the API database for '
-               'deleted instances.')
+          help='Maximum number of deleted rows to archive per table. Defaults '
+               'to 1000. Note that this number is a soft limit and does not '
+               'include the corresponding rows, if any, that are removed '
+               'from the API database for deleted instances.')
     @args('--before', metavar='<date>',
           help=('Archive rows that have been deleted before this date. '
                 'Accepts date strings in the default format output by the '
@@ -404,7 +404,10 @@ def _do_archive(
                  'cell1.instances': 5}
         :param cctxt: Cell-targeted nova.context.RequestContext if archiving
             across all cells
-        :param max_rows: Maximum number of deleted rows to archive
+        :param max_rows: Maximum number of deleted rows to archive per table.
+            Note that this number is a soft limit and does not include the
+            corresponding rows, if any, that are removed from the API database
+            for deleted instances.
         :param until_complete: Whether to run continuously until all deleted
             rows are archived
         :param verbose: Whether to print how many rows were archived per table
@@ -417,15 +420,26 @@ def _do_archive(
         """
         ctxt = context.get_admin_context()
         while True:
-            run, deleted_instance_uuids, total_rows_archived = \
+            # table_to_rows = {table_name: number_of_rows_archived}
+            # deleted_instance_uuids = ['uuid1', 'uuid2', ...]
+            table_to_rows, deleted_instance_uuids, total_rows_archived = \
                 db.archive_deleted_rows(
                     cctxt, max_rows, before=before_date, task_log=task_log)
-            for table_name, rows_archived in run.items():
+
+            for table_name, rows_archived in table_to_rows.items():
                 if cell_name:
                     table_name = cell_name + '.' + table_name
                 table_to_rows_archived.setdefault(table_name, 0)
                 table_to_rows_archived[table_name] += rows_archived
-            if deleted_instance_uuids:
+
+            # deleted_instance_uuids does not necessarily mean that any
+            # instances rows were archived because it is obtained by a query
+            # separate from the archive queries. For example, if a
+            # DBReferenceError was raised while processing the instances table,
+            # we would have skipped the table and had 0 rows archived even
+            # though deleted instances rows were found.
+            instances_archived = table_to_rows.get('instances', 0)
+            if deleted_instance_uuids and instances_archived:
                 table_to_rows_archived.setdefault(
                     'API_DB.instance_mappings', 0)
                 table_to_rows_archived.setdefault(
@@ -448,8 +462,9 @@ def _do_archive(

             # If we're not archiving until there is nothing more to archive, we
             # have reached max_rows in this cell DB or there was nothing to
-            # archive.
-            if not until_complete or not run:
+            # archive. We check the values() in case we get something like
+            # table_to_rows = {'instances': 0} back somehow.
+            if not until_complete or not any(table_to_rows.values()):
                 break
             if verbose:
                 sys.stdout.write('.')

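For context on the nova/cmd/manage.py changes above: the driver loop now keys its "keep going" decision on whether any rows were actually moved rather than on whether the returned dict was merely non-empty. Below is a minimal sketch of that control flow; archive_once is a hypothetical stand-in for db.archive_deleted_rows and the sample return values are invented purely for illustration.

# Minimal sketch of the until_complete driver loop shown in the diff above.
# archive_once() is a stand-in for db.archive_deleted_rows(); it returns
# (table_to_rows, deleted_instance_uuids, total_rows_archived).

def run_archive(archive_once, until_complete=True):
    table_to_rows_archived = {}
    while True:
        table_to_rows, deleted_instance_uuids, _ = archive_once()

        for table_name, rows_archived in table_to_rows.items():
            table_to_rows_archived.setdefault(table_name, 0)
            table_to_rows_archived[table_name] += rows_archived

        # Stop when we are not looping, or when nothing was actually moved.
        # Checking values() guards against a result like {'instances': 0}.
        if not until_complete or not any(table_to_rows.values()):
            break
    return table_to_rows_archived


# Two passes that archive rows, then one empty pass that ends the loop.
results = iter([({'instances': 1, 'instance_faults': 2}, ['uuid1'], 3),
                ({'instances': 1}, ['uuid2'], 1),
                ({}, [], 0)])
print(run_archive(lambda: next(results)))
# {'instances': 2, 'instance_faults': 2}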
nova/db/main/api.py

Lines changed: 93 additions & 38 deletions
@@ -4407,6 +4407,9 @@ def _archive_deleted_rows_for_table(
     select = select.order_by(column).limit(max_rows)
     with conn.begin():
         rows = conn.execute(select).fetchall()
+
+    # This is a list of IDs of rows that should be archived from this table,
+    # limited to a length of max_rows.
     records = [r[0] for r in rows]

     # We will archive deleted rows for this table and also generate insert and
@@ -4419,51 +4422,103 @@ def _archive_deleted_rows_for_table(

     # Keep track of any extra tablenames to number of rows that we archive by
     # following FK relationships.
-    # {tablename: extra_rows_archived}
+    #
+    # extras = {tablename: number_of_extra_rows_archived}
     extras = collections.defaultdict(int)
-    if records:
-        insert = shadow_table.insert().from_select(
-            columns, sql.select(table).where(column.in_(records))
-        ).inline()
-        delete = table.delete().where(column.in_(records))
+
+    if not records:
+        # Nothing to archive, so return.
+        return rows_archived, deleted_instance_uuids, extras
+
+    # Keep track of how many rows we accumulate for the insert+delete database
+    # transaction and cap it as soon as it is >= max_rows. Because we will
+    # archive all child rows of a parent row along with the parent at the same
+    # time, we end up with extra rows to archive in addition to len(records).
+    num_rows_in_batch = 0
+    # The sequence of query statements we will execute in a batch. These are
+    # ordered: [child1, child1, parent1, child2, child2, child2, parent2, ...]
+    # Parent + child "trees" are kept together to avoid FK constraint
+    # violations.
+    statements_in_batch = []
+    # The list of records in the batch. This is used for collecting deleted
+    # instance UUIDs in the case of the 'instances' table.
+    records_in_batch = []
+
+    # (melwitt): We will gather rows related by foreign key relationship for
+    # each deleted row, one at a time. We do it this way to keep track of and
+    # limit the total number of rows that will be archived in a single database
+    # transaction. In a large scale database with potentially hundreds of
+    # thousands of deleted rows, if we don't limit the size of the transaction
+    # based on max_rows, we can get into a situation where we get stuck not
+    # able to make much progress. The value of max_rows has to be 1) small
+    # enough to not exceed the database's max packet size limit or timeout with
+    # a deadlock but 2) large enough to make progress in an environment with a
+    # constant high volume of create and delete traffic. By archiving each
+    # parent + child rows tree one at a time, we can ensure meaningful progress
+    # can be made while allowing the caller to predictably control the size of
+    # the database transaction with max_rows.
+    for record in records:
         # Walk FK relationships and add insert/delete statements for rows that
         # refer to this table via FK constraints. fk_inserts and fk_deletes
         # will be prepended to by _get_fk_stmts if referring rows are found by
         # FK constraints.
         fk_inserts, fk_deletes = _get_fk_stmts(
-            metadata, conn, table, column, records)
-
-        # NOTE(tssurya): In order to facilitate the deletion of records from
-        # instance_mappings, request_specs and instance_group_member tables in
-        # the nova_api DB, the rows of deleted instances from the instances
-        # table are stored prior to their deletion. Basically the uuids of the
-        # archived instances are queried and returned.
-        if tablename == "instances":
-            query_select = sql.select(table.c.uuid).where(
-                table.c.id.in_(records)
-            )
-            with conn.begin():
-                rows = conn.execute(query_select).fetchall()
-            deleted_instance_uuids = [r[0] for r in rows]
+            metadata, conn, table, column, [record])
+        statements_in_batch.extend(fk_inserts + fk_deletes)
+        # statement to add parent row to shadow table
+        insert = shadow_table.insert().from_select(
+            columns, sql.select(table).where(column.in_([record]))).inline()
+        statements_in_batch.append(insert)
+        # statement to remove parent row from main table
+        delete = table.delete().where(column.in_([record]))
+        statements_in_batch.append(delete)

-        try:
-            # Group the insert and delete in a transaction.
-            with conn.begin():
-                for fk_insert in fk_inserts:
-                    conn.execute(fk_insert)
-                for fk_delete in fk_deletes:
-                    result_fk_delete = conn.execute(fk_delete)
-                    extras[fk_delete.table.name] += result_fk_delete.rowcount
-                conn.execute(insert)
-                result_delete = conn.execute(delete)
-                rows_archived += result_delete.rowcount
-        except db_exc.DBReferenceError as ex:
-            # A foreign key constraint keeps us from deleting some of
-            # these rows until we clean up a dependent table. Just
-            # skip this table for now; we'll come back to it later.
-            LOG.warning("IntegrityError detected when archiving table "
-                        "%(tablename)s: %(error)s",
-                        {'tablename': tablename, 'error': str(ex)})
+        records_in_batch.append(record)
+
+        # Check whether we have a full batch >= max_rows. Rows are counted as
+        # the number of rows that will be moved in the database transaction.
+        # So each insert+delete pair represents one row that will be moved.
+        # 1 parent + its fks
+        num_rows_in_batch += 1 + len(fk_inserts)
+
+        if max_rows is not None and num_rows_in_batch >= max_rows:
+            break
+
+    # NOTE(tssurya): In order to facilitate the deletion of records from
+    # instance_mappings, request_specs and instance_group_member tables in the
+    # nova_api DB, the rows of deleted instances from the instances table are
+    # stored prior to their deletion. Basically the uuids of the archived
+    # instances are queried and returned.
+    if tablename == "instances":
+        query_select = sql.select(table.c.uuid).where(
+            table.c.id.in_(records_in_batch))
+        with conn.begin():
+            rows = conn.execute(query_select).fetchall()
+        # deleted_instance_uuids = ['uuid1', 'uuid2', ...]
+        deleted_instance_uuids = [r[0] for r in rows]
+
+    try:
+        # Group the insert and delete in a transaction.
+        with conn.begin():
+            for statement in statements_in_batch:
+                result = conn.execute(statement)
+                result_tablename = statement.table.name
+                # Add to archived row counts if not a shadow table.
+                if not result_tablename.startswith(_SHADOW_TABLE_PREFIX):
+                    if result_tablename == tablename:
+                        # Number of tablename (parent) rows archived.
+                        rows_archived += result.rowcount
+                    else:
+                        # Number(s) of child rows archived.
+                        extras[result_tablename] += result.rowcount
+
+    except db_exc.DBReferenceError as ex:
+        # A foreign key constraint keeps us from deleting some of these rows
+        # until we clean up a dependent table. Just skip this table for now;
+        # we'll come back to it later.
+        LOG.warning("IntegrityError detected when archiving table "
+                    "%(tablename)s: %(error)s",
+                    {'tablename': tablename, 'error': str(ex)})

     conn.close()

nova/tests/functional/db/test_archive.py

Lines changed: 44 additions & 0 deletions
@@ -174,6 +174,50 @@ def test_archive_deleted_rows_incomplete(self):
                 break
         self.assertFalse(exceptions)

+    def test_archive_deleted_rows_parent_child_trees_one_at_time(self):
+        """Test that we are archiving parent + child rows "trees" one at a time
+
+        Previously, we archived deleted rows in batches of max_rows parents +
+        their child rows in a single database transaction. Doing it that way
+        limited how high a value of max_rows could be specified by the caller
+        because of the size of the database transaction it could generate.
+
+        For example, in a large scale deployment with hundreds of thousands of
+        deleted rows and constant server creation and deletion activity, a
+        value of max_rows=1000 might exceed the database's configured maximum
+        packet size or timeout due to a database deadlock, forcing the operator
+        to use a much lower max_rows value like 100 or 50.
+
+        And when the operator has e.g. 500,000 deleted instances rows (and
+        millions of deleted rows total) they are trying to archive, being
+        forced to use a max_rows value several orders of magnitude lower than
+        the number of rows they need to archive was a poor user experience.
+
+        This tests that we are archiving each parent plus their child rows one
+        at a time while limiting the total number of rows per table in a batch
+        as soon as the total number of archived rows is >= max_rows.
+        """
+        # Boot two servers and delete them, then try to archive rows.
+        for i in range(2):
+            server = self._create_server()
+            self._delete_server(server)
+
+        # Use max_rows=5 to limit the archive to one complete parent +
+        # child rows tree.
+        table_to_rows, _, _ = db.archive_deleted_rows(max_rows=5)
+
+        # We should have archived only one instance because the parent +
+        # child tree will have >= max_rows of 5.
+        self.assertEqual(1, table_to_rows['instances'])
+
+        # Archive once more to archive the other instance (and its child rows).
+        table_to_rows, _, _ = db.archive_deleted_rows(max_rows=5)
+        self.assertEqual(1, table_to_rows['instances'])
+
+        # There should be nothing else to archive.
+        _, _, total_rows_archived = db.archive_deleted_rows(max_rows=100)
+        self.assertEqual(0, total_rows_archived)
+
     def _get_table_counts(self):
         engine = db.get_engine()
         conn = engine.connect()

nova/tests/unit/cmd/test_manage.py

Lines changed: 20 additions & 0 deletions
@@ -501,6 +501,26 @@ def test_archive_deleted_rows_verbose_no_results(self, mock_get_all,
         self.assertIn('Nothing was archived.', output)
         self.assertEqual(0, result)

+    @mock.patch.object(db, 'archive_deleted_rows')
+    @mock.patch.object(objects.CellMappingList, 'get_all')
+    def test_archive_deleted_rows_deleted_instances_no_rows(self, mock_get_all,
+                                                            mock_db_archive):
+        # Simulate a scenario where we didn't archive any rows (example:
+        # DBReferenceError was raised while processing the instances table) but
+        # the separate query for deleted_instance_uuids found rows.
+        mock_db_archive.return_value = (
+            {}, [uuidsentinel.instance1, uuidsentinel.instance2], 0)
+
+        result = self.commands.archive_deleted_rows(20, verbose=True)
+
+        mock_db_archive.assert_called_once_with(
+            test.MatchType(context.RequestContext), 20, before=None,
+            task_log=False)
+        output = self.output.getvalue()
+        # If nothing was archived, there should be no purge messages
+        self.assertIn('Nothing was archived.', output)
+        self.assertEqual(0, result)
+
     @mock.patch.object(db, 'archive_deleted_rows')
     @mock.patch.object(objects.RequestSpec, 'destroy_bulk')
     @mock.patch.object(objects.InstanceGroup, 'destroy_members_bulk')
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+---
+fixes:
+  - |
+    `Bug #2024258`_: Fixes an issue with performance degradation archiving
+    databases with large numbers of foreign key related records.
+
+    Previously, deleted rows were archived in batches of max_rows parents +
+    their child rows in a single database transaction. It limited how high
+    a value of max_rows could be specified by the user because of the size of
+    the database transaction it could generate. Symptoms of the behavior were
+    exceeding the maximum configured packet size of the database or timing out
+    due to a deadlock.
+
+    The behavior has been changed to archive batches of complete parent +
+    child rows trees while limiting each batch when it has reached >= max_rows
+    records. This allows the size of the database transaction to be controlled
+    by the user and enables more rows to be archived per invocation of
+    ``nova-manage db archive_deleted_rows`` when there are a large number of
+    foreign key related records.
+
+    .. _Bug #2024258: https://bugs.launchpad.net/nova/+bug/2024258
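
To make the soft-limit behaviour described in this release note concrete, here is a small, hypothetical back-of-the-envelope sketch. The per-instance child-row count of 12 is invented for illustration; only the batching rule itself (keep whole parent + child trees, stop once the batch reaches >= max_rows) comes from the change.

# Hypothetical illustration of the "soft limit" semantics of max_rows.
# Assume each deleted instance carries 12 foreign-key-related child rows
# (a made-up number; real counts vary per deployment).
max_rows = 1000
rows_per_tree = 1 + 12           # one parent row plus its child rows

trees_per_batch = 0
rows_in_batch = 0
while rows_in_batch < max_rows:  # whole trees are kept together
    trees_per_batch += 1
    rows_in_batch += rows_per_tree

print(trees_per_batch, rows_in_batch)
# 77 1001 -- the batch stops at the first tree that pushes it to >= max_rows,
# so the transaction stays close to max_rows rows instead of growing to
# max_rows parents plus all of their children (13000 rows in this example).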
