Merge pull request #733 from rohanpm/rhui-cache-fix
Fix rhui_alias resolution during cache flush [RHELDST-26143]
rohanpm authored Aug 13, 2024
2 parents e3fde79 + 2cec920 commit e418e6e
Showing 5 changed files with 286 additions and 50 deletions.
75 changes: 70 additions & 5 deletions exodus_gw/aws/dynamodb.py
@@ -60,15 +60,77 @@ def _aliases(self, alias_types: list[str]) -> list[tuple[str, str]]:
def aliases_for_write(self) -> list[tuple[str, str]]:
# Aliases used when writing items to DynamoDB.
#
# Exclude rhui aliases (for now? RHELDST-18849).
# Note that these aliases are traversed only in the src => dest
# direction, which intentionally results in a kind of
# canonicalization of paths at publish time.
#
# Example:
#
# Given an alias of:
# src: /content/dist/rhel8/8
# dest: /content/dist/rhel8/8.8
#
# If /content/dist/rhel8/8/foo is published, we write
# an item with path /content/dist/rhel8/8.8/foo and DO NOT
# write /content/dist/rhel8/8/foo.
#
# Note also that rhui_alias is not included here.
# It's not needed because, in practice, all of the content
# accessed via those aliases is always *published* on the
# destination side of an alias.
#
# Example:
#
# Given an alias of:
# src: /content/dist/rhel8/rhui
# dest: /content/dist/rhel8
#
# Content is always published under the /content/dist/rhel8 paths,
# therefore there is no need to resolve the above alias at publish
# time.
#
# Processing rhui_alias here might well be correct, but adding it
# could also have some unintended effects.
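#
# Illustrative sketch only (hypothetical call, reusing the example
# alias above and the uri_alias helper from exodus_gw.aws.util):
#
#   aliases = [("/content/dist/rhel8/8", "/content/dist/rhel8/8.8")]
#   uri_alias("/content/dist/rhel8/8/foo", aliases)[0]
#   # => "/content/dist/rhel8/8.8/foo" (the canonical path that is written)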
return self._aliases(["origin_alias", "releasever_alias"])

@property
def aliases_for_flush(self) -> list[tuple[str, str]]:
# Aliases used when flushing cache.
out = self._aliases(["origin_alias", "releasever_alias", "rhui_alias"])

# When calculating paths for cache flush, the aliases should be resolved
# in *both* directions, so also return inverted copies of the aliases.
#
# Example:
#
# Given an alias of:
# src: /content/dist/rhel8/8
# dest: /content/dist/rhel8/8.8
#
# - If /content/dist/rhel8/8/foo is published, then
# /content/dist/rhel8/8.8/foo should also have cache flushed
# (src => dest)
# - If /content/dist/rhel8/8.8/foo is published, then
# /content/dist/rhel8/8/foo should also have cache flushed
# (dest => src)
#
# In practice, while it seems technically correct to do this for all aliases,
# this is mainly needed for RHUI content.
#
# Example:
#
# Given an alias of:
# src: /content/dist/rhel8/rhui
# dest: /content/dist/rhel8
#
# We are always publishing content only on the destination side of
# this alias. If we don't traverse the alias from dest => src then we
# will miss the fact that /content/dist/rhel8/rhui paths should also
# have cache flushed.
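#
# Minimal sketch of the inversion below (hypothetical alias data,
# reusing the rhui example above):
#
#   out = [("/content/dist/rhel8/rhui", "/content/dist/rhel8")]
#   out + [(dest, src) for (src, dest) in out]
#   # => [("/content/dist/rhel8/rhui", "/content/dist/rhel8"),
#   #     ("/content/dist/rhel8", "/content/dist/rhel8/rhui")]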
out = out + [(dest, src) for (src, dest) in out]

return out

def query_definitions(self) -> dict[str, Any]:
"""Query the definitions in the config_table. If definitions are found, return them. Otherwise,
@@ -123,7 +185,10 @@ def create_request(
# updated timestamp.
from_date = str(item.updated)

# Resolve aliases. We only write to the deepest path
# after all alias resolution, hence only using the
# first result from uri_alias.
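# For example (hypothetical paths): if uri_alias returns
#   ["/content/dist/rhel8/8.8/foo", "/content/dist/rhel8/8/foo"]
# then only "/content/dist/rhel8/8.8/foo" is written.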
web_uri = uri_alias(item.web_uri, uri_aliases)[0]

if delete:
request[table_name].append(
133 changes: 98 additions & 35 deletions exodus_gw/aws/util.py
@@ -161,43 +161,109 @@ def get_reader(cls, request):
return cls(request)


def uri_alias(uri: str, aliases: list[tuple[str, str]]) -> list[str]:
# Resolve every alias between paths within the uri (e.g.
# allow RHUI paths to be aliased to non-RHUI).
#
# Aliases are expected to come from cdn-definitions.
#
# Returns an ordered list of URIs, guaranteed to always contain
# at least one element (the original URI).
# URIs are returned for each intermediate step in alias resolution,
# and are ordered by depth, from deepest to most shallow.
#
# For example:
# - if there are no applicable aliases, returns [uri]
# - if there is one matching alias, returns [uri_beyond_alias, uri]
# - if one alias matches, and then beyond that another alias matches
# (meaning two levels deep of alias resolution), returns:
# [uri_beyond_both_aliases, uri_beyond_first_alias, uri]
#
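# A concrete sketch (hypothetical data, mirroring the releasever test
# case in tests/aws/test_uri_alias.py):
#
#   uri_alias(
#       "/content/dist/rhel8/8/foo",
#       [("/content/dist/rhel8/8", "/content/dist/rhel8/8.5")],
#   )
#   # => ["/content/dist/rhel8/8.5/foo", "/content/dist/rhel8/8/foo"]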
out: list[str] = [uri]
uri_alias_recurse(out, uri, aliases)
return out


def uri_alias_recurse(
accum: list[str],
uri: str,
aliases: list[tuple[str, str]],
depth=0,
maxdepth=4,
):
if depth > maxdepth:
# Runaway recursion breaker.
#
# There is no known path to get here, and if we ever do it's
# probably a bug.
#
# The point of this check is to avoid such a bug
# causing publish to completely break or hang.
LOG.warning(
"Aliases too deeply nested, bailing out at %s (URIs so far: %s)",
uri,
accum,
)
return

def add_out(new_uri: str) -> bool:
# Prepends a new URI to the output list
# (or shifts the existing URI to the front if it was already there)
# Return True if the URI was newly added.
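# e.g. (hypothetical values): with accum == ["/a", "/b"], add_out("/b")
# reorders accum to ["/b", "/a"] and returns False, while add_out("/c")
# gives ["/c", "/a", "/b"] and returns True.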
out = True
if new_uri in accum:
accum.remove(new_uri)
out = False
accum.insert(0, new_uri)
return out

for src, dest in aliases:
if uri.startswith(src + "/") or uri == src:
new_uri = uri.replace(src, dest, 1)
LOG.debug(
"Resolved alias:\n\tsrc: %s\n\tdest: %s",
uri,
new_uri,
extra={"event": "publish", "success": True},
)
if add_out(new_uri):
# We've calculated a new URI and it's the first
# time we've seen it. We have to recursively apply
# the aliases again to this new URI.
#
# Before doing that, we'll subtract *this* alias from the set,
# meaning that it won't be considered recursively.
#
# We do this because otherwise RHUI aliases are infinitely
# recursive. For example, since:
#
# /content/dist/rhel8/rhui
# => /content/dist/rhel8
#
# That means this also works:
# /content/dist/rhel8/rhui/rhui
# => /content/dist/rhel8/rhui
# => /content/dist/rhel8
#
# In fact you could insert as many "rhui" components into the path
# as you want and still ultimately resolve to the same path.
# That's the way the symlinks actually worked on the legacy CDN.
#
# But this is not desired behavior; we know that every alias is intended
# to be resolved in the URL at most once, hence this adjustment.
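# Hypothetical illustration of the filtering below: if the alias just
# applied was ("/content/dist/rhel8/rhui", "/content/dist/rhel8"),
# then sub_aliases is aliases minus that one pair, so the recursive
# call can never apply it a second time.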
sub_aliases = [
(subsrc, subdest)
for (subsrc, subdest) in aliases
if (subsrc, subdest) != (src, dest)
]

uri_alias_recurse(
accum,
new_uri,
sub_aliases,
depth=depth + 1,
maxdepth=maxdepth,
)


def uris_with_aliases(
@@ -211,10 +277,7 @@ def uris_with_aliases(
# We accept inputs both with and without leading '/', normalize.
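# (For example, hypothetically, both "origin/rpms/x" and "/origin/rpms/x"
# normalize to "/origin/rpms/x".)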
uri = "/" + uri.removeprefix("/")

for resolved in uri_alias(uri, aliases):
out.add(resolved)

return sorted(out)
109 changes: 106 additions & 3 deletions tests/aws/test_uri_alias.py
@@ -14,14 +14,20 @@
("/content/origin", "/origin"),
("/origin/rpm", "/origin/rpms"),
],
[
"/origin/rpms/path/to/file.iso",
"/content/origin/rpms/path/to/file.iso",
],
),
(
"/content/dist/rhel8/8/path/to/file.rpm",
[
("/content/dist/rhel8/8", "/content/dist/rhel8/8.5"),
],
[
"/content/dist/rhel8/8.5/path/to/file.rpm",
"/content/dist/rhel8/8/path/to/file.rpm",
],
),
],
ids=["origin", "releasever"],
@@ -30,7 +36,104 @@ def test_uri_alias(input, aliases, output, caplog):
caplog.set_level(DEBUG, logger="exodus-gw")
assert uri_alias(input, aliases) == output
assert (
f'"message": "Resolved alias:\\n\\tsrc: {input}\\n\\tdest: {output[0]}", '
'"event": "publish", '
'"success": true' in caplog.text
)


def test_uri_alias_multi_level_write():
# uri_alias should support resolving aliases multiple levels deep
# and return values in the right order, using single-direction aliases
# (as is typical in the write case)
uri = "/content/other/1/repo"
aliases = [
# The data here is made up as there is not currently any identified
# realistic scenario having multi-level aliases during write.
("/content/testproduct/1", "/content/testproduct/1.1.0"),
("/content/other", "/content/testproduct"),
]

out = uri_alias(uri, aliases)
assert out == [
"/content/testproduct/1.1.0/repo",
"/content/testproduct/1/repo",
"/content/other/1/repo",
]


def test_uri_alias_multi_level_flush():
# uri_alias should support resolving aliases multiple levels deep
# and return values in the right order, using bi-directional aliases
# (as is typical in the flush case).
#
# This data is realistic for the common case where releasever
# and rhui aliases are both in play.

uri = "/content/dist/rhel8/8/some-repo/"
aliases = [
# The caller is providing aliases in both src => dest and
# dest => src directions, as in the "cache flush" case.
("/content/dist/rhel8/8", "/content/dist/rhel8/8.8"),
("/content/dist/rhel8/8.8", "/content/dist/rhel8/8"),
("/content/dist/rhel8/rhui", "/content/dist/rhel8"),
("/content/dist/rhel8", "/content/dist/rhel8/rhui"),
]

out = uri_alias(uri, aliases)
# We don't verify the order here because, with bi-directional aliases
# provided, it does not really make sense to consider either side of
# the alias as "deeper" than the other.
assert sorted(out) == sorted(
[
# It should return the repo on both sides of the
# releasever alias...
"/content/dist/rhel8/8/some-repo/",
"/content/dist/rhel8/8.8/some-repo/",
# And *also* on both sides of the releasever alias, beyond
# the RHUI alias.
"/content/dist/rhel8/rhui/8/some-repo/",
"/content/dist/rhel8/rhui/8.8/some-repo/",
]
)


def test_uri_alias_limit(caplog: pytest.LogCaptureFixture):
# uri_alias applies some limit on the alias resolution depth.
#
# This test exists to exercise the path of code intended to
# prevent runaway recursion. There is no known way to actually
# trigger runaway recursion, so we are just providing an
# unrealistic config with more levels of aliasing than are
# actually used in production.

uri = "/path/a/repo"
aliases = [
("/path/a", "/path/b"),
("/path/b", "/path/c"),
("/path/c", "/path/d"),
("/path/d", "/path/e"),
("/path/e", "/path/f"),
("/path/f", "/path/g"),
("/path/g", "/path/h"),
("/path/h", "/path/i"),
]

out = uri_alias(uri, aliases)

# It should have stopped resolving aliases at some point.
# Note that exactly where it stops is rather arbitrary; the
# max depth currently is simply hardcoded.
assert out == [
"/path/f/repo",
"/path/e/repo",
"/path/d/repo",
"/path/c/repo",
"/path/b/repo",
"/path/a/repo",
]

# It should have warned us about this.
assert (
"Aliases too deeply nested, bailing out at /path/f/repo" in caplog.text
)