Merge pull request #733 from rohanpm/rhui-cache-fix
Fix rhui_alias resolution during cache flush [RHELDST-26143]
rohanpm authored Aug 13, 2024
2 parents e3fde79 + 2cec920 commit e418e6e
Showing 5 changed files with 286 additions and 50 deletions.
75 changes: 70 additions & 5 deletions exodus_gw/aws/dynamodb.py
@@ -60,15 +60,77 @@ def _aliases(self, alias_types: list[str]) -> list[tuple[str, str]]:
def aliases_for_write(self) -> list[tuple[str, str]]:
# Aliases used when writing items to DynamoDB.
#
# Exclude rhui aliases (for now? RHELDST-18849).
# Note that these aliases are traversed only in the src => dest
# direction, which intentionally results in a kind of
# canonicalization of paths at publish time.
#
# Example:
#
# Given an alias of:
# src: /content/dist/rhel8/8
# dest: /content/dist/rhel8/8.8
#
# If /content/dist/rhel8/8/foo is published, we write
# an item with path /content/dist/rhel8/8.8/foo and DO NOT
# write /content/dist/rhel8/8/foo.
#
# Note also that rhui_alias is not included here.
# It's not needed because, in practice, all of the content
# accessed via those aliases is always *published* on the
# destination side of an alias.
#
# Example:
#
# Given an alias of:
# src: /content/dist/rhel8/rhui
# dest: /content/dist/rhel8
#
# Content is always published under the /content/dist/rhel8 paths,
# therefore there is no need to resolve the above alias at publish
# time.
#
# Processing rhui_alias here might well be correct, but adding it
# could also have some unintended effects.
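#
# Illustrative sketch only (hypothetical call, reusing the example
# alias above and the uri_alias helper from exodus_gw.aws.util):
#
#   aliases = [("/content/dist/rhel8/8", "/content/dist/rhel8/8.8")]
#   uri_alias("/content/dist/rhel8/8/foo", aliases)[0]
#   # => "/content/dist/rhel8/8.8/foo" (the canonical path that is written)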
return self._aliases(["origin_alias", "releasever_alias"])

@property
def aliases_for_flush(self) -> list[tuple[str, str]]:
# Aliases used when flushing cache.
out = self._aliases(["origin_alias", "releasever_alias", "rhui_alias"])

# When calculating paths for cache flush, the aliases should be resolved
# in *both* directions, so also return inverted copies of the aliases.
#
# Example:
#
# Given an alias of:
# src: /content/dist/rhel8/8
# dest: /content/dist/rhel8/8.8
#
# - If /content/dist/rhel8/8/foo is published, then
# /content/dist/rhel8/8.8/foo should also have cache flushed
# (src => dest)
# - If /content/dist/rhel8/8.8/foo is published, then
# /content/dist/rhel8/8/foo should also have cache flushed
# (dest => src)
#
# In practice, while it seems technically correct to do this for all aliases,
# this is mainly needed for RHUI content.
#
# Example:
#
# Given an alias of:
# src: /content/dist/rhel8/rhui
# dest: /content/dist/rhel8
#
# We are always publishing content only on the destination side of
# this alias. If we don't traverse the alias from dest => src then we
# will miss the fact that /content/dist/rhel8/rhui paths should also
# have cache flushed.
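#
# Minimal sketch of the inversion below (hypothetical alias data,
# reusing the rhui example above):
#
#   out = [("/content/dist/rhel8/rhui", "/content/dist/rhel8")]
#   out + [(dest, src) for (src, dest) in out]
#   # => [("/content/dist/rhel8/rhui", "/content/dist/rhel8"),
#   #     ("/content/dist/rhel8", "/content/dist/rhel8/rhui")]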
out = out + [(dest, src) for (src, dest) in out]

return out

def query_definitions(self) -> dict[str, Any]:
"""Query the definitions in the config_table. If definitions are found, return them. Otherwise,
@@ -123,7 +185,10 @@ def create_request(
# updated timestamp.
from_date = str(item.updated)

# Resolve aliases. We only write to the deepest path
# after all alias resolution, hence only using the
# first result from uri_alias.
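# For example (hypothetical paths): if uri_alias returns
#   ["/content/dist/rhel8/8.8/foo", "/content/dist/rhel8/8/foo"]
# then only "/content/dist/rhel8/8.8/foo" is written.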
web_uri = uri_alias(item.web_uri, uri_aliases)[0]

if delete:
request[table_name].append(
133 changes: 98 additions & 35 deletions exodus_gw/aws/util.py
@@ -161,43 +161,109 @@ def get_reader(cls, request):
return cls(request)


def uri_alias(uri: str, aliases: list[tuple[str, str]]) -> list[str]:
# Resolve every alias between paths within the uri (e.g.
# allow RHUI paths to be aliased to non-RHUI).
#
# Aliases are expected to come from cdn-definitions.
#
# Returns an ordered list of URIs, guaranteed to always contain
# at least one element (the original URI).
# URIs are returned for each intermediate step in alias resolution,
# and are ordered by depth, from deepest to most shallow.
#
# For example:
# - if there are no applicable aliases, returns [uri]
# - if there is one matching alias, returns [uri_beyond_alias, uri]
# - if one alias matches, and then beyond that another alias matches
# (meaning two levels deep of alias resolution), returns:
# [uri_beyond_both_aliases, uri_beyond_first_alias, uri]
#
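# A concrete sketch (hypothetical data, mirroring the releasever test
# case in tests/aws/test_uri_alias.py):
#
#   uri_alias(
#       "/content/dist/rhel8/8/foo",
#       [("/content/dist/rhel8/8", "/content/dist/rhel8/8.5")],
#   )
#   # => ["/content/dist/rhel8/8.5/foo", "/content/dist/rhel8/8/foo"]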
out: list[str] = [uri]
uri_alias_recurse(out, uri, aliases)
return out


def uri_alias_recurse(
accum: list[str],
uri: str,
aliases: list[tuple[str, str]],
depth=0,
maxdepth=4,
):
if depth > maxdepth:
# Runaway recursion breaker.
#
# There is no known path to get here, and if we ever do it's
# probably a bug.
#
# The point of this check is to avoid such a bug
# causing publish to completely break or hang.
LOG.warning(
"Aliases too deeply nested, bailing out at %s (URIs so far: %s)",
uri,
accum,
)
return

def add_out(new_uri: str) -> bool:
# Prepends a new URI to the output list
# (or shifts the existing URI to the front if it was already there)
# Return True if the URI was newly added.
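# e.g. (hypothetical values): with accum == ["/a", "/b"], add_out("/b")
# reorders accum to ["/b", "/a"] and returns False, while add_out("/c")
# gives ["/c", "/a", "/b"] and returns True.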
out = True
if new_uri in accum:
accum.remove(new_uri)
out = False
accum.insert(0, new_uri)
return out

for src, dest in aliases:
if uri.startswith(src + "/") or uri == src:
new_uri = uri.replace(src, dest, 1)
LOG.debug(
"Resolved alias:\n\tsrc: %s\n\tdest: %s",
uri,
new_uri,
extra={"event": "publish", "success": True},
)
if add_out(new_uri):
# We've calculated a new URI and it's the first
# time we've seen it. We have to recursively apply
# the aliases again to this new URI.
#
# Before doing that, we'll subtract *this* alias from the set,
# meaning that it won't be considered recursively.
#
# We do this because otherwise RHUI aliases are infinitely
# recursive. For example, since:
#
# /content/dist/rhel8/rhui
# => /content/dist/rhel8
#
# That means this also works:
# /content/dist/rhel8/rhui/rhui
# => /content/dist/rhel8/rhui
# => /content/dist/rhel8
#
# In fact you could insert as many "rhui" components into the path
# as you want and still ultimately resolve to the same path.
# That's the way the symlinks actually worked on the legacy CDN.
#
# But this is not desired behavior; we know that every alias is intended
# to be resolved in the URL at most once, hence this adjustment.
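# Hypothetical illustration of the filtering below: if the alias just
# applied was ("/content/dist/rhel8/rhui", "/content/dist/rhel8"),
# then sub_aliases is aliases minus that one pair, so the recursive
# call can never apply it a second time.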
sub_aliases = [
(subsrc, subdest)
for (subsrc, subdest) in aliases
if (subsrc, subdest) != (src, dest)
]

uri_alias_recurse(
accum,
new_uri,
sub_aliases,
depth=depth + 1,
maxdepth=maxdepth,
)


def uris_with_aliases(
@@ -211,10 +277,7 @@ def uris_with_aliases(
# We accept inputs both with and without leading '/', normalize.
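# (For example, hypothetically, both "origin/rpms/x" and "/origin/rpms/x"
# normalize to "/origin/rpms/x".)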
uri = "/" + uri.removeprefix("/")

for resolved in uri_alias(uri, aliases):
out.add(resolved)

return sorted(out)
109 changes: 106 additions & 3 deletions tests/aws/test_uri_alias.py
@@ -14,14 +14,20 @@
("/content/origin", "/origin"),
("/origin/rpm", "/origin/rpms"),
],
[
"/origin/rpms/path/to/file.iso",
"/content/origin/rpms/path/to/file.iso",
],
),
(
"/content/dist/rhel8/8/path/to/file.rpm",
[
("/content/dist/rhel8/8", "/content/dist/rhel8/8.5"),
],
[
"/content/dist/rhel8/8.5/path/to/file.rpm",
"/content/dist/rhel8/8/path/to/file.rpm",
],
),
],
ids=["origin", "releasever"],
@@ -30,7 +36,104 @@ def test_uri_alias(input, aliases, output, caplog):
caplog.set_level(DEBUG, logger="exodus-gw")
assert uri_alias(input, aliases) == output
assert (
f'"message": "Resolved alias:\\n\\tsrc: {input}\\n\\tdest: {output[0]}", '
'"event": "publish", '
'"success": true' in caplog.text
)


def test_uri_alias_multi_level_write():
# uri_alias should support resolving aliases multiple levels deep
# and return values in the right order, using single-direction aliases
# (as is typical in the write case)
uri = "/content/other/1/repo"
aliases = [
# The data here is made up as there is not currently any identified
# realistic scenario having multi-level aliases during write.
("/content/testproduct/1", "/content/testproduct/1.1.0"),
("/content/other", "/content/testproduct"),
]

out = uri_alias(uri, aliases)
assert out == [
"/content/testproduct/1.1.0/repo",
"/content/testproduct/1/repo",
"/content/other/1/repo",
]


def test_uri_alias_multi_level_flush():
# uri_alias should support resolving aliases multiple levels deep
# and return values in the right order, using bi-directional aliases
# (as is typical in the flush case).
#
# This data is realistic for the common case where releasever
# and rhui aliases are both in play.

uri = "/content/dist/rhel8/8/some-repo/"
aliases = [
# The caller is providing aliases in both src => dest and
# dest => src directions, as in the "cache flush" case.
("/content/dist/rhel8/8", "/content/dist/rhel8/8.8"),
("/content/dist/rhel8/8.8", "/content/dist/rhel8/8"),
("/content/dist/rhel8/rhui", "/content/dist/rhel8"),
("/content/dist/rhel8", "/content/dist/rhel8/rhui"),
]

out = uri_alias(uri, aliases)
# We don't verify the order here because, with bi-directional aliases
# provided, it does not really make sense to consider either side of
# the alias as "deeper" than the other.
assert sorted(out) == sorted(
[
# It should return the repo on both sides of the
# releasever alias...
"/content/dist/rhel8/8/some-repo/",
"/content/dist/rhel8/8.8/some-repo/",
# And *also* on both sides of the releasever alias, beyond
# the RHUI alias.
"/content/dist/rhel8/rhui/8/some-repo/",
"/content/dist/rhel8/rhui/8.8/some-repo/",
]
)


def test_uri_alias_limit(caplog: pytest.LogCaptureFixture):
# uri_alias applies some limit on the alias resolution depth.
#
# This test exists to exercise the path of code intended to
# prevent runaway recursion. There is no known way to actually
# trigger runaway recursion, so we are just providing an
# unrealistic config with more levels of aliasing than are
# actually used in production.

uri = "/path/a/repo"
aliases = [
("/path/a", "/path/b"),
("/path/b", "/path/c"),
("/path/c", "/path/d"),
("/path/d", "/path/e"),
("/path/e", "/path/f"),
("/path/f", "/path/g"),
("/path/g", "/path/h"),
("/path/h", "/path/i"),
]

out = uri_alias(uri, aliases)

# It should have stopped resolving aliases at some point.
# Note that exactly where it stops is rather arbitrary; the
# max depth currently is simply hardcoded.
assert out == [
"/path/f/repo",
"/path/e/repo",
"/path/d/repo",
"/path/c/repo",
"/path/b/repo",
"/path/a/repo",
]

# It should have warned us about this.
assert (
"Aliases too deeply nested, bailing out at /path/f/repo" in caplog.text
)