Skip to content

Commit

Permalink
Routing RPW: Allow only ObjectNotFound for lookup failures (#7537)
Browse files Browse the repository at this point in the history
This stems from attempting to understand #7534/#7541. In reviewing
that, we have a hypothetical failure mode where any lookup which fails
due to a database failure (rather than a name lookup) could lead to gaps
in routes, which would require a router bump to retry.

This PR changes resolution of all items such that any unexpected DB
failures (transient or otherwise) *will* cause the router's resolution
to bail. It will take some time before it's recomputed, but we know it
will be done.
  • Loading branch information
FelixMcFelix authored Feb 20, 2025
1 parent ee1b02e commit 0e89dea
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 23 deletions.
7 changes: 6 additions & 1 deletion nexus/db-queries/src/db/datastore/network_interface.rs
Original file line number Diff line number Diff line change
Expand Up @@ -648,7 +648,12 @@ impl DataStore {
&*self.pool_connection_authorized(opctx).await?,
)
.await
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
.map_err(|e| match e {
DieselError::NotFound => Error::non_resourcetype_not_found(
"instance has no primary NIC",
),
e => public_error_from_diesel(e, ErrorHandler::Server),
})
}

/// Get network interface associated with a given probe.
Expand Down
69 changes: 47 additions & 22 deletions nexus/db-queries/src/db/datastore/vpc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ use diesel::prelude::*;
use diesel::result::DatabaseErrorKind;
use diesel::result::Error as DieselError;
use futures::stream::{self, StreamExt};
use futures::TryStreamExt;
use ipnetwork::IpNetwork;
use nexus_auth::authz::ApiResource;
use nexus_db_fixed_data::vpc::SERVICES_INTERNET_GATEWAY_DEFAULT_ROUTE_V4;
Expand Down Expand Up @@ -2560,6 +2561,21 @@ impl DataStore {
}
}

// Generally, we want all name lookups here to be fallible as we
// don't have strong links between destinations, targets, and their
// underlying resources. Such an error will be converted to a no-op
// for any child rules.
// However, we cannot allow 503s/CRDB communication failures to be
// be part of any router version -- we'll have omitted rules and
// sled-agent won't accept an update unless the version has bumped up.
// We'll eventually retry this router when the RPW is scheduled again.
fn allow_not_found<O>(e: Error) -> Option<Result<O, Error>> {
match e {
Error::ObjectNotFound { .. } | Error::NotFound { .. } => None,
e => Some(Err(e)),
}
}

// VpcSubnet routes are control-plane managed, and have a foreign-key relationship
// to subnets they mirror. Prefer this over name resolution when possible, knowing
// that some user routes still require name resolution.
Expand All @@ -2571,11 +2587,12 @@ impl DataStore {
.vpc_subnet_id(id)
.fetch()
.await
.ok()
.map(|(.., subnet)| (id, subnet))
.map_or_else(allow_not_found, |(.., subnet)| {
Some(Ok((id, subnet)))
})
})
.collect::<HashMap<_, _>>()
.await;
.try_collect::<HashMap<_, _>>()
.await?;

let mut subnets_by_name = subnets_by_id
.values()
Expand All @@ -2588,16 +2605,18 @@ impl DataStore {
continue;
}

let Some((.., subnet)) = db::lookup::LookupPath::new(opctx, self)
let Some(res) = db::lookup::LookupPath::new(opctx, self)
.vpc_id(vpc_id)
.vpc_subnet_name(Name::ref_cast(name))
.fetch()
.await
.ok()
.map_or_else(allow_not_found, |v| Some(Ok(v)))
else {
continue;
};

let (.., subnet) = res?;

subnets_by_id.insert(subnet.id(), subnet.clone());
subnets_by_name.insert(subnet.name().clone(), subnet);
}
Expand All @@ -2610,11 +2629,12 @@ impl DataStore {
.vpc_name(Name::ref_cast(&name))
.fetch()
.await
.ok()
.map(|(.., vpc)| (name, vpc))
.map_or_else(allow_not_found, |(.., vpc)| {
Some(Ok((name, vpc)))
})
})
.collect::<HashMap<_, _>>()
.await;
.try_collect::<HashMap<_, _>>()
.await?;

let inetgws = stream::iter(inetgw_names)
.filter_map(|name| async {
Expand All @@ -2623,11 +2643,12 @@ impl DataStore {
.internet_gateway_name(Name::ref_cast(&name))
.fetch()
.await
.ok()
.map(|(.., igw)| (name, igw))
.map_or_else(allow_not_found, |(.., igw)| {
Some(Ok((name, igw)))
})
})
.collect::<HashMap<_, _>>()
.await;
.try_collect::<HashMap<_, _>>()
.await?;

let instances = stream::iter(instance_names)
.filter_map(|name| async {
Expand All @@ -2636,10 +2657,11 @@ impl DataStore {
.instance_name(Name::ref_cast(&name))
.fetch()
.await
.ok()
.map(|(.., auth, inst)| (name, auth, inst))
.map_or_else(allow_not_found, |(.., auth, inst)| {
Some(Ok((name, auth, inst)))
})
})
.filter_map(|(name, authz_instance, instance)| async move {
.try_filter_map(|(name, authz_instance, instance)| async move {
// XXX: currently an instance can have one primary NIC,
// and it is not dual-stack (v4 + v6). We need
// to clarify what should be resolved in the v6 case.
Expand All @@ -2648,11 +2670,13 @@ impl DataStore {
&authz_instance,
)
.await
.ok()
.map(|primary_nic| (name, (instance, primary_nic)))
.map_or_else(allow_not_found, |primary_nic| {
Some(Ok((name, (instance, primary_nic))))
})
.transpose()
})
.collect::<HashMap<_, _>>()
.await;
.try_collect::<HashMap<_, _>>()
.await?;

// See the discussion in `resolve_firewall_rules_for_sled_agent` on
// how we should resolve name misses in route resolution.
Expand Down Expand Up @@ -3824,7 +3848,8 @@ mod tests {
.unwrap();

// Resolve the rules: we will have two entries generated by the
// VPC subnet (v4, v6).
// VPC subnet (v4, v6). The absence of an instance does not cause
// us to bail, it is simply a name resolution failure.
let routes = datastore
.vpc_resolve_router_rules(&opctx, authz_router.id())
.await
Expand Down

0 comments on commit 0e89dea

Please sign in to comment.