Skip to content

Commit 1f3de04

Browse files
jgallagher and smklein authored
Use a Qorb pool to choose pantry clients (#6822)
The meat of this PR is the change in implementation of `get_pantry_address`: instead of asking our internal DNS resolver to look up a crucible pantry (which does not randomize, so in practice we always get whichever pantry the DNS server listed first), we ask a Qorb connection pool for the address of a healthy client. `get_pantry_address` itself does not use the client directly and only cares about its address, but the pool does keep a client around so that it can call `pantry_status()` as a health check. (It doesn't look at the contents of the result; only whether or not the request succeeded - @jmpesp if that should be more refined, please say so.)

This partially addresses #3763; once this lands, if a pantry is down or unhealthy but still present in DNS (i.e., not expunged), Qorb + the status health checks should mean we'll pick a different pantry for new operations, instead of the current behavior of always sticking to the first pantry in DNS.

---------

Co-authored-by: Sean Klein <[email protected]>
1 parent 4e8200a commit 1f3de04

File tree

12 files changed

+195
-42
lines changed

12 files changed

+195
-42
lines changed

Cargo.lock

Lines changed: 6 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

internal-dns/resolver/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ hickory-resolver.workspace = true
1313
internal-dns-types.workspace = true
1414
omicron-common.workspace = true
1515
omicron-workspace-hack.workspace = true
16+
qorb.workspace = true
1617
reqwest = { workspace = true, features = ["rustls-tls", "stream"] }
1718
slog.workspace = true
1819
thiserror.workspace = true

internal-dns/resolver/src/resolver.rs

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,46 @@ pub enum ResolveError {
2626
NotFoundByString(String),
2727
}
2828

29+
/// A wrapper around a set of bootstrap DNS addresses, providing a convenient
30+
/// way to construct a [`qorb::resolvers::dns::DnsResolver`] for specific
31+
/// services.
32+
#[derive(Debug, Clone)]
33+
pub struct QorbResolver {
34+
bootstrap_dns_ips: Vec<SocketAddr>,
35+
}
36+
37+
impl QorbResolver {
38+
pub fn new(bootstrap_dns_ips: Vec<SocketAddr>) -> Self {
39+
Self { bootstrap_dns_ips }
40+
}
41+
42+
pub fn bootstrap_dns_ips(&self) -> &[SocketAddr] {
43+
&self.bootstrap_dns_ips
44+
}
45+
46+
pub fn for_service(
47+
&self,
48+
service: ServiceName,
49+
) -> qorb::resolver::BoxedResolver {
50+
let config = qorb::resolvers::dns::DnsResolverConfig {
51+
// Ignore the TTL returned by our servers, primarily to avoid
52+
// thrashing if they return a TTL of 0 (which they currently do:
53+
// https://github.com/oxidecomputer/omicron/issues/6790).
54+
hardcoded_ttl: Some(std::time::Duration::MAX),
55+
// We don't currently run additional internal DNS servers that
56+
// themselves need to be found via a set of bootstrap DNS IPs, but
57+
// if we did, we'd populate `resolver_service` here to tell qorb how
58+
// to find them.
59+
..Default::default()
60+
};
61+
Box::new(qorb::resolvers::dns::DnsResolver::new(
62+
qorb::service::Name(service.srv_name()),
63+
self.bootstrap_dns_ips.clone(),
64+
config,
65+
))
66+
}
67+
}
68+
2969
/// A wrapper around a DNS resolver, providing a way to conveniently
3070
/// look up IP addresses of services based on their SRV keys.
3171
#[derive(Clone)]

nexus/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ paste.workspace = true
6868
pq-sys = "*"
6969
progenitor-client.workspace = true
7070
propolis-client.workspace = true
71+
qorb.workspace = true
7172
rand.workspace = true
7273
ref-cast.workspace = true
7374
reqwest = { workspace = true, features = ["json"] }

nexus/db-queries/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ diesel.workspace = true
2222
diesel-dtrace.workspace = true
2323
dropshot.workspace = true
2424
futures.workspace = true
25+
internal-dns-resolver.workspace = true
2526
internal-dns-types.workspace = true
2627
ipnetwork.workspace = true
2728
macaddr.workspace = true

nexus/db-queries/src/db/pool.rs

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,13 @@
88
use super::Config as DbConfig;
99
use crate::db::pool_connection::{DieselPgConnector, DieselPgConnectorArgs};
1010

11+
use internal_dns_resolver::QorbResolver;
1112
use internal_dns_types::names::ServiceName;
1213
use qorb::backend;
1314
use qorb::policy::Policy;
1415
use qorb::resolver::{AllBackends, Resolver};
15-
use qorb::resolvers::dns::{DnsResolver, DnsResolverConfig};
16-
use qorb::service;
1716
use slog::Logger;
1817
use std::collections::BTreeMap;
19-
use std::net::SocketAddr;
2018
use std::sync::Arc;
2119
use tokio::sync::watch;
2220

@@ -55,19 +53,6 @@ impl Resolver for SingleHostResolver {
5553
}
5654
}
5755

58-
fn make_dns_resolver(
59-
bootstrap_dns: Vec<SocketAddr>,
60-
) -> qorb::resolver::BoxedResolver {
61-
Box::new(DnsResolver::new(
62-
service::Name(ServiceName::Cockroach.srv_name()),
63-
bootstrap_dns,
64-
DnsResolverConfig {
65-
hardcoded_ttl: Some(tokio::time::Duration::MAX),
66-
..Default::default()
67-
},
68-
))
69-
}
70-
7156
fn make_single_host_resolver(
7257
config: &DbConfig,
7358
) -> qorb::resolver::BoxedResolver {
@@ -96,11 +81,11 @@ impl Pool {
9681
///
9782
/// Creating this pool does not necessarily wait for connections to become
9883
/// available, as backends may shift over time.
99-
pub fn new(log: &Logger, bootstrap_dns: Vec<SocketAddr>) -> Self {
84+
pub fn new(log: &Logger, resolver: &QorbResolver) -> Self {
10085
// Make sure diesel-dtrace's USDT probes are enabled.
10186
usdt::register_probes().expect("Failed to register USDT DTrace probes");
10287

103-
let resolver = make_dns_resolver(bootstrap_dns);
88+
let resolver = resolver.for_service(ServiceName::Cockroach);
10489
let connector = make_postgres_connector(log);
10590

10691
let policy = Policy::default();

nexus/src/app/mod.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ use omicron_common::api::external::Error;
3232
use omicron_common::api::internal::shared::SwitchLocation;
3333
use omicron_uuid_kinds::OmicronZoneUuid;
3434
use oximeter_producer::Server as ProducerServer;
35+
use sagas::common_storage::make_pantry_connection_pool;
36+
use sagas::common_storage::PooledPantryClient;
3537
use slog::Logger;
3638
use std::collections::HashMap;
3739
use std::net::SocketAddrV6;
@@ -186,6 +188,9 @@ pub struct Nexus {
186188
// Nexus to not all fail.
187189
samael_max_issue_delay: std::sync::Mutex<Option<chrono::Duration>>,
188190

191+
/// Connection pool for Crucible pantries
192+
pantry_connection_pool: qorb::pool::Pool<PooledPantryClient>,
193+
189194
/// DNS resolver for internal services
190195
internal_resolver: internal_dns_resolver::Resolver,
191196

@@ -214,10 +219,12 @@ pub struct Nexus {
214219
impl Nexus {
215220
/// Create a new Nexus instance for the given rack id `rack_id`
216221
// TODO-polish revisit rack metadata
222+
#[allow(clippy::too_many_arguments)]
217223
pub(crate) async fn new_with_id(
218224
rack_id: Uuid,
219225
log: Logger,
220226
resolver: internal_dns_resolver::Resolver,
227+
qorb_resolver: internal_dns_resolver::QorbResolver,
221228
pool: db::Pool,
222229
producer_registry: &ProducerRegistry,
223230
config: &NexusConfig,
@@ -473,6 +480,7 @@ impl Nexus {
473480
as Arc<dyn nexus_auth::storage::Storage>,
474481
),
475482
samael_max_issue_delay: std::sync::Mutex::new(None),
483+
pantry_connection_pool: make_pantry_connection_pool(&qorb_resolver),
476484
internal_resolver: resolver.clone(),
477485
external_resolver,
478486
external_dns_servers: config
@@ -936,6 +944,12 @@ impl Nexus {
936944
&self.internal_resolver
937945
}
938946

947+
pub(crate) fn pantry_connection_pool(
948+
&self,
949+
) -> &qorb::pool::Pool<PooledPantryClient> {
950+
&self.pantry_connection_pool
951+
}
952+
939953
pub(crate) async fn dpd_clients(
940954
&self,
941955
) -> Result<HashMap<SwitchLocation, dpd_client::Client>, String> {

nexus/src/app/sagas/common_storage.rs

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,27 +8,33 @@ use super::*;
88

99
use crate::Nexus;
1010
use crucible_pantry_client::types::VolumeConstructionRequest;
11-
use internal_dns_types::names::ServiceName;
1211
use nexus_db_queries::authz;
1312
use nexus_db_queries::context::OpContext;
1413
use nexus_db_queries::db;
1514
use nexus_db_queries::db::lookup::LookupPath;
1615
use omicron_common::api::external::Error;
1716
use omicron_common::retry_until_known_result;
1817
use slog::Logger;
18+
use slog_error_chain::InlineErrorChain;
1919
use std::net::SocketAddrV6;
2020

21+
mod pantry_pool;
22+
23+
pub(crate) use pantry_pool::make_pantry_connection_pool;
24+
pub(crate) use pantry_pool::PooledPantryClient;
25+
2126
// Common Pantry operations
2227

2328
pub(crate) async fn get_pantry_address(
2429
nexus: &Arc<Nexus>,
2530
) -> Result<SocketAddrV6, ActionError> {
26-
nexus
27-
.resolver()
28-
.lookup_socket_v6(ServiceName::CruciblePantry)
29-
.await
30-
.map_err(|e| e.to_string())
31-
.map_err(ActionError::action_failed)
31+
let client = nexus.pantry_connection_pool().claim().await.map_err(|e| {
32+
ActionError::action_failed(format!(
33+
"failed to claim pantry client from pool: {}",
34+
InlineErrorChain::new(&e)
35+
))
36+
})?;
37+
Ok(client.address())
3238
}
3339

3440
pub(crate) async fn call_pantry_attach_for_disk(

nexus/src/app/sagas/common_storage/pantry_pool.rs

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
//! `qorb` support for Crucible pantry connection pooling.
6+
7+
use anyhow::anyhow;
8+
use anyhow::Context;
9+
use internal_dns_resolver::QorbResolver;
10+
use internal_dns_types::names::ServiceName;
11+
use qorb::backend;
12+
use qorb::pool;
13+
use std::net::SocketAddr;
14+
use std::net::SocketAddrV6;
15+
use std::sync::Arc;
16+
17+
/// Wrapper around a Crucible pantry client that also remembers its address.
18+
///
19+
/// In most cases when Nexus wants to pick a pantry, it doesn't actually want a
20+
/// client right then, but instead wants to write down its address for subsequent
21+
/// use (and reuse) later. This type carries around a `client` only to perform
22+
/// health checks as supported by `qorb`; the rest of Nexus only accesses its
23+
/// `address`.
24+
#[derive(Debug)]
25+
pub(crate) struct PooledPantryClient {
26+
client: crucible_pantry_client::Client,
27+
address: SocketAddrV6,
28+
}
29+
30+
impl PooledPantryClient {
31+
pub(crate) fn address(&self) -> SocketAddrV6 {
32+
self.address
33+
}
34+
}
35+
36+
/// A [`backend::Connector`] for [`PooledPantryClient`]s.
37+
#[derive(Debug)]
38+
struct PantryConnector;
39+
40+
#[async_trait::async_trait]
41+
impl backend::Connector for PantryConnector {
42+
type Connection = PooledPantryClient;
43+
44+
async fn connect(
45+
&self,
46+
backend: &backend::Backend,
47+
) -> Result<Self::Connection, backend::Error> {
48+
let address = match backend.address {
49+
SocketAddr::V6(addr) => addr,
50+
SocketAddr::V4(addr) => {
51+
return Err(backend::Error::Other(anyhow!(
52+
"unexpected IPv4 address for Crucible pantry: {addr}"
53+
)));
54+
}
55+
};
56+
let client =
57+
crucible_pantry_client::Client::new(&format!("http://{address}"));
58+
Ok(PooledPantryClient { client, address })
59+
}
60+
61+
async fn is_valid(
62+
&self,
63+
conn: &mut Self::Connection,
64+
) -> Result<(), backend::Error> {
65+
conn.client
66+
.pantry_status()
67+
.await
68+
.with_context(|| {
69+
format!("failed to fetch pantry status from {}", conn.address())
70+
})
71+
.map_err(backend::Error::Other)?;
72+
73+
Ok(())
74+
}
75+
76+
async fn on_acquire(
77+
&self,
78+
conn: &mut Self::Connection,
79+
) -> Result<(), backend::Error> {
80+
self.is_valid(conn).await
81+
}
82+
}
83+
84+
pub(crate) fn make_pantry_connection_pool(
85+
qorb_resolver: &QorbResolver,
86+
) -> pool::Pool<PooledPantryClient> {
87+
pool::Pool::new(
88+
qorb_resolver.for_service(ServiceName::CruciblePantry),
89+
Arc::new(PantryConnector),
90+
qorb::policy::Policy::default(),
91+
)
92+
}

nexus/src/app/sagas/region_replacement_drive.rs

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1035,11 +1035,6 @@ async fn srrd_drive_region_replacement_prepare(
10351035
"disk id" => ?disk.id(),
10361036
);
10371037

1038-
// XXX: internal-dns does not randomize the order of addresses
1039-
// in its responses: if the first Pantry in the list of
1040-
// addresses returned by DNS isn't responding, the drive saga
1041-
// will still continually try to use it.
1042-
10431038
let pantry_address = get_pantry_address(nexus).await?;
10441039

10451040
DriveAction::Pantry {

0 commit comments

Comments
 (0)