From 1dbbf1fd23983686602ed88417a30aeac8e45a18 Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Sat, 1 Feb 2025 12:53:59 -0800 Subject: [PATCH 1/8] bump to v13; API version to planned release date (#7458) --- dev-tools/openapi-manager/src/spec.rs | 2 +- dev-tools/releng/src/main.rs | 2 +- nexus/external-api/src/lib.rs | 2 +- openapi/nexus.json | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dev-tools/openapi-manager/src/spec.rs b/dev-tools/openapi-manager/src/spec.rs index 57cbbea5dab..1aee73f12ae 100644 --- a/dev-tools/openapi-manager/src/spec.rs +++ b/dev-tools/openapi-manager/src/spec.rs @@ -104,7 +104,7 @@ pub fn all_apis() -> Vec { }, ApiSpec { title: "Oxide Region API", - version: semver::Version::new(20241204, 0, 0), + version: semver::Version::new(20250212, 0, 0), description: "API for interacting with the Oxide control plane", boundary: ApiBoundary::External, api_description: diff --git a/dev-tools/releng/src/main.rs b/dev-tools/releng/src/main.rs index 2f0508c41a2..ba71d8f8c03 100644 --- a/dev-tools/releng/src/main.rs +++ b/dev-tools/releng/src/main.rs @@ -42,7 +42,7 @@ use crate::job::Jobs; /// to as "v8", "version 8", or "release 8" to customers). The use of semantic /// versioning is mostly to hedge for perhaps wanting something more granular in /// the future. -const BASE_VERSION: Version = Version::new(12, 0, 0); +const BASE_VERSION: Version = Version::new(13, 0, 0); const RETRY_ATTEMPTS: usize = 3; diff --git a/nexus/external-api/src/lib.rs b/nexus/external-api/src/lib.rs index a832bde3199..d76192072a8 100644 --- a/nexus/external-api/src/lib.rs +++ b/nexus/external-api/src/lib.rs @@ -23,7 +23,7 @@ use omicron_common::api::external::{ use openapi_manager_types::ValidationContext; use openapiv3::OpenAPI; -pub const API_VERSION: &str = "20241204.0.0"; +pub const API_VERSION: &str = "20250212.0.0"; const MIB: usize = 1024 * 1024; const GIB: usize = 1024 * MIB; diff --git a/openapi/nexus.json b/openapi/nexus.json index 45f2d4787cf..c94b5970c08 100644 --- a/openapi/nexus.json +++ b/openapi/nexus.json @@ -7,7 +7,7 @@ "url": "https://oxide.computer", "email": "api@oxide.computer" }, - "version": "20241204.0.0" + "version": "20250212.0.0" }, "paths": { "/device/auth": { From 1a0bb7424a04fa221178ef958a49dd492c96b50b Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Sat, 1 Feb 2025 14:36:18 -0800 Subject: [PATCH 2/8] nexus: split out control plane zones as separate artifacts (#7452) The primary change here is adding a control knob to `ArtifactsWithPlan::from_stream` that either splits out the artifacts or combines all the zones into a single tarball. `ControlPlaneZonesMode::Composite` is the current behavior and is used in Wicket and existing unit tests. For the moment this doesn't support handling repositories with artifacts already split out, but the semantics of this setting are that if/when we do that, it will combine them into a single artifact for Wicket. `ControlPlaneZonesMode::Split` enables the new behavior and is used in Nexus. This makes the individual zone images available in the TUF Repo Depot and therefore immediately usable by Sled Agent when creating zones. 
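As a rough sketch of the knob (arguments abbreviated; the real call sites are in the Nexus and wicketd hunks below):

    // Nexus: store each zone image as its own `Zone` artifact.
    ArtifactsWithPlan::from_stream(
        body, Some(file_name), ControlPlaneZonesMode::Split, &log,
    )
    .await?;

    // Wicket: keep the single composite `ControlPlane` tarball.
    ArtifactsWithPlan::from_stream(
        body, None, ControlPlaneZonesMode::Composite, &log,
    )
    .await?;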
Additionally, we add a check to assembling a TUF repo to ensure that no individual zone images have the same hash; since the fake zones created during tests had the same hashes, we learned such a repo would not be accepted by Nexus due to this check: https://github.com/oxidecomputer/omicron/blob/e5bf48fdba89707257ef6ba75375714359941370/update-common/src/artifacts/update_plan.rs#L875-L882 Closes #4411. (If we want to split out the control plane zones in actual repos we can open a new issue for that.) --- Cargo.lock | 18 ++ Cargo.toml | 3 + brand-metadata/Cargo.toml | 15 + brand-metadata/src/lib.rs | 151 ++++++++++ common/src/api/internal/nexus.rs | 3 + .../tasks/tuf_artifact_replication.rs | 2 +- nexus/src/app/update/mod.rs | 15 +- nexus/tests/integration_tests/updates.rs | 21 ++ sled-agent/Cargo.toml | 1 + sled-agent/src/bootstrap/http_entrypoints.rs | 10 +- sled-agent/src/updates.rs | 91 ++---- tufaceous-lib/Cargo.toml | 2 + tufaceous-lib/src/artifact.rs | 37 ++- tufaceous-lib/src/artifact/composite.rs | 37 ++- tufaceous-lib/src/assemble/manifest.rs | 40 ++- tufaceous-lib/src/lib.rs | 1 - tufaceous-lib/src/oxide_metadata.rs | 133 --------- tufaceous/manifests/fake.toml | 4 +- update-common/Cargo.toml | 4 + .../src/artifacts/artifacts_with_plan.rs | 97 +++++- .../src/artifacts/extracted_artifacts.rs | 7 +- update-common/src/artifacts/update_plan.rs | 279 +++++++++++++++--- wicketd/src/update_tracker.rs | 5 +- wicketd/tests/integration_tests/updates.rs | 9 +- 24 files changed, 672 insertions(+), 313 deletions(-) create mode 100644 brand-metadata/Cargo.toml create mode 100644 brand-metadata/src/lib.rs delete mode 100644 tufaceous-lib/src/oxide_metadata.rs diff --git a/Cargo.lock b/Cargo.lock index 20a6d44e2e5..41dacdffcb2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6673,6 +6673,17 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "omicron-brand-metadata" +version = "0.1.0" +dependencies = [ + "omicron-workspace-hack", + "semver 1.0.24", + "serde", + "serde_json", + "tar", +] + [[package]] name = "omicron-certificates" version = "0.1.0" @@ -7336,6 +7347,7 @@ dependencies = [ "nexus-reconfigurator-blippy", "nexus-sled-agent-shared", "nexus-types", + "omicron-brand-metadata", "omicron-common", "omicron-ddm-admin-client", "omicron-test-utils", @@ -12406,11 +12418,13 @@ dependencies = [ "hex", "hubtools", "itertools 0.13.0", + "omicron-brand-metadata", "omicron-common", "omicron-test-utils", "omicron-workspace-hack", "parse-size", "rand", + "semver 1.0.24", "serde", "serde_json", "serde_path_to_error", @@ -12752,15 +12766,19 @@ dependencies = [ "debug-ignore", "display-error-chain", "dropshot 0.15.1", + "flate2", + "fs-err", "futures", "hex", "hubtools", + "omicron-brand-metadata", "omicron-common", "omicron-test-utils", "omicron-workspace-hack", "rand", "sha2", "slog", + "tar", "thiserror 1.0.69", "tokio", "tokio-util", diff --git a/Cargo.toml b/Cargo.toml index 98f571b8267..b0505ae695e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,7 @@ members = [ "api_identity", "bootstore", + "brand-metadata", "certificates", "clickhouse-admin", "clickhouse-admin/api", @@ -135,6 +136,7 @@ members = [ default-members = [ "api_identity", "bootstore", + "brand-metadata", "certificates", "clickhouse-admin", "clickhouse-admin/api", @@ -493,6 +495,7 @@ nexus-types = { path = "nexus/types" } nom = "7.1.3" num-integer = "0.1.46" num = { version = "0.4.3", default-features = false, features = [ "libm" ] } +omicron-brand-metadata = { path = "brand-metadata" } omicron-clickhouse-admin = { path = 
"clickhouse-admin" } omicron-certificates = { path = "certificates" } omicron-cockroach-admin = { path = "cockroach-admin" } diff --git a/brand-metadata/Cargo.toml b/brand-metadata/Cargo.toml new file mode 100644 index 00000000000..98462c695e2 --- /dev/null +++ b/brand-metadata/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "omicron-brand-metadata" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[dependencies] +omicron-workspace-hack.workspace = true +semver.workspace = true +serde.workspace = true +serde_json.workspace = true +tar.workspace = true + +[lints] +workspace = true diff --git a/brand-metadata/src/lib.rs b/brand-metadata/src/lib.rs new file mode 100644 index 00000000000..f25b120ac63 --- /dev/null +++ b/brand-metadata/src/lib.rs @@ -0,0 +1,151 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Handling of `oxide.json` metadata files in tarballs. +//! +//! `oxide.json` is originally defined by the omicron1(7) zone brand, which +//! lives at . tufaceous +//! extended this format with additional archive types for identifying other +//! types of tarballs; this crate covers those extensions so they can be used +//! across the Omicron codebase. + +use std::io::{Error, ErrorKind, Read, Result, Write}; + +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct Metadata { + v: String, + + // helios-build-utils defines a top-level `i` field for extra information, + // but omicron-package doesn't use this for the package name and version. + // We can also benefit from having rich types for these extra fields, so + // any additional top-level fields (including `i`) that exist for a given + // archive type should be deserialized as part of `ArchiveType`. + #[serde(flatten)] + t: ArchiveType, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename_all = "snake_case", tag = "t")] +pub enum ArchiveType { + // Originally defined in helios-build-utils (part of helios-omicron-brand): + Baseline, + Layer(LayerInfo), + Os, + + // tufaceous extensions: + Rot, + ControlPlane, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct LayerInfo { + pub pkg: String, + pub version: semver::Version, +} + +impl Metadata { + pub fn new(archive_type: ArchiveType) -> Metadata { + Metadata { v: "1".into(), t: archive_type } + } + + pub fn append_to_tar( + &self, + a: &mut tar::Builder, + mtime: u64, + ) -> Result<()> { + let mut b = serde_json::to_vec(self)?; + b.push(b'\n'); + + let mut h = tar::Header::new_ustar(); + h.set_entry_type(tar::EntryType::Regular); + h.set_username("root")?; + h.set_uid(0); + h.set_groupname("root")?; + h.set_gid(0); + h.set_path("oxide.json")?; + h.set_mode(0o444); + h.set_size(b.len().try_into().unwrap()); + h.set_mtime(mtime); + h.set_cksum(); + + a.append(&h, b.as_slice())?; + Ok(()) + } + + /// Read `Metadata` from a tar archive. + /// + /// `oxide.json` is generally the first file in the archive, so this should + /// be a just-opened archive with no entries already read. + pub fn read_from_tar(a: &mut tar::Archive) -> Result { + for entry in a.entries()? { + let mut entry = entry?; + if entry.path()? 
== std::path::Path::new("oxide.json") { + return Ok(serde_json::from_reader(&mut entry)?); + } + } + Err(Error::new(ErrorKind::InvalidData, "oxide.json is not present")) + } + + pub fn archive_type(&self) -> &ArchiveType { + &self.t + } + + pub fn is_layer(&self) -> bool { + matches!(&self.t, ArchiveType::Layer(_)) + } + + pub fn layer_info(&self) -> Result<&LayerInfo> { + match &self.t { + ArchiveType::Layer(info) => Ok(info), + _ => Err(Error::new( + ErrorKind::InvalidData, + "archive is not the \"layer\" type", + )), + } + } + + pub fn is_baseline(&self) -> bool { + matches!(&self.t, ArchiveType::Baseline) + } + + pub fn is_os(&self) -> bool { + matches!(&self.t, ArchiveType::Os) + } + + pub fn is_rot(&self) -> bool { + matches!(&self.t, ArchiveType::Rot) + } + + pub fn is_control_plane(&self) -> bool { + matches!(&self.t, ArchiveType::ControlPlane) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_deserialize() { + let metadata: Metadata = serde_json::from_str( + r#"{"v":"1","t":"layer","pkg":"nexus","version":"12.0.0-0.ci+git3a2ed5e97b3"}"#, + ) + .unwrap(); + assert!(metadata.is_layer()); + let info = metadata.layer_info().unwrap(); + assert_eq!(info.pkg, "nexus"); + assert_eq!(info.version, "12.0.0-0.ci+git3a2ed5e97b3".parse().unwrap()); + + let metadata: Metadata = serde_json::from_str( + r#"{"v":"1","t":"os","i":{"checksum":"42eda100ee0e3bf44b9d0bb6a836046fa3133c378cd9d3a4ba338c3ba9e56eb7","name":"ci 3a2ed5e/9d37813 2024-12-20 08:54"}}"#, + ).unwrap(); + assert!(metadata.is_os()); + + let metadata: Metadata = + serde_json::from_str(r#"{"v":"1","t":"control_plane"}"#).unwrap(); + assert!(metadata.is_control_plane()); + } +} diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 6705569ae4c..0396ffc28fa 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -305,7 +305,10 @@ pub enum KnownArtifactKind { GimletRotBootloader, Host, Trampoline, + /// Composite artifact of all control plane zones ControlPlane, + /// Individual control plane zone + Zone, // PSC Artifacts PscSp, diff --git a/nexus/src/app/background/tasks/tuf_artifact_replication.rs b/nexus/src/app/background/tasks/tuf_artifact_replication.rs index b540c7db905..3ae62b8d400 100644 --- a/nexus/src/app/background/tasks/tuf_artifact_replication.rs +++ b/nexus/src/app/background/tasks/tuf_artifact_replication.rs @@ -126,7 +126,7 @@ enum ArtifactHandle { } impl ArtifactHandle { - async fn file(&self) -> anyhow::Result { + async fn file(&self) -> std::io::Result { match self { ArtifactHandle::Extracted(handle) => handle.file().await, #[cfg(test)] diff --git a/nexus/src/app/update/mod.rs b/nexus/src/app/update/mod.rs index 05ed9d71899..01e7285478c 100644 --- a/nexus/src/app/update/mod.rs +++ b/nexus/src/app/update/mod.rs @@ -13,7 +13,7 @@ use nexus_db_queries::context::OpContext; use omicron_common::api::external::{ Error, SemverVersion, TufRepoInsertResponse, TufRepoInsertStatus, }; -use update_common::artifacts::ArtifactsWithPlan; +use update_common::artifacts::{ArtifactsWithPlan, ControlPlaneZonesMode}; mod common_sp_update; mod host_phase1_updater; @@ -51,10 +51,15 @@ impl super::Nexus { Error::internal_error("updates system not initialized") })?; - let artifacts_with_plan = - ArtifactsWithPlan::from_stream(body, Some(file_name), &self.log) - .await - .map_err(|error| error.to_http_error())?; + let artifacts_with_plan = ArtifactsWithPlan::from_stream( + body, + Some(file_name), + ControlPlaneZonesMode::Split, + &self.log, + ) + .await + 
.map_err(|error| error.to_http_error())?; + // Now store the artifacts in the database. let tuf_repo_description = TufRepoDescription::from_external( artifacts_with_plan.description().clone(), diff --git a/nexus/tests/integration_tests/updates.rs b/nexus/tests/integration_tests/updates.rs index 51d93f49caf..37821a1b881 100644 --- a/nexus/tests/integration_tests/updates.rs +++ b/nexus/tests/integration_tests/updates.rs @@ -131,6 +131,27 @@ async fn test_repo_upload() -> Result<()> { .map(|artifact| artifact.hash) .collect::>() .len(); + // The repository description should have `Zone` artifacts instead of the + // composite `ControlPlane` artifact. + assert_eq!( + initial_description + .artifacts + .iter() + .filter_map(|artifact| { + if artifact.id.kind == KnownArtifactKind::Zone.into() { + Some(&artifact.id.name) + } else { + None + } + }) + .collect::>(), + ["zone1", "zone2"] + ); + assert!(!initial_description + .artifacts + .iter() + .any(|artifact| artifact.id.kind + == KnownArtifactKind::ControlPlane.into())); // The artifact replication background task should have been activated, and // we should see a local repo and successful PUTs. diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index 12efddda079..fa437cb6ba3 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -108,6 +108,7 @@ omicron-workspace-hack.workspace = true slog-error-chain.workspace = true walkdir.workspace = true zip.workspace = true +omicron-brand-metadata.workspace = true [target.'cfg(target_os = "illumos")'.dependencies] opte-ioctl.workspace = true diff --git a/sled-agent/src/bootstrap/http_entrypoints.rs b/sled-agent/src/bootstrap/http_entrypoints.rs index a29eaed376f..4e87e94e911 100644 --- a/sled-agent/src/bootstrap/http_entrypoints.rs +++ b/sled-agent/src/bootstrap/http_entrypoints.rs @@ -28,6 +28,7 @@ use sled_agent_types::rack_ops::RackOperationStatus; use sled_hardware_types::Baseboard; use sled_storage::manager::StorageHandle; use slog::Logger; +use slog_error_chain::InlineErrorChain; use sprockets_tls::keys::SprocketsConfig; use std::net::Ipv6Addr; use tokio::sync::mpsc::error::TrySendError; @@ -85,10 +86,11 @@ impl BootstrapAgentApi for BootstrapAgentImpl { ) -> Result>, HttpError> { let ctx = rqctx.context(); let updates = UpdateManager::new(ctx.updates.clone()); - let components = updates - .components_get() - .await - .map_err(|err| HttpError::for_internal_error(err.to_string()))?; + let components = updates.components_get().await.map_err(|err| { + HttpError::for_internal_error( + InlineErrorChain::new(&err).to_string(), + ) + })?; Ok(HttpResponseOk(components)) } diff --git a/sled-agent/src/updates.rs b/sled-agent/src/updates.rs index 3633ec0dccc..3fd810c2534 100644 --- a/sled-agent/src/updates.rs +++ b/sled-agent/src/updates.rs @@ -6,42 +6,28 @@ use bootstrap_agent_api::Component; use camino::{Utf8Path, Utf8PathBuf}; -use omicron_common::api::internal::nexus::UpdateArtifactId; +use omicron_brand_metadata::Metadata; +use omicron_common::api::external::SemverVersion; use serde::{Deserialize, Serialize}; -use std::io::Read; #[derive(thiserror::Error, Debug)] pub enum Error { - #[error("I/O Error: {message}: {err}")] + #[error("I/O Error: while accessing {path}")] Io { - message: String, + path: Utf8PathBuf, #[source] err: std::io::Error, }, - #[error("Utf-8 error converting path: {0}")] - FromPathBuf(#[from] camino::FromPathBufError), - - #[error( - "sled-agent only supports applying zones, found artifact ID {}/{} with kind {}", - .0.name, .0.version, .0.kind - )] - 
UnsupportedKind(UpdateArtifactId), - - #[error("Version not found in artifact {}", 0)] - VersionNotFound(Utf8PathBuf), - - #[error("Cannot parse json: {0}")] - Json(#[from] serde_json::Error), - - #[error("Malformed version in artifact {path}: {why}")] - VersionMalformed { path: Utf8PathBuf, why: String }, + #[error("failed to read zone version from {path}")] + ZoneVersion { + path: Utf8PathBuf, + #[source] + err: std::io::Error, + }, - #[error("Cannot parse semver in {path}: {err}")] + #[error("Cannot parse semver in {path}")] Semver { path: Utf8PathBuf, err: semver::Error }, - - #[error("Failed request to Nexus: {0}")] - Response(nexus_client::Error), } fn default_zone_artifact_path() -> Utf8PathBuf { @@ -61,16 +47,8 @@ impl Default for ConfigUpdates { } } -// Helper functions for returning errors -fn version_malformed_err(path: &Utf8Path, key: &str) -> Error { - Error::VersionMalformed { - path: path.to_path_buf(), - why: format!("Missing '{key}'"), - } -} - fn io_err(path: &Utf8Path, err: std::io::Error) -> Error { - Error::Io { message: format!("Cannot access {path}"), err } + Error::Io { path: path.into(), err } } pub struct UpdateManager { @@ -87,46 +65,19 @@ impl UpdateManager { &self, path: &Utf8Path, ) -> Result { - // Decode the zone image let file = std::fs::File::open(path).map_err(|err| io_err(path, err))?; let gzr = flate2::read::GzDecoder::new(file); let mut component_reader = tar::Archive::new(gzr); - let entries = - component_reader.entries().map_err(|err| io_err(path, err))?; - - // Look for the JSON file which contains the package information - for entry in entries { - let mut entry = entry.map_err(|err| io_err(path, err))?; - let entry_path = entry.path().map_err(|err| io_err(path, err))?; - if entry_path == Utf8Path::new("oxide.json") { - let mut contents = String::new(); - entry - .read_to_string(&mut contents) - .map_err(|err| io_err(path, err))?; - let json: serde_json::Value = - serde_json::from_str(contents.as_str())?; - - // Parse keys from the JSON file - let serde_json::Value::String(pkg) = &json["pkg"] else { - return Err(version_malformed_err(path, "pkg")); - }; - let serde_json::Value::String(version) = &json["version"] - else { - return Err(version_malformed_err(path, "version")); - }; - - // Extract the name and semver version - let name = pkg.to_string(); - let version = omicron_common::api::external::SemverVersion( - semver::Version::parse(version).map_err(|err| { - Error::Semver { path: path.to_path_buf(), err } - })?, - ); - return Ok(crate::updates::Component { name, version }); - } - } - Err(Error::VersionNotFound(path.to_path_buf())) + let metadata = Metadata::read_from_tar(&mut component_reader) + .map_err(|err| Error::ZoneVersion { path: path.into(), err })?; + let info = metadata + .layer_info() + .map_err(|err| Error::ZoneVersion { path: path.into(), err })?; + Ok(Component { + name: info.pkg.clone(), + version: SemverVersion(info.version.clone()), + }) } pub async fn components_get(&self) -> Result, Error> { diff --git a/tufaceous-lib/Cargo.toml b/tufaceous-lib/Cargo.toml index f5b7bab3937..c3814b1215f 100644 --- a/tufaceous-lib/Cargo.toml +++ b/tufaceous-lib/Cargo.toml @@ -25,10 +25,12 @@ futures.workspace = true hex.workspace = true hubtools.workspace = true itertools.workspace = true +omicron-brand-metadata.workspace = true omicron-common.workspace = true omicron-workspace-hack.workspace = true parse-size.workspace = true rand.workspace = true +semver.workspace = true serde.workspace = true serde_json.workspace = true 
serde_path_to_error.workspace = true diff --git a/tufaceous-lib/src/artifact.rs b/tufaceous-lib/src/artifact.rs index c46540617b4..486744e3ceb 100644 --- a/tufaceous-lib/src/artifact.rs +++ b/tufaceous-lib/src/artifact.rs @@ -12,10 +12,9 @@ use buf_list::BufList; use bytes::Bytes; use camino::Utf8PathBuf; use fs_err::File; +use omicron_brand_metadata::Metadata; use omicron_common::{api::external::SemverVersion, update::ArtifactKind}; -use crate::oxide_metadata; - mod composite; pub use composite::CompositeControlPlaneArchiveBuilder; @@ -162,7 +161,7 @@ impl HostPhaseImages { .context("error reading path from archive")?; if path == Path::new(OXIDE_JSON_FILE_NAME) { let json_bytes = read_entry(entry, OXIDE_JSON_FILE_NAME)?; - let metadata: oxide_metadata::Metadata = + let metadata: Metadata = serde_json::from_slice(&json_bytes).with_context(|| { format!( "error deserializing JSON from {OXIDE_JSON_FILE_NAME}" @@ -282,7 +281,7 @@ impl RotArchives { .context("error reading path from archive")?; if path == Path::new(OXIDE_JSON_FILE_NAME) { let json_bytes = read_entry(entry, OXIDE_JSON_FILE_NAME)?; - let metadata: oxide_metadata::Metadata = + let metadata: Metadata = serde_json::from_slice(&json_bytes).with_context(|| { format!( "error deserializing JSON from {OXIDE_JSON_FILE_NAME}" @@ -346,24 +345,40 @@ pub struct ControlPlaneZoneImages { impl ControlPlaneZoneImages { pub fn extract(reader: R) -> Result { + let mut zones = Vec::new(); + Self::extract_into(reader, |name, reader| { + let mut buf = Vec::new(); + io::copy(reader, &mut buf)?; + zones.push((name, buf.into())); + Ok(()) + })?; + Ok(Self { zones }) + } + + pub fn extract_into(reader: R, mut handler: F) -> Result<()> + where + R: io::Read, + F: FnMut(String, &mut dyn io::Read) -> Result<()>, + { let uncompressed = flate2::bufread::GzDecoder::new(BufReader::new(reader)); let mut archive = tar::Archive::new(uncompressed); let mut oxide_json_found = false; - let mut zones = Vec::new(); + let mut zone_found = false; for entry in archive .entries() .context("error building list of entries from archive")? 
{ - let entry = entry.context("error reading entry from archive")?; + let mut entry = + entry.context("error reading entry from archive")?; let path = entry .header() .path() .context("error reading path from archive")?; if path == Path::new(OXIDE_JSON_FILE_NAME) { let json_bytes = read_entry(entry, OXIDE_JSON_FILE_NAME)?; - let metadata: oxide_metadata::Metadata = + let metadata: Metadata = serde_json::from_slice(&json_bytes).with_context(|| { format!( "error deserializing JSON from {OXIDE_JSON_FILE_NAME}" @@ -382,9 +397,9 @@ impl ControlPlaneZoneImages { .and_then(|s| s.to_str()) .map(|s| s.to_string()) { - let data = read_entry(entry, &name)?; - zones.push((name, data)); + handler(name, &mut entry)?; } + zone_found = true; } } @@ -395,14 +410,14 @@ impl ControlPlaneZoneImages { if !not_found.is_empty() { bail!("required files not found: {}", not_found.join(", ")) } - if zones.is_empty() { + if !zone_found { bail!( "no zone images found in `{}/`", CONTROL_PLANE_ARCHIVE_ZONE_DIRECTORY ); } - Ok(Self { zones }) + Ok(()) } } diff --git a/tufaceous-lib/src/artifact/composite.rs b/tufaceous-lib/src/artifact/composite.rs index e7793868c99..50574d704f3 100644 --- a/tufaceous-lib/src/artifact/composite.rs +++ b/tufaceous-lib/src/artifact/composite.rs @@ -7,8 +7,6 @@ use super::HOST_PHASE_1_FILE_NAME; use super::HOST_PHASE_2_FILE_NAME; use super::ROT_ARCHIVE_A_FILE_NAME; use super::ROT_ARCHIVE_B_FILE_NAME; -use crate::oxide_metadata; -use crate::oxide_metadata::Metadata; use anyhow::anyhow; use anyhow::bail; use anyhow::Context; @@ -16,6 +14,10 @@ use anyhow::Result; use camino::Utf8Path; use flate2::write::GzEncoder; use flate2::Compression; +use omicron_brand_metadata::{ArchiveType, Metadata}; +use sha2::Digest; +use sha2::Sha256; +use std::collections::HashMap; use std::io::BufWriter; use std::io::Write; @@ -33,18 +35,15 @@ pub struct CompositeEntry<'a> { pub struct CompositeControlPlaneArchiveBuilder { inner: CompositeTarballBuilder, + hashes: HashMap<[u8; 32], String>, } impl CompositeControlPlaneArchiveBuilder { pub fn new(writer: W, mtime_source: MtimeSource) -> Result { - let metadata = oxide_metadata::MetadataBuilder::new( - oxide_metadata::ArchiveType::ControlPlane, - ) - .build() - .context("error building oxide metadata")?; + let metadata = Metadata::new(ArchiveType::ControlPlane); let inner = CompositeTarballBuilder::new(writer, metadata, mtime_source)?; - Ok(Self { inner }) + Ok(Self { inner, hashes: HashMap::new() }) } pub fn append_zone( @@ -56,6 +55,14 @@ impl CompositeControlPlaneArchiveBuilder { if name_path.file_name() != Some(name) { bail!("control plane zone filenames should not contain paths"); } + if let Some(duplicate) = + self.hashes.insert(Sha256::digest(&entry.data).into(), name.into()) + { + bail!( + "duplicate zones are not allowed \ + ({name} and {duplicate} have the same checksum)" + ); + } let path = Utf8Path::new(CONTROL_PLANE_ARCHIVE_ZONE_DIRECTORY).join(name_path); self.inner.append_file(path.as_str(), entry) @@ -72,11 +79,7 @@ pub struct CompositeRotArchiveBuilder { impl CompositeRotArchiveBuilder { pub fn new(writer: W, mtime_source: MtimeSource) -> Result { - let metadata = oxide_metadata::MetadataBuilder::new( - oxide_metadata::ArchiveType::Rot, - ) - .build() - .context("error building oxide metadata")?; + let metadata = Metadata::new(ArchiveType::Rot); let inner = CompositeTarballBuilder::new(writer, metadata, mtime_source)?; Ok(Self { inner }) @@ -107,11 +110,7 @@ pub struct CompositeHostArchiveBuilder { impl CompositeHostArchiveBuilder { pub fn 
new(writer: W, mtime_source: MtimeSource) -> Result { - let metadata = oxide_metadata::MetadataBuilder::new( - oxide_metadata::ArchiveType::Os, - ) - .build() - .context("error building oxide metadata")?; + let metadata = Metadata::new(ArchiveType::Os); let inner = CompositeTarballBuilder::new(writer, metadata, mtime_source)?; Ok(Self { inner }) @@ -144,7 +143,7 @@ impl CompositeTarballBuilder { BufWriter::new(writer), Compression::fast(), )); - metadata.append_to_tar(&mut builder, mtime_source)?; + metadata.append_to_tar(&mut builder, mtime_source.into_mtime())?; Ok(Self { builder }) } diff --git a/tufaceous-lib/src/assemble/manifest.rs b/tufaceous-lib/src/assemble/manifest.rs index fa5df799821..3203cdfc825 100644 --- a/tufaceous-lib/src/assemble/manifest.rs +++ b/tufaceous-lib/src/assemble/manifest.rs @@ -224,7 +224,9 @@ impl ArtifactManifest { /// Checks that all expected artifacts are present, returning an error with /// details if any artifacts are missing. pub fn verify_all_present(&self) -> Result<()> { - let all_artifacts: BTreeSet<_> = KnownArtifactKind::iter().collect(); + let all_artifacts: BTreeSet<_> = KnownArtifactKind::iter() + .filter(|k| !matches!(k, KnownArtifactKind::Zone)) + .collect(); let present_artifacts: BTreeSet<_> = self.artifacts.keys().copied().collect(); @@ -261,7 +263,8 @@ impl<'a> FakeDataAttributes<'a> { // non-Hubris artifacts: just make fake data KnownArtifactKind::Host | KnownArtifactKind::Trampoline - | KnownArtifactKind::ControlPlane => return make_filler_text(size), + | KnownArtifactKind::ControlPlane + | KnownArtifactKind::Zone => return make_filler_text(size), // hubris artifacts: build a fake archive (SimGimletSp and // SimGimletRot are used by sp-sim) @@ -539,15 +542,40 @@ impl DeserializedControlPlaneZoneSource { })?; // For now, always use the current time as the source. (Maybe // change this to use the mtime on disk in the future?) 
- (name, data, MtimeSource::Now) + (name.to_owned(), data, MtimeSource::Now) } DeserializedControlPlaneZoneSource::Fake { name, size } => { - let data = make_filler_text(*size as usize); - (name.as_str(), data, MtimeSource::Zero) + use flate2::{write::GzEncoder, Compression}; + use omicron_brand_metadata::{ + ArchiveType, LayerInfo, Metadata, + }; + + let mut tar = tar::Builder::new(GzEncoder::new( + Vec::new(), + Compression::fast(), + )); + + let metadata = Metadata::new(ArchiveType::Layer(LayerInfo { + pkg: name.clone(), + version: semver::Version::new(0, 0, 0), + })); + metadata.append_to_tar(&mut tar, 0)?; + + let mut h = tar::Header::new_ustar(); + h.set_entry_type(tar::EntryType::Regular); + h.set_path("fake")?; + h.set_mode(0o444); + h.set_size(*size); + h.set_mtime(0); + h.set_cksum(); + tar.append(&h, make_filler_text(*size as usize).as_slice())?; + + let data = tar.into_inner()?.finish()?; + (format!("{name}.tar.gz"), data, MtimeSource::Zero) } }; let entry = CompositeEntry { data: &data, mtime_source }; - f(name, entry) + f(&name, entry) } fn apply_size_delta(&mut self, size_delta: i64) -> Result<()> { diff --git a/tufaceous-lib/src/lib.rs b/tufaceous-lib/src/lib.rs index f1b0e392856..bf6fd5e03a7 100644 --- a/tufaceous-lib/src/lib.rs +++ b/tufaceous-lib/src/lib.rs @@ -6,7 +6,6 @@ mod archive; mod artifact; pub mod assemble; mod key; -pub mod oxide_metadata; mod repository; mod root; mod target; diff --git a/tufaceous-lib/src/oxide_metadata.rs b/tufaceous-lib/src/oxide_metadata.rs deleted file mode 100644 index 43f0c67df71..00000000000 --- a/tufaceous-lib/src/oxide_metadata.rs +++ /dev/null @@ -1,133 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! This file is a copy of -//! . -//! Once that is open sourced, we should switch to using that. - -/* - * Copyright 2023 Oxide Computer Company - */ - -use std::collections::HashMap; - -use anyhow::{bail, Result}; -use serde::{Deserialize, Serialize}; - -use crate::MtimeSource; - -#[derive(Clone, Copy, Debug, Deserialize, Serialize)] -#[serde(rename_all = "snake_case")] -pub enum ArchiveType { - Baseline, - Layer, - Os, - Rot, - ControlPlane, -} - -#[derive(Clone, Debug, Deserialize, Serialize)] -pub struct Metadata { - v: String, - t: ArchiveType, - #[serde(default, skip_serializing_if = "HashMap::is_empty")] - i: HashMap, -} - -pub fn parse(s: &str) -> Result { - let m: Metadata = serde_json::from_str(s)?; - if m.v != "1" { - bail!("unexpected metadata version {}", m.v); - } - Ok(m) -} - -impl Metadata { - pub fn append_to_tar( - &self, - a: &mut tar::Builder, - mtime_source: MtimeSource, - ) -> Result<()> { - let mut b = serde_json::to_vec(self)?; - b.push(b'\n'); - - // XXX This was changed from upstream to add oxide.json with optionally - // a zero timestamp, to ensure stability of fake manifests. 
- let mtime = mtime_source.into_mtime(); - - let mut h = tar::Header::new_ustar(); - h.set_entry_type(tar::EntryType::Regular); - h.set_username("root")?; - h.set_uid(0); - h.set_groupname("root")?; - h.set_gid(0); - h.set_path("oxide.json")?; - h.set_mode(0o444); - h.set_size(b.len().try_into().unwrap()); - h.set_mtime(mtime); - h.set_cksum(); - - a.append(&h, b.as_slice())?; - Ok(()) - } - - pub fn is_layer(&self) -> bool { - matches!(&self.t, ArchiveType::Layer) - } - - pub fn is_baseline(&self) -> bool { - matches!(&self.t, ArchiveType::Baseline) - } - - pub fn is_os(&self) -> bool { - matches!(&self.t, ArchiveType::Os) - } - - pub fn is_rot(&self) -> bool { - matches!(&self.t, ArchiveType::Rot) - } - - pub fn is_control_plane(&self) -> bool { - matches!(&self.t, ArchiveType::ControlPlane) - } - - pub fn archive_type(&self) -> ArchiveType { - self.t - } - - pub fn info(&self) -> &HashMap { - &self.i - } -} - -pub struct MetadataBuilder { - archive_type: ArchiveType, - info: HashMap, -} - -impl MetadataBuilder { - pub fn new(archive_type: ArchiveType) -> MetadataBuilder { - MetadataBuilder { archive_type, info: Default::default() } - } - - pub fn info( - &mut self, - name: &str, - value: &str, - ) -> Result<&mut MetadataBuilder> { - if name.len() < 3 { - bail!("info property names must be at least three characters"); - } - self.info.insert(name.to_string(), value.to_string()); - Ok(self) - } - - pub fn build(&mut self) -> Result { - Ok(Metadata { - v: "1".into(), - t: self.archive_type, - i: self.info.clone(), - }) - } -} diff --git a/tufaceous/manifests/fake.toml b/tufaceous/manifests/fake.toml index c3f6404f537..74b1b57147f 100644 --- a/tufaceous/manifests/fake.toml +++ b/tufaceous/manifests/fake.toml @@ -39,8 +39,8 @@ version = "1.0.0" [artifact.control_plane.source] kind = "composite-control-plane" zones = [ - { kind = "fake", name = "zone1.tar.gz", size = "1MiB" }, - { kind = "fake", name = "zone2.tar.gz", size = "1MiB" }, + { kind = "fake", name = "zone1", size = "1MiB" }, + { kind = "fake", name = "zone2", size = "1MiB" }, ] [[artifact.psc_sp]] diff --git a/update-common/Cargo.toml b/update-common/Cargo.toml index 96675d47d70..789f6466825 100644 --- a/update-common/Cargo.toml +++ b/update-common/Cargo.toml @@ -28,6 +28,10 @@ tokio-util.workspace = true tough.workspace = true tufaceous-lib.workspace = true omicron-workspace-hack.workspace = true +omicron-brand-metadata.workspace = true +tar.workspace = true +flate2.workspace = true +fs-err = { workspace = true, features = ["tokio"] } [dev-dependencies] clap.workspace = true diff --git a/update-common/src/artifacts/artifacts_with_plan.rs b/update-common/src/artifacts/artifacts_with_plan.rs index 650efccdfb6..2a7a27a9f66 100644 --- a/update-common/src/artifacts/artifacts_with_plan.rs +++ b/update-common/src/artifacts/artifacts_with_plan.rs @@ -75,6 +75,7 @@ impl ArtifactsWithPlan { pub async fn from_stream( body: impl Stream> + Send, file_name: Option, + zone_mode: ControlPlaneZonesMode, log: &Logger, ) -> Result { // Create a temporary file to store the incoming archive.`` @@ -115,6 +116,7 @@ impl ArtifactsWithPlan { io::BufReader::new(tempfile), file_name, repo_hash, + zone_mode, log, ) .await?; @@ -126,6 +128,7 @@ impl ArtifactsWithPlan { zip_data: T, file_name: Option, repo_hash: ArtifactHash, + zone_mode: ControlPlaneZonesMode, log: &Logger, ) -> Result where @@ -174,8 +177,11 @@ impl ArtifactsWithPlan { // these are just direct copies of artifacts we just unpacked into // `dir`, but we'll also unpack nested artifacts like the 
RoT dual A/B // archives. - let mut builder = - UpdatePlanBuilder::new(artifacts.system_version.clone(), log)?; + let mut builder = UpdatePlanBuilder::new( + artifacts.system_version.clone(), + zone_mode, + log, + )?; // Make a pass through each artifact in the repo. For each artifact, we // do one of the following: @@ -316,6 +322,16 @@ impl ArtifactsWithPlan { } } +#[derive(Debug, Clone, Copy)] +pub enum ControlPlaneZonesMode { + /// Ensure the control plane zones are combined into a single composite + /// `ControlPlane` artifact, used by Wicket. + Composite, + /// Ensure the control plane zones are individual `Zone` artifacts, used + /// by Nexus. + Split, +} + fn unzip_into_tempdir( zip_data: T, log: &Logger, @@ -366,8 +382,12 @@ mod tests { create_fake_archive(&logctx.log, &archive_path).await?; // Now check that it can be read by the archive extractor. - let plan = - build_artifacts_with_plan(&logctx.log, &archive_path).await?; + let plan = build_artifacts_with_plan( + &logctx.log, + &archive_path, + ControlPlaneZonesMode::Composite, + ) + .await?; // Check that all known artifact kinds are present in the map. let by_id_kinds: BTreeSet<_> = plan.by_id().keys().map(|id| id.kind.clone()).collect(); @@ -380,9 +400,12 @@ mod tests { .map(|meta| meta.id.kind.clone()) .collect(); - // `by_id` should contain one entry for every `KnownArtifactKind`... - let mut expected_kinds: BTreeSet<_> = - KnownArtifactKind::iter().map(ArtifactKind::from).collect(); + // `by_id` should contain one entry for every `KnownArtifactKind` + // (except `Zone`)... + let mut expected_kinds: BTreeSet<_> = KnownArtifactKind::iter() + .filter(|k| !matches!(k, KnownArtifactKind::Zone)) + .map(ArtifactKind::from) + .collect(); assert_eq!( expected_kinds, by_id_kinds, "expected kinds match by_id kinds" @@ -441,6 +464,41 @@ mod tests { Ok(()) } + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_extract_fake_split() -> Result<()> { + let logctx = test_setup_log("test_extract_fake"); + let temp_dir = Utf8TempDir::new()?; + let archive_path = temp_dir.path().join("archive.zip"); + + // Create the archive. + create_fake_archive(&logctx.log, &archive_path).await?; + + // Now check that it can be read by the archive extractor in split mode. + let plan = build_artifacts_with_plan( + &logctx.log, + &archive_path, + ControlPlaneZonesMode::Split, + ) + .await?; + + // `by_id` should contain one entry for every `KnownArtifactKind` + // (except `ControlPlane`). + let by_id_kinds: BTreeSet<_> = + plan.by_id().keys().map(|id| id.kind.clone()).collect(); + let expected_kinds: BTreeSet<_> = KnownArtifactKind::iter() + .filter(|k| !matches!(k, KnownArtifactKind::ControlPlane)) + .map(ArtifactKind::from) + .collect(); + assert_eq!( + expected_kinds, by_id_kinds, + "expected kinds match by_id kinds" + ); + + logctx.cleanup_successful(); + + Ok(()) + } + /// Test that the archive generated by running `tufaceous assemble` twice /// has the same artifacts and hashes. #[tokio::test(flavor = "multi_thread", worker_threads = 2)] @@ -451,8 +509,12 @@ mod tests { // Create the archive and build a plan from it. 
create_fake_archive(&logctx.log, &archive_path).await?; - let mut plan1 = - build_artifacts_with_plan(&logctx.log, &archive_path).await?; + let mut plan1 = build_artifacts_with_plan( + &logctx.log, + &archive_path, + ControlPlaneZonesMode::Composite, + ) + .await?; // Add a 2 second delay to ensure that if we bake any second-based // timestamps in, that they end up being different from those in the @@ -461,8 +523,12 @@ mod tests { let archive2_path = temp_dir.path().join("archive2.zip"); create_fake_archive(&logctx.log, &archive2_path).await?; - let mut plan2 = - build_artifacts_with_plan(&logctx.log, &archive2_path).await?; + let mut plan2 = build_artifacts_with_plan( + &logctx.log, + &archive2_path, + ControlPlaneZonesMode::Composite, + ) + .await?; // At the moment, the repo .zip itself doesn't match because it bakes // in timestamps. However, the artifacts inside should match exactly. @@ -499,15 +565,18 @@ mod tests { async fn build_artifacts_with_plan( log: &slog::Logger, archive_path: &Utf8Path, + zone_mode: ControlPlaneZonesMode, ) -> Result { let zip_bytes = std::fs::File::open(&archive_path) .context("error opening archive.zip")?; // We could also compute the hash from the file here, but the repo hash // doesn't matter for the test. let repo_hash = ArtifactHash([0u8; 32]); - let plan = ArtifactsWithPlan::from_zip(zip_bytes, None, repo_hash, log) - .await - .with_context(|| format!("error reading {archive_path}"))?; + let plan = ArtifactsWithPlan::from_zip( + zip_bytes, None, repo_hash, zone_mode, log, + ) + .await + .with_context(|| format!("error reading {archive_path}"))?; Ok(plan) } diff --git a/update-common/src/artifacts/extracted_artifacts.rs b/update-common/src/artifacts/extracted_artifacts.rs index 309b188a9dc..dd5b1edc8ba 100644 --- a/update-common/src/artifacts/extracted_artifacts.rs +++ b/update-common/src/artifacts/extracted_artifacts.rs @@ -73,12 +73,9 @@ impl ExtractedArtifactDataHandle { /// /// This can fail due to I/O errors outside our control (e.g., something /// removed the contents of our temporary directory). - pub async fn file(&self) -> anyhow::Result { + pub async fn file(&self) -> std::io::Result { let path = path_for_artifact(&self.tempdir, &self.hash_id); - - tokio::fs::File::open(&path) - .await - .with_context(|| format!("failed to open {path}")) + fs_err::tokio::File::open(&path).await.map(|file| file.into_parts().0) } /// Async stream to read the contents of this artifact on demand. diff --git a/update-common/src/artifacts/update_plan.rs b/update-common/src/artifacts/update_plan.rs index f5ff081651b..297d66f9868 100644 --- a/update-common/src/artifacts/update_plan.rs +++ b/update-common/src/artifacts/update_plan.rs @@ -11,6 +11,7 @@ use super::ArtifactIdData; use super::Board; +use super::ControlPlaneZonesMode; use super::ExtractedArtifactDataHandle; use super::ExtractedArtifacts; use super::HashingNamedUtf8TempFile; @@ -35,6 +36,8 @@ use std::collections::BTreeMap; use std::collections::HashMap; use std::io; use tokio::io::AsyncReadExt; +use tokio::runtime::Handle; +use tufaceous_lib::ControlPlaneZoneImages; use tufaceous_lib::HostPhaseImages; use tufaceous_lib::RotArchives; @@ -75,6 +78,9 @@ pub struct UpdatePlan { // We also need to send installinator the hash of the control_plane image it // should fetch from us. This is already present in the TUF repository, but // we record it here for use by the update process. 
+ // + // When built with `ControlPlaneZonesMode::Split`, this hash does not + // reference any artifacts in our corresponding `ArtifactsWithPlan`. pub control_plane_hash: ArtifactHash, } @@ -146,12 +152,14 @@ pub struct UpdatePlanBuilder<'a> { // extra fields we use to build the plan extracted_artifacts: ExtractedArtifacts, + zone_mode: ControlPlaneZonesMode, log: &'a Logger, } impl<'a> UpdatePlanBuilder<'a> { pub fn new( system_version: SemverVersion, + zone_mode: ControlPlaneZonesMode, log: &'a Logger, ) -> Result { let extracted_artifacts = ExtractedArtifacts::new(log)?; @@ -181,6 +189,7 @@ impl<'a> UpdatePlanBuilder<'a> { artifacts_meta: Vec::new(), extracted_artifacts, + zone_mode, log, }) } @@ -246,6 +255,12 @@ impl<'a> UpdatePlanBuilder<'a> { ) .await } + KnownArtifactKind::Zone => { + // We don't currently support repos with already split-out + // zones. + self.add_unknown_artifact(artifact_id, artifact_hash, stream) + .await + } } } @@ -265,6 +280,7 @@ impl<'a> UpdatePlanBuilder<'a> { | KnownArtifactKind::Host | KnownArtifactKind::Trampoline | KnownArtifactKind::ControlPlane + | KnownArtifactKind::Zone | KnownArtifactKind::PscRot | KnownArtifactKind::SwitchRot | KnownArtifactKind::GimletRotBootloader @@ -357,6 +373,7 @@ impl<'a> UpdatePlanBuilder<'a> { | KnownArtifactKind::Host | KnownArtifactKind::Trampoline | KnownArtifactKind::ControlPlane + | KnownArtifactKind::Zone | KnownArtifactKind::PscRot | KnownArtifactKind::SwitchRot | KnownArtifactKind::GimletSp @@ -451,6 +468,7 @@ impl<'a> UpdatePlanBuilder<'a> { | KnownArtifactKind::Host | KnownArtifactKind::Trampoline | KnownArtifactKind::ControlPlane + | KnownArtifactKind::Zone | KnownArtifactKind::PscSp | KnownArtifactKind::SwitchSp | KnownArtifactKind::GimletRotBootloader @@ -708,24 +726,34 @@ impl<'a> UpdatePlanBuilder<'a> { )); } - // The control plane artifact is the easiest one: we just need to copy - // it into our tempdir and record it. Nothing to inspect or extract. - let artifact_hash_id = ArtifactHashId { - kind: artifact_id.kind.clone(), - hash: artifact_hash, - }; - - let data = - self.extracted_artifacts.store(artifact_hash_id, stream).await?; - - self.control_plane_hash = Some(data.hash()); + match self.zone_mode { + ControlPlaneZonesMode::Composite => { + // Just copy it into our tempdir and record it. + let artifact_hash_id = ArtifactHashId { + kind: artifact_id.kind.clone(), + hash: artifact_hash, + }; + let data = self + .extracted_artifacts + .store(artifact_hash_id, stream) + .await?; + self.record_extracted_artifact( + artifact_id, + data, + KnownArtifactKind::ControlPlane.into(), + self.log, + )?; + } + ControlPlaneZonesMode::Split => { + // Extract each zone image into its own artifact. + self.extract_control_plane_zones(stream)?; + } + } - self.record_extracted_artifact( - artifact_id, - data, - KnownArtifactKind::ControlPlane.into(), - self.log, - )?; + // Even if we split the control plane artifact, use this as a marker + // that we've seen the artifact before. The hash is meaningless in + // `Split` mode. + self.control_plane_hash = Some(artifact_hash); Ok(()) } @@ -858,6 +886,75 @@ impl<'a> UpdatePlanBuilder<'a> { Ok((image1, image2)) } + /// Helper function for extracting and recording zones out of `stream`, a + /// composite control plane artifact. + /// + /// This code can only be used with multithreaded Tokio executors; see + /// `extract_nested_artifact_pair` for context on `block_in_place`. 
+ fn extract_control_plane_zones( + &mut self, + stream: impl Stream> + Send, + ) -> Result<(), RepositoryError> { + tokio::task::block_in_place(|| { + let stream = std::pin::pin!(stream); + let reader = + tokio_util::io::StreamReader::new(stream.map_err(|error| { + // StreamReader requires a conversion from tough's errors to + // std::io::Error. + std::io::Error::new(io::ErrorKind::Other, error) + })); + let reader = tokio_util::io::SyncIoBridge::new(reader); + self.extract_control_plane_zones_impl(reader) + }) + } + + fn extract_control_plane_zones_impl( + &mut self, + reader: impl io::Read, + ) -> Result<(), RepositoryError> { + ControlPlaneZoneImages::extract_into(reader, |_, reader| { + let mut out = self.extracted_artifacts.new_tempfile()?; + io::copy(reader, &mut out)?; + let data = self + .extracted_artifacts + .store_tempfile(KnownArtifactKind::Zone.into(), out)?; + + // Read the zone name and version from the `oxide.json` at the root + // of the zone. + let data_clone = data.clone(); + let file = Handle::current().block_on(async move { + std::io::Result::Ok(data_clone.file().await?.into_std().await) + })?; + let mut tar = tar::Archive::new(flate2::read::GzDecoder::new(file)); + let metadata = + omicron_brand_metadata::Metadata::read_from_tar(&mut tar)?; + let info = metadata.layer_info()?; + + let artifact_id = ArtifactId { + name: info.pkg.clone(), + version: SemverVersion(info.version.clone()), + kind: KnownArtifactKind::Zone.into(), + }; + self.record_extracted_artifact( + artifact_id, + data, + KnownArtifactKind::Zone.into(), + self.log, + )?; + Ok(()) + }) + .map_err(|error| { + // Fish the original RepositoryError out of this + // anyhow::Error if it is one. + error.downcast().unwrap_or_else(|error| { + RepositoryError::TarballExtract { + kind: KnownArtifactKind::ControlPlane, + error, + } + }) + }) + } + // Record an artifact in `by_id` and `by_hash`, or fail if either already has an // entry for this id/hash. fn record_extracted_artifact( @@ -1180,11 +1277,15 @@ mod tests { use super::*; use bytes::Bytes; + use flate2::{write::GzEncoder, Compression}; use futures::StreamExt; + use omicron_brand_metadata::{ArchiveType, LayerInfo, Metadata}; use omicron_test_utils::dev::test_setup_log; use rand::{distributions::Standard, thread_rng, Rng}; use sha2::{Digest, Sha256}; - use tufaceous_lib::{CompositeEntry, MtimeSource}; + use tufaceous_lib::{ + CompositeControlPlaneArchiveBuilder, CompositeEntry, MtimeSource, + }; fn make_random_bytes() -> Vec { thread_rng().sample_iter(Standard).take(128).collect() @@ -1369,8 +1470,12 @@ mod tests { let logctx = test_setup_log("test_bad_rot_version"); - let mut plan_builder = - UpdatePlanBuilder::new(VERSION_0, &logctx.log).unwrap(); + let mut plan_builder = UpdatePlanBuilder::new( + VERSION_0, + ControlPlaneZonesMode::Composite, + &logctx.log, + ) + .unwrap(); // The control plane artifact can be arbitrary bytes; just populate it // with random data. @@ -1537,9 +1642,12 @@ mod tests { let logctx = test_setup_log("test_multi_rot_version"); - let mut plan_builder = - UpdatePlanBuilder::new("0.0.0".parse().unwrap(), &logctx.log) - .unwrap(); + let mut plan_builder = UpdatePlanBuilder::new( + "0.0.0".parse().unwrap(), + ControlPlaneZonesMode::Composite, + &logctx.log, + ) + .unwrap(); // The control plane artifact can be arbitrary bytes; just populate it // with random data. 
@@ -1722,9 +1830,12 @@ mod tests { let logctx = test_setup_log("test_update_plan_from_artifacts"); - let mut plan_builder = - UpdatePlanBuilder::new("0.0.0".parse().unwrap(), &logctx.log) - .unwrap(); + let mut plan_builder = UpdatePlanBuilder::new( + "0.0.0".parse().unwrap(), + ControlPlaneZonesMode::Composite, + &logctx.log, + ) + .unwrap(); // Add a couple artifacts with kinds wicketd/nexus don't understand; it // should still ingest and serve them. @@ -1919,6 +2030,12 @@ mod tests { assert_eq!(hash_ids.len(), 1); assert_eq!(plan.control_plane_hash, hash_ids[0].hash); } + KnownArtifactKind::Zone => { + unreachable!( + "tufaceous does not yet generate repos \ + with split-out control plane zones" + ); + } KnownArtifactKind::PscSp => { assert!( id.name.starts_with("test-psc-"), @@ -2018,9 +2135,12 @@ mod tests { let logctx = test_setup_log("test_bad_hubris_cabooses"); - let mut plan_builder = - UpdatePlanBuilder::new("0.0.0".parse().unwrap(), &logctx.log) - .unwrap(); + let mut plan_builder = UpdatePlanBuilder::new( + "0.0.0".parse().unwrap(), + ControlPlaneZonesMode::Composite, + &logctx.log, + ) + .unwrap(); let gimlet_rot = make_bad_rot_image("gimlet"); let psc_rot = make_bad_rot_image("psc"); @@ -2096,9 +2216,12 @@ mod tests { // bootloader let logctx = test_setup_log("test_too_many_rot_bootloader"); - let mut plan_builder = - UpdatePlanBuilder::new("0.0.0".parse().unwrap(), &logctx.log) - .unwrap(); + let mut plan_builder = UpdatePlanBuilder::new( + "0.0.0".parse().unwrap(), + ControlPlaneZonesMode::Composite, + &logctx.log, + ) + .unwrap(); let gimlet_rot_bootloader = make_fake_rot_bootloader_image("test-gimlet-a", "test-gimlet-a"); @@ -2159,9 +2282,12 @@ mod tests { let logctx = test_setup_log("test_update_plan_from_artifacts"); - let mut plan_builder = - UpdatePlanBuilder::new("0.0.0".parse().unwrap(), &logctx.log) - .unwrap(); + let mut plan_builder = UpdatePlanBuilder::new( + "0.0.0".parse().unwrap(), + ControlPlaneZonesMode::Composite, + &logctx.log, + ) + .unwrap(); // The control plane artifact can be arbitrary bytes; just populate it // with random data. 
@@ -2320,9 +2446,12 @@ mod tests { let logctx = test_setup_log("test_update_plan_from_artifacts"); - let mut plan_builder = - UpdatePlanBuilder::new("0.0.0".parse().unwrap(), &logctx.log) - .unwrap(); + let mut plan_builder = UpdatePlanBuilder::new( + "0.0.0".parse().unwrap(), + ControlPlaneZonesMode::Composite, + &logctx.log, + ) + .unwrap(); let gimlet_rot = make_random_rot_image("gimlet", "gimlet", "gitc1"); let gimlet2_rot = make_random_rot_image("gimlet", "gimlet", "gitc2"); @@ -2359,6 +2488,80 @@ mod tests { logctx.cleanup_successful(); } + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_split_control_plane() { + const VERSION_0: SemverVersion = SemverVersion::new(0, 0, 0); + + let logctx = test_setup_log("test_split_control_plane"); + + let mut zones = Vec::new(); + for name in ["first", "second"] { + let mut tar = tar::Builder::new(GzEncoder::new( + Vec::new(), + Compression::fast(), + )); + let metadata = Metadata::new(ArchiveType::Layer(LayerInfo { + pkg: name.to_owned(), + version: VERSION_0.0.clone(), + })); + metadata.append_to_tar(&mut tar, 0).unwrap(); + let data = tar.into_inner().unwrap().finish().unwrap(); + zones.push((name, data)); + } + + let mut cp_builder = CompositeControlPlaneArchiveBuilder::new( + Vec::new(), + MtimeSource::Now, + ) + .unwrap(); + for (name, data) in &zones { + cp_builder + .append_zone( + name, + CompositeEntry { data, mtime_source: MtimeSource::Now }, + ) + .unwrap(); + } + let data = Bytes::from(cp_builder.finish().unwrap()); + + let mut plan_builder = UpdatePlanBuilder::new( + VERSION_0, + ControlPlaneZonesMode::Split, + &logctx.log, + ) + .unwrap(); + let hash = ArtifactHash(Sha256::digest(&data).into()); + let id = ArtifactId { + name: "control_plane".into(), + version: VERSION_0, + kind: KnownArtifactKind::ControlPlane.into(), + }; + plan_builder + .add_artifact(id, hash, futures::stream::iter([Ok(data.clone())])) + .await + .unwrap(); + + // All of the artifacts created should be Zones (and notably, not + // ControlPlane). Their artifact hashes should match the calculated hash + // of the zone contents. + for (id, vec) in &plan_builder.by_id { + let content = + &zones.iter().find(|(name, _)| *name == id.name).unwrap().1; + let expected_hash = ArtifactHash(Sha256::digest(content).into()); + assert_eq!(id.version, VERSION_0); + assert_eq!(id.kind, KnownArtifactKind::Zone.into()); + assert_eq!( + vec, + &vec![ArtifactHashId { + kind: KnownArtifactKind::Zone.into(), + hash: expected_hash + }] + ); + } + + logctx.cleanup_successful(); + } + async fn read_to_vec(data: &ExtractedArtifactDataHandle) -> Vec { let mut buf = Vec::with_capacity(data.file_size()); let mut stream = data.reader_stream().await.unwrap(); diff --git a/wicketd/src/update_tracker.rs b/wicketd/src/update_tracker.rs index b1fb5ae326c..43b550a2d8b 100644 --- a/wicketd/src/update_tracker.rs +++ b/wicketd/src/update_tracker.rs @@ -62,6 +62,7 @@ use tokio::task::JoinHandle; use tokio_util::io::StreamReader; use update_common::artifacts::ArtifactIdData; use update_common::artifacts::ArtifactsWithPlan; +use update_common::artifacts::ControlPlaneZonesMode; use update_common::artifacts::UpdatePlan; use update_engine::events::ProgressUnits; use update_engine::AbortHandle; @@ -356,7 +357,9 @@ impl UpdateTracker { stream, // We don't have a good file name here because file contents are // uploaded over stdin, so let ArtifactsWithPlan pick the name. 
- None, &self.log, + None, + ControlPlaneZonesMode::Composite, + &self.log, ) .await .map_err(|error| error.to_http_error())?; diff --git a/wicketd/tests/integration_tests/updates.rs b/wicketd/tests/integration_tests/updates.rs index af3bbfe656a..d333a1104a1 100644 --- a/wicketd/tests/integration_tests/updates.rs +++ b/wicketd/tests/integration_tests/updates.rs @@ -69,9 +69,12 @@ async fn test_updates() { .expect("get_artifacts_and_event_reports succeeded") .into_inner(); - // We should have an artifact for every known artifact kind... - let expected_kinds: BTreeSet<_> = - KnownArtifactKind::iter().map(ArtifactKind::from).collect(); + // We should have an artifact for every known artifact kind (except + // `Zone`)... + let expected_kinds: BTreeSet<_> = KnownArtifactKind::iter() + .filter(|k| !matches!(k, KnownArtifactKind::Zone)) + .map(ArtifactKind::from) + .collect(); // ... and installable artifacts that replace the top level host, // trampoline, and RoT with their inner parts (phase1/phase2 for OS images From 1949d79cd313a19a6b63ee370ca0e241101c7d21 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 3 Feb 2025 15:02:21 -0500 Subject: [PATCH 3/8] [reconfigurator] Move resource allocation out of `BlueprintBuilder` (#7235) Most of this diff is moving stuff around in preparation for a larger rework, but there are some nontrivial changes worth reviewing that I'll call out in comments below. This is staged on top of #7234. Prior to this PR, `BlueprintBuilder` had 3 fields that allocated various kinds of resources: * `sled_ip_allocators` (for allocating underlay IPs) * `external_networking` (for creating OPTE NICs and assigning external IP addresses for Nexus/boundary NTP/external DNS) * `internal_dns_subnets` (for assigning internal DNS zones, which get IPs distinct from the normal sled subnet) All three were lazily instantiated on first use. This is to allow the planner to first expunge zones then start adding; the first "add" that needs a resource from one of these allocators creates the allocator by snapshotting the state of available resources at that point (_after_ expungement). Eventually I'd like to break the builder up to make these two phases more explicit, but for now this PR settles for shrinking the builder's surface area some: * `sled_ip_allocators` is gone; each `SledEditor` now maintains its own underlay IP allocator. These are no longer instantiated lazily, but for underlay IPs specifically this is fine: we never reuse underlay IPs from expunged zones, so we don't need to wait for expungement to reuse resources like the other two. * `external_networking`/`internal_dns_subnets` were replaced by `resource_allocator`, which internally holds both of those types and has pass-through methods for the builder to use. This is still lazily instantiated via `OnceCell` to keep the existing implicit ordering/behavior. 
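A rough before/after sketch of the builder's allocator state (field names come from this description and the imports changed in the diff below; container and lifetime details are simplified guesses, not the exact code):

    // Before: three allocators hung directly off the builder,
    // each lazily created on first use.
    struct BlueprintBuilder<'a> {
        sled_ip_allocators: BTreeMap<SledUuid, IpAllocator>,
        external_networking: OnceCell<BuilderExternalNetworking<'a>>,
        internal_dns_subnets: OnceCell<DnsSubnetAllocator>,
        // ...
    }

    // After: each `SledEditor` owns its sled's underlay IP allocator,
    // and the remaining two live behind one lazily-built type with
    // pass-through methods.
    struct BlueprintBuilder<'a> {
        resource_allocator: OnceCell<BlueprintResourceAllocator>,
        // ...
    }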
--- .../db-queries/src/db/datastore/deployment.rs | 71 +++- .../planning/src/blueprint_builder/builder.rs | 389 ++++++++++++------ .../planning/src/blueprint_builder/mod.rs | 2 - .../planning/src/blueprint_editor.rs | 18 +- .../src/blueprint_editor/allocators.rs | 111 +++++ .../allocators}/external_networking.rs | 304 ++++---------- .../allocators}/internal_dns.rs | 38 +- .../src/blueprint_editor/sled_editor.rs | 94 +++-- .../sled_editor/underlay_ip_allocator.rs | 156 +++++++ nexus/reconfigurator/planning/src/example.rs | 2 +- .../planning/src/ip_allocator.rs | 120 ------ nexus/reconfigurator/planning/src/lib.rs | 1 - nexus/reconfigurator/planning/src/planner.rs | 15 +- .../output/planner_nonprovisionable_1_2.txt | 2 +- .../output/planner_nonprovisionable_2_2a.txt | 2 +- 15 files changed, 799 insertions(+), 526 deletions(-) create mode 100644 nexus/reconfigurator/planning/src/blueprint_editor/allocators.rs rename nexus/reconfigurator/planning/src/{blueprint_builder => blueprint_editor/allocators}/external_networking.rs (78%) rename nexus/reconfigurator/planning/src/{blueprint_builder => blueprint_editor/allocators}/internal_dns.rs (87%) create mode 100644 nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/underlay_ip_allocator.rs delete mode 100644 nexus/reconfigurator/planning/src/ip_allocator.rs diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs index b4a33fbcda4..1610097c231 100644 --- a/nexus/db-queries/src/db/datastore/deployment.rs +++ b/nexus/db-queries/src/db/datastore/deployment.rs @@ -2007,6 +2007,7 @@ mod tests { use nexus_reconfigurator_planning::blueprint_builder::Ensure; use nexus_reconfigurator_planning::blueprint_builder::EnsureMultiple; use nexus_reconfigurator_planning::example::example; + use nexus_reconfigurator_planning::example::ExampleSystemBuilder; use nexus_types::deployment::blueprint_zone_type; use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::BlueprintZoneDisposition; @@ -2899,32 +2900,71 @@ mod tests { #[tokio::test] async fn test_ensure_external_networking_bails_on_bad_target() { + let test_name = "test_ensure_external_networking_bails_on_bad_target"; + // Setup - let logctx = dev::test_setup_log( - "test_ensure_external_networking_bails_on_bad_target", - ); + let logctx = dev::test_setup_log(test_name); let db = TestDatabase::new_with_datastore(&logctx.log).await; let (opctx, datastore) = (db.opctx(), db.datastore()); - // Create an initial empty collection - let collection = CollectionBuilder::new("test").build(); - - let blueprint1 = - create_blueprint_with_external_ip(&datastore, &opctx).await; + // Create two blueprints, both of which have external networking (via 1 + // Nexus zone). + let (example_system, blueprint1) = + ExampleSystemBuilder::new(&opctx.log, test_name) + .nsleds(1) + .nexus_count(1) + .internal_dns_count(0) + .expect("internal DNS count can be 0") + .external_dns_count(0) + .expect("external DNS count can be 0") + .crucible_pantry_count(0) + .build(); let blueprint2 = BlueprintBuilder::new_based_on( &logctx.log, &blueprint1, - &EMPTY_PLANNING_INPUT, - &collection, - "test2", + &example_system.input, + &example_system.collection, + &format!("{test_name}-2"), ) .expect("failed to create builder") .build(); - // Insert both into the blueprint table. + // Insert an IP pool range covering the one Nexus IP. 
+ let nexus_ip = blueprint1 + .all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning) + .find_map(|(_, zone_config)| { + zone_config + .zone_type + .external_networking() + .map(|(ip, _nic)| ip.ip()) + }) + .expect("found external IP"); + let (service_ip_pool, _) = datastore + .ip_pools_service_lookup(&opctx) + .await + .expect("lookup service ip pool"); + datastore + .ip_pool_add_range( + &opctx, + &service_ip_pool, + &IpRange::try_from((nexus_ip, nexus_ip)) + .expect("valid IP range"), + ) + .await + .expect("add range to service IP pool"); + + // Insert both (plus the original parent of blueprint1, internal to + // `ExampleSystemBuilder`) into the blueprint table. + let blueprint0 = example_system.initial_blueprint; + datastore.blueprint_insert(&opctx, &blueprint0).await.unwrap(); datastore.blueprint_insert(&opctx, &blueprint1).await.unwrap(); datastore.blueprint_insert(&opctx, &blueprint2).await.unwrap(); + let bp0_target = BlueprintTarget { + target_id: blueprint0.id, + enabled: true, + time_made_target: now_db_precision(), + }; let bp1_target = BlueprintTarget { target_id: blueprint1.id, enabled: true, @@ -2936,7 +2976,12 @@ mod tests { time_made_target: now_db_precision(), }; - // Set bp1_target as the current target. + // Set bp1_target as the current target (which requires making bp0 the + // target first). + datastore + .blueprint_target_set_current(&opctx, bp0_target) + .await + .unwrap(); datastore .blueprint_target_set_current(&opctx, bp1_target) .await diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index 6e62acfaf24..aad29e924e5 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -4,10 +4,15 @@ //! 
Low-level facility for generating Blueprints +use crate::blueprint_editor::BlueprintResourceAllocator; +use crate::blueprint_editor::BlueprintResourceAllocatorInputError; use crate::blueprint_editor::EditedSled; +use crate::blueprint_editor::ExternalNetworkingChoice; +use crate::blueprint_editor::ExternalNetworkingError; +use crate::blueprint_editor::ExternalSnatNetworkingChoice; +use crate::blueprint_editor::InternalDnsError; use crate::blueprint_editor::SledEditError; use crate::blueprint_editor::SledEditor; -use crate::ip_allocator::IpAllocator; use crate::planner::rng::PlannerRng; use crate::planner::zone_needs_expungement; use crate::planner::ZoneExpungeReason; @@ -15,7 +20,6 @@ use anyhow::anyhow; use anyhow::bail; use anyhow::Context as _; use clickhouse_admin_types::OXIMETER_CLUSTER; -use ipnet::IpAdd; use nexus_inventory::now_db_precision; use nexus_sled_agent_shared::inventory::OmicronZoneDataset; use nexus_sled_agent_shared::inventory::ZoneKind; @@ -45,20 +49,15 @@ use nexus_types::deployment::ZpoolFilter; use nexus_types::deployment::ZpoolName; use nexus_types::external_api::views::SledState; use nexus_types::inventory::Collection; -use omicron_common::address::get_sled_address; -use omicron_common::address::get_switch_zone_address; use omicron_common::address::ReservedRackSubnet; use omicron_common::address::CLICKHOUSE_HTTP_PORT; -use omicron_common::address::CP_SERVICES_RESERVED_ADDRESSES; use omicron_common::address::DNS_HTTP_PORT; use omicron_common::address::DNS_PORT; use omicron_common::address::NTP_PORT; -use omicron_common::address::SLED_RESERVED_ADDRESSES; use omicron_common::api::external::Generation; use omicron_common::api::external::Vni; use omicron_common::api::internal::shared::NetworkInterface; use omicron_common::api::internal::shared::NetworkInterfaceKind; -use omicron_common::policy::INTERNAL_DNS_REDUNDANCY; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SledUuid; @@ -82,17 +81,10 @@ use std::net::SocketAddrV6; use thiserror::Error; use super::clickhouse::ClickhouseAllocator; -use super::external_networking::ensure_input_networking_records_appear_in_parent_blueprint; -use super::external_networking::BuilderExternalNetworking; -use super::external_networking::ExternalNetworkingChoice; -use super::external_networking::ExternalSnatNetworkingChoice; -use super::internal_dns::DnsSubnetAllocator; /// Errors encountered while assembling blueprints #[derive(Debug, Error)] pub enum Error { - #[error("sled {sled_id}: ran out of available addresses for sled")] - OutOfAddresses { sled_id: SledUuid }, #[error( "sled {sled_id}: no available zpools for additional {kind:?} zones" )] @@ -101,14 +93,6 @@ pub enum Error { NoNexusZonesInParentBlueprint, #[error("no Boundary NTP zones exist in parent blueprint")] NoBoundaryNtpZonesInParentBlueprint, - #[error("no external DNS IP addresses are available")] - NoExternalDnsIpAvailable, - #[error("no external service IP addresses are available")] - NoExternalServiceIpAvailable, - #[error("no system MAC addresses are available")] - NoSystemMacAddressAvailable, - #[error("exhausted available OPTE IP addresses for service {kind:?}")] - ExhaustedOpteIps { kind: ZoneKind }, #[error( "invariant violation: found decommissioned sled with \ {num_zones} non-expunged zones: {sled_id}" @@ -119,18 +103,18 @@ pub enum Error { }, #[error("programming error in planner")] Planner(#[source] anyhow::Error), - #[error("no reserved subnets available for DNS")] - NoAvailableDnsSubnets, - #[error("can 
only have {INTERNAL_DNS_REDUNDANCY} internal DNS servers")] - TooManyDnsServers, - #[error("planner produced too many {kind:?} zones")] - TooManyZones { kind: ZoneKind }, #[error("error editing sled {sled_id}")] SledEditError { sled_id: SledUuid, #[source] err: SledEditError, }, + #[error("error constructing resource allocator")] + AllocatorInput(#[from] BlueprintResourceAllocatorInputError), + #[error("error allocating internal DNS subnet")] + AllocateInternalDnsSubnet(#[from] InternalDnsError), + #[error("error allocating external networking resources")] + AllocateExternalNetworking(#[from] ExternalNetworkingError), } /// Describes the result of an idempotent "ensure" operation @@ -362,9 +346,14 @@ pub struct BlueprintBuilder<'a> { // These fields are used to allocate resources for sleds. input: &'a PlanningInput, - sled_ip_allocators: BTreeMap, - external_networking: OnceCell>, - internal_dns_subnets: OnceCell, + + // `allocators` contains logic for choosing new underlay IPs, external IPs, + // internal DNS subnets, etc. It's held in a `OnceCell` to delay its + // creation until it's first needed; the planner expunges zones before + // adding zones, so this delay allows us to reuse resources that just came + // free. (This is implicit and awkward; as we rework the builder we should + // rework this to make it more explicit.) + resource_allocator: OnceCell, // These fields will become part of the final blueprint. See the // corresponding fields in `Blueprint`. @@ -524,23 +513,47 @@ impl<'a> BlueprintBuilder<'a> { generation: Generation::new(), datasets: IdMap::new(), }); - let editor = SledEditor::for_existing( - state, - zones.clone(), - disks, - datasets.clone(), - ) + let editor = match state { + SledState::Active => { + let subnet = input + .sled_lookup(SledFilter::Commissioned, *sled_id) + .with_context(|| { + format!( + "failed to find sled details for \ + active sled in parent blueprint {sled_id}" + ) + })? + .resources + .subnet; + SledEditor::for_existing_active( + subnet, + zones.clone(), + disks, + datasets.clone(), + ) + } + SledState::Decommissioned => { + SledEditor::for_existing_decommissioned( + zones.clone(), + disks, + datasets.clone(), + ) + } + } .with_context(|| { format!("failed to construct SledEditor for sled {sled_id}") })?; + sled_editors.insert(*sled_id, editor); } // Add new, empty `SledEditor`s for any commissioned sleds in our input // that weren't in the parent blueprint. (These are newly-added sleds.) - for sled_id in input.all_sled_ids(SledFilter::Commissioned) { + for (sled_id, details) in input.all_sleds(SledFilter::Commissioned) { if let Entry::Vacant(slot) = sled_editors.entry(sled_id) { - slot.insert(SledEditor::for_new_active()); + slot.insert(SledEditor::for_new_active( + details.resources.subnet, + )); } } @@ -549,9 +562,7 @@ impl<'a> BlueprintBuilder<'a> { parent_blueprint, collection: inventory, input, - sled_ip_allocators: BTreeMap::new(), - external_networking: OnceCell::new(), - internal_dns_subnets: OnceCell::new(), + resource_allocator: OnceCell::new(), sled_editors, cockroachdb_setting_preserve_downgrade: parent_blueprint .cockroachdb_setting_preserve_downgrade, @@ -562,59 +573,33 @@ impl<'a> BlueprintBuilder<'a> { }) } - // Helper method to create a `BuilderExternalNetworking` allocator at the - // point of first use. 
This is to work around some timing issues between the - // planner and the builder: `BuilderExternalNetworking` wants to know what - // resources are in use by running zones, but the planner constructs the - // `BlueprintBuilder` before it expunges zones. We want to wait to look at - // resources are in use until after that expungement happens; this - // implicitly does that by virtue of knowing that the planner won't try to - // allocate new external networking until after it's done expunging and has - // started to provision new zones. - // - // This is gross and fragile. We should clean up the planner/builder split - // soon. - fn external_networking( + fn resource_allocator( &mut self, - ) -> Result<&mut BuilderExternalNetworking<'a>, Error> { - self.external_networking - .get_or_try_init(|| { - // Check the planning input: there shouldn't be any external - // networking resources in the database (the source of `input`) - // that we don't know about from the parent blueprint. - ensure_input_networking_records_appear_in_parent_blueprint( - self.parent_blueprint, - self.input, - )?; - - BuilderExternalNetworking::new( - self.sled_editors.values().flat_map(|editor| { - editor.zones(BlueprintZoneFilter::ShouldBeRunning) - }), - self.sled_editors.values().flat_map(|editor| { - editor.zones(BlueprintZoneFilter::Expunged) - }), - self.input.service_ip_pool_ranges(), - ) - }) - .map_err(Error::Planner)?; - Ok(self.external_networking.get_mut().unwrap()) - } - - // See `external_networking`; this is a similar helper for DNS subnet - // allocation, with deferred creation for the same reasons. - fn internal_dns_subnets( - &mut self, - ) -> Result<&mut DnsSubnetAllocator, Error> { - self.internal_dns_subnets.get_or_try_init(|| { - DnsSubnetAllocator::new( - self.sled_editors.values().flat_map(|editor| { - editor.zones(BlueprintZoneFilter::ShouldBeRunning) - }), + ) -> Result<&mut BlueprintResourceAllocator, Error> { + self.resource_allocator.get_or_try_init(|| { + // Check the planning input: there shouldn't be any external + // networking resources in the database (the source of `input`) + // that we don't know about from the parent blueprint. + // + // TODO-cleanup Should the planner do this instead? Move this check + // to blippy. + ensure_input_networking_records_appear_in_parent_blueprint( + self.parent_blueprint, self.input, ) + .map_err(Error::Planner)?; + + let allocator = BlueprintResourceAllocator::new( + self.sled_editors + .iter() + .map(|(sled_id, editor)| (*sled_id, editor)), + self.input.service_ip_pool_ranges().to_vec(), + self.input.target_internal_dns_zone_count(), + )?; + + Ok::<_, Error>(allocator) })?; - Ok(self.internal_dns_subnets.get_mut().unwrap()) + Ok(self.resource_allocator.get_mut().expect("get_or_init succeeded")) } /// Iterates over the list of sled IDs for which we have zones. 
@@ -1121,7 +1106,8 @@ impl<'a> BlueprintBuilder<'a> { let gz_address_index = self.next_internal_dns_gz_address_index(sled_id); let sled_subnet = self.sled_resources(sled_id)?.subnet; let rack_subnet = ReservedRackSubnet::from_subnet(sled_subnet); - let dns_subnet = self.internal_dns_subnets()?.alloc(rack_subnet)?; + let dns_subnet = + self.resource_allocator()?.next_internal_dns_subnet(rack_subnet)?; let address = dns_subnet.dns_address(); let zpool = self.sled_select_zpool(sled_id, ZoneKind::InternalDns)?; let zone_type = @@ -1153,7 +1139,7 @@ impl<'a> BlueprintBuilder<'a> { nic_ip, nic_subnet, nic_mac, - } = self.external_networking()?.for_new_external_dns()?; + } = self.resource_allocator()?.next_external_ip_external_dns()?; let nic = NetworkInterface { id: self.rng.sled_rng(sled_id).next_network_interface(), kind: NetworkInterfaceKind::Service { id: id.into_untyped_uuid() }, @@ -1357,7 +1343,7 @@ impl<'a> BlueprintBuilder<'a> { nic_ip, nic_subnet, nic_mac, - } = self.external_networking()?.for_new_nexus()?; + } = self.resource_allocator()?.next_external_ip_nexus()?; let external_ip = OmicronZoneExternalFloatingIp { id: self.rng.sled_rng(sled_id).next_external_ip(), ip: external_ip, @@ -1641,7 +1627,7 @@ impl<'a> BlueprintBuilder<'a> { nic_ip, nic_subnet, nic_mac, - } = self.external_networking()?.for_new_boundary_ntp()?; + } = self.resource_allocator()?.next_external_ip_boundary_ntp()?; let external_ip = OmicronZoneExternalSnatIp { id: self.rng.sled_rng(sled_id).next_external_ip(), snat_cfg, @@ -1723,45 +1709,14 @@ impl<'a> BlueprintBuilder<'a> { /// Returns a newly-allocated underlay address suitable for use by Omicron /// zones fn sled_alloc_ip(&mut self, sled_id: SledUuid) -> Result { - let sled_subnet = self.sled_resources(sled_id)?.subnet; - let editor = self.sled_editors.get(&sled_id).ok_or_else(|| { + let editor = self.sled_editors.get_mut(&sled_id).ok_or_else(|| { Error::Planner(anyhow!( - "tried to allocate underlay IP for unknown sled {sled_id}" + "tried to allocate underlay IP on unknown sled {sled_id}" )) })?; - let allocator = - self.sled_ip_allocators.entry(sled_id).or_insert_with(|| { - let sled_subnet_addr = sled_subnet.net().prefix(); - let minimum = sled_subnet_addr - .saturating_add(u128::from(SLED_RESERVED_ADDRESSES)); - let maximum = sled_subnet_addr - .saturating_add(u128::from(CP_SERVICES_RESERVED_ADDRESSES)); - assert!(sled_subnet.net().contains(minimum)); - assert!(sled_subnet.net().contains(maximum)); - let mut allocator = IpAllocator::new(minimum, maximum); - - // We shouldn't need to explicitly reserve the sled's global - // zone and switch addresses because they should be out of our - // range, but we do so just to be sure. - let sled_gz_addr = *get_sled_address(sled_subnet).ip(); - assert!(sled_subnet.net().contains(sled_gz_addr)); - assert!(minimum > sled_gz_addr); - assert!(maximum > sled_gz_addr); - let switch_zone_addr = get_switch_zone_address(sled_subnet); - assert!(sled_subnet.net().contains(switch_zone_addr)); - assert!(minimum > switch_zone_addr); - assert!(maximum > switch_zone_addr); - - // Record each of the sled's zones' underlay IPs as - // allocated. - for z in editor.zones(BlueprintZoneFilter::All) { - allocator.reserve(z.underlay_ip()); - } - - allocator - }); - - allocator.alloc().ok_or(Error::OutOfAddresses { sled_id }) + editor + .alloc_underlay_ip() + .map_err(|err| Error::SledEditError { sled_id, err }) } /// Selects a zpool for this zone type. @@ -1857,12 +1812,173 @@ impl<'a> BlueprintBuilder<'a> { /// ordinarily only come from RSS. 
     ///
     /// TODO-cleanup: Remove when external DNS addresses are in the policy.
-    pub(crate) fn add_external_dns_ip(
+    // This can't be `#[cfg(test)]` because it's used by the `ExampleSystem`
+    // helper (which itself is used by reconfigurator-cli and friends). We give
+    // it a scary name instead.
+    pub(crate) fn inject_untracked_external_dns_ip(
         &mut self,
         addr: IpAddr,
     ) -> Result<(), Error> {
-        self.external_networking()?.add_external_dns_ip(addr)
+        Ok(self.resource_allocator()?.inject_untracked_external_dns_ip(addr)?)
+    }
+}
+
+// Helper to validate that the system hasn't gone off the rails. There should
+// never be any external networking resources in the planning input (which is
+// derived from the contents of CRDB) that we don't know about from the parent
+// blueprint. It's possible a given planning iteration could see such a state
+// if there have been intermediate changes made by other Nexus instances; e.g.,
+//
+// 1. Nexus A generates a `PlanningInput` by reading from CRDB
+// 2. Nexus B executes on a target blueprint that removes IPs/NICs from
+//    CRDB
+// 3. Nexus B regenerates a new blueprint and prunes the zone(s) associated
+//    with the IPs/NICs from step 2
+// 4. Nexus B makes this new blueprint the target
+// 5. Nexus A attempts to run planning with its `PlanningInput` from step 1 but
+//    the target blueprint from step 4; this will fail the following checks
+//    because the input contains records that were removed in step 3
+//
+// We do not need to handle this class of error; it's a transient failure that
+// will clear itself up when Nexus A repeats its planning loop from the top and
+// generates a new `PlanningInput`.
+//
+// There may still be database records corresponding to _expunged_ zones, but
+// that's okay: it just means we haven't yet realized a blueprint where those
+// zones are expunged. And those should still be in the blueprint (not
+// pruned) until their database records are cleaned up.
+//
+// It's also possible that there may be networking records in the database
+// assigned to zones that have been expunged, and our parent blueprint uses
+// those same records for new zones. This is also fine and expected, and is a
+// similar case to the previous paragraph: a zone with networking resources was
+// expunged, the database doesn't realize it yet, but can still move forward and
+// make planning decisions that reuse those resources for new zones.
+pub(super) fn ensure_input_networking_records_appear_in_parent_blueprint( + parent_blueprint: &Blueprint, + input: &PlanningInput, +) -> anyhow::Result<()> { + use nexus_types::deployment::OmicronZoneExternalIp; + use omicron_common::address::DNS_OPTE_IPV4_SUBNET; + use omicron_common::address::DNS_OPTE_IPV6_SUBNET; + use omicron_common::address::NEXUS_OPTE_IPV4_SUBNET; + use omicron_common::address::NEXUS_OPTE_IPV6_SUBNET; + use omicron_common::address::NTP_OPTE_IPV4_SUBNET; + use omicron_common::address::NTP_OPTE_IPV6_SUBNET; + use omicron_common::api::external::MacAddr; + + let mut all_macs: HashSet = HashSet::new(); + let mut all_nexus_nic_ips: HashSet = HashSet::new(); + let mut all_boundary_ntp_nic_ips: HashSet = HashSet::new(); + let mut all_external_dns_nic_ips: HashSet = HashSet::new(); + let mut all_external_ips: HashSet = HashSet::new(); + + // Unlike the construction of the external IP allocator and existing IPs + // constructed above in `BuilderExternalNetworking::new()`, we do not + // check for duplicates here: we could very well see reuse of IPs + // between expunged zones or between expunged -> running zones. + for (_, z) in parent_blueprint.all_omicron_zones(BlueprintZoneFilter::All) { + let zone_type = &z.zone_type; + match zone_type { + BlueprintZoneType::BoundaryNtp(ntp) => { + all_boundary_ntp_nic_ips.insert(ntp.nic.ip); + } + BlueprintZoneType::Nexus(nexus) => { + all_nexus_nic_ips.insert(nexus.nic.ip); + } + BlueprintZoneType::ExternalDns(dns) => { + all_external_dns_nic_ips.insert(dns.nic.ip); + } + _ => (), + } + + if let Some((external_ip, nic)) = zone_type.external_networking() { + // As above, ignore localhost (used by the test suite). + if !external_ip.ip().is_loopback() { + all_external_ips.insert(external_ip); + } + all_macs.insert(nic.mac); + } + } + for external_ip_entry in + input.network_resources().omicron_zone_external_ips() + { + // As above, ignore localhost (used by the test suite). 
+ if external_ip_entry.ip.ip().is_loopback() { + continue; + } + if !all_external_ips.contains(&external_ip_entry.ip) { + bail!( + "planning input contains unexpected external IP \ + (IP not found in parent blueprint): {external_ip_entry:?}" + ); + } + } + for nic_entry in input.network_resources().omicron_zone_nics() { + if !all_macs.contains(&nic_entry.nic.mac) { + bail!( + "planning input contains unexpected NIC \ + (MAC not found in parent blueprint): {nic_entry:?}" + ); + } + match nic_entry.nic.ip { + IpAddr::V4(ip) if NEXUS_OPTE_IPV4_SUBNET.contains(ip) => { + if !all_nexus_nic_ips.contains(&ip.into()) { + bail!( + "planning input contains unexpected NIC \ + (IP not found in parent blueprint): {nic_entry:?}" + ); + } + } + IpAddr::V4(ip) if NTP_OPTE_IPV4_SUBNET.contains(ip) => { + if !all_boundary_ntp_nic_ips.contains(&ip.into()) { + bail!( + "planning input contains unexpected NIC \ + (IP not found in parent blueprint): {nic_entry:?}" + ); + } + } + IpAddr::V4(ip) if DNS_OPTE_IPV4_SUBNET.contains(ip) => { + if !all_external_dns_nic_ips.contains(&ip.into()) { + bail!( + "planning input contains unexpected NIC \ + (IP not found in parent blueprint): {nic_entry:?}" + ); + } + } + IpAddr::V6(ip) if NEXUS_OPTE_IPV6_SUBNET.contains(ip) => { + if !all_nexus_nic_ips.contains(&ip.into()) { + bail!( + "planning input contains unexpected NIC \ + (IP not found in parent blueprint): {nic_entry:?}" + ); + } + } + IpAddr::V6(ip) if NTP_OPTE_IPV6_SUBNET.contains(ip) => { + if !all_boundary_ntp_nic_ips.contains(&ip.into()) { + bail!( + "planning input contains unexpected NIC \ + (IP not found in parent blueprint): {nic_entry:?}" + ); + } + } + IpAddr::V6(ip) if DNS_OPTE_IPV6_SUBNET.contains(ip) => { + if !all_external_dns_nic_ips.contains(&ip.into()) { + bail!( + "planning input contains unexpected NIC \ + (IP not found in parent blueprint): {nic_entry:?}" + ); + } + } + _ => { + bail!( + "planning input contains unexpected NIC \ + (IP not contained in known OPTE subnet): {nic_entry:?}" + ) + } + } } + Ok(()) } #[cfg(test)] @@ -2611,7 +2727,12 @@ pub mod test { let err = builder.sled_add_zone_nexus(sled_id).unwrap_err(); assert!( - matches!(err, Error::NoExternalServiceIpAvailable), + matches!( + err, + Error::AllocateExternalNetworking( + ExternalNetworkingError::NoExternalServiceIpAvailable + ) + ), "unexpected error {err}" ); } diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/mod.rs b/nexus/reconfigurator/planning/src/blueprint_builder/mod.rs index bab6476456d..9fd25391ab2 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/mod.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/mod.rs @@ -6,8 +6,6 @@ mod builder; mod clickhouse; -mod external_networking; -mod internal_dns; pub use builder::*; pub use clickhouse::{ClickhouseAllocator, ClickhouseZonesThatShouldBeRunning}; diff --git a/nexus/reconfigurator/planning/src/blueprint_editor.rs b/nexus/reconfigurator/planning/src/blueprint_editor.rs index 3eb10a194cc..7af22ce2f74 100644 --- a/nexus/reconfigurator/planning/src/blueprint_editor.rs +++ b/nexus/reconfigurator/planning/src/blueprint_editor.rs @@ -6,8 +6,24 @@ //! //! See crate-level documentation for details. 
+mod allocators; mod sled_editor; +pub use allocators::BlueprintResourceAllocatorInputError; +pub use allocators::ExternalNetworkingError; +pub use allocators::InternalDnsError; +pub use allocators::InternalDnsInputError; +pub use sled_editor::DatasetsEditError; +pub use sled_editor::DisksEditError; +pub use sled_editor::DuplicateDiskId; +pub use sled_editor::DuplicateZoneId; +pub use sled_editor::MultipleDatasetsOfKind; +pub use sled_editor::SledEditError; +pub use sled_editor::SledInputError; +pub use sled_editor::ZonesEditError; + +pub(crate) use allocators::BlueprintResourceAllocator; +pub(crate) use allocators::ExternalNetworkingChoice; +pub(crate) use allocators::ExternalSnatNetworkingChoice; pub(crate) use sled_editor::EditedSled; -pub(crate) use sled_editor::SledEditError; pub(crate) use sled_editor::SledEditor; diff --git a/nexus/reconfigurator/planning/src/blueprint_editor/allocators.rs b/nexus/reconfigurator/planning/src/blueprint_editor/allocators.rs new file mode 100644 index 00000000000..b394d5a82b7 --- /dev/null +++ b/nexus/reconfigurator/planning/src/blueprint_editor/allocators.rs @@ -0,0 +1,111 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Blueprint planner resource allocation + +use std::net::IpAddr; + +use super::SledEditor; +use nexus_types::deployment::BlueprintZoneFilter; +use omicron_common::address::DnsSubnet; +use omicron_common::address::IpRange; +use omicron_common::address::ReservedRackSubnet; +use omicron_uuid_kinds::SledUuid; + +mod external_networking; +mod internal_dns; + +pub use self::external_networking::ExternalNetworkingError; +pub use self::internal_dns::InternalDnsError; +pub use self::internal_dns::InternalDnsInputError; + +pub(crate) use self::external_networking::ExternalNetworkingChoice; +pub(crate) use self::external_networking::ExternalSnatNetworkingChoice; + +use self::external_networking::ExternalNetworkingAllocator; +use self::internal_dns::InternalDnsSubnetAllocator; + +#[derive(Debug, thiserror::Error)] +pub enum BlueprintResourceAllocatorInputError { + #[error(transparent)] + InternalDns(#[from] InternalDnsInputError), + #[error("failed to create external networking allocator")] + ExternalNetworking(#[source] anyhow::Error), +} + +#[derive(Debug)] +pub(crate) struct BlueprintResourceAllocator { + external_networking: ExternalNetworkingAllocator, + internal_dns: InternalDnsSubnetAllocator, +} + +impl BlueprintResourceAllocator { + pub fn new<'a, I>( + all_sleds: I, + service_ip_pool_ranges: Vec, + target_internal_dns_redundancy: usize, + ) -> Result + where + I: Iterator + Clone, + { + let internal_dns = InternalDnsSubnetAllocator::new( + all_sleds.clone().flat_map(|(_, editor)| { + editor.zones(BlueprintZoneFilter::ShouldBeRunning) + }), + target_internal_dns_redundancy, + )?; + + let external_networking = ExternalNetworkingAllocator::new( + all_sleds.clone().flat_map(|(_, editor)| { + editor.zones(BlueprintZoneFilter::ShouldBeRunning) + }), + all_sleds.flat_map(|(_, editor)| { + editor.zones(BlueprintZoneFilter::Expunged) + }), + service_ip_pool_ranges, + ) + .map_err(BlueprintResourceAllocatorInputError::ExternalNetworking)?; + + Ok(Self { external_networking, internal_dns }) + } + + pub fn next_internal_dns_subnet( + &mut self, + rack_subnet: ReservedRackSubnet, + ) -> Result { + self.internal_dns.alloc(rack_subnet) + } + + pub(crate) fn next_external_ip_nexus( + &mut 
self, + ) -> Result { + self.external_networking.for_new_nexus() + } + + pub(crate) fn next_external_ip_external_dns( + &mut self, + ) -> Result { + self.external_networking.for_new_external_dns() + } + + pub(crate) fn next_external_ip_boundary_ntp( + &mut self, + ) -> Result { + self.external_networking.for_new_boundary_ntp() + } + + /// Allow a test to manually add an external DNS address, which could + /// ordinarily only come from RSS. + /// + /// TODO-cleanup: Remove when external DNS addresses are in the policy. + // This can't be `#[cfg(test)]` because it's used by the `ExampleSystem` + // helper (which itself is used by reconfigurator-cli and friends). We give + // it a scary name instead. + pub(crate) fn inject_untracked_external_dns_ip( + &mut self, + ip: IpAddr, + ) -> Result<(), ExternalNetworkingError> { + self.external_networking.add_external_dns_ip(ip) + } +} diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/external_networking.rs b/nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs similarity index 78% rename from nexus/reconfigurator/planning/src/blueprint_builder/external_networking.rs rename to nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs index 9657ed9ed89..a63175c1789 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/external_networking.rs +++ b/nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs @@ -2,18 +2,13 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use super::Error; -use anyhow::anyhow; use anyhow::bail; use debug_ignore::DebugIgnore; use nexus_config::NUM_INITIAL_RESERVED_IP_ADDRESSES; use nexus_sled_agent_shared::inventory::ZoneKind; -use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintZoneConfig; -use nexus_types::deployment::BlueprintZoneFilter; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::OmicronZoneExternalIp; -use nexus_types::deployment::PlanningInput; use nexus_types::inventory::SourceNatConfig; use omicron_common::address::IpRange; use omicron_common::address::DNS_OPTE_IPV4_SUBNET; @@ -36,8 +31,22 @@ use std::net::Ipv4Addr; use std::net::Ipv6Addr; use strum::IntoEnumIterator as _; +#[derive(Debug, thiserror::Error)] +pub enum ExternalNetworkingError { + #[error("no external DNS IP addresses are available")] + NoExternalDnsIpAvailable, + #[error("no external service IP addresses are available")] + NoExternalServiceIpAvailable, + #[error("no system MAC addresses are available")] + NoSystemMacAddressAvailable, + #[error("exhausted available OPTE IP addresses for service {kind:?}")] + ExhaustedOpteIps { kind: ZoneKind }, + #[error("attempted to add duplicate external DNS IP: {ip}")] + AddDuplicateExternalDnsIp { ip: IpAddr }, +} + #[derive(Debug)] -pub(super) struct BuilderExternalNetworking<'a> { +pub(super) struct ExternalNetworkingAllocator { // These fields mirror how RSS chooses addresses for zone NICs. 
boundary_ntp_v4_ips: AvailableIterator<'static, Ipv4Addr>, boundary_ntp_v6_ips: AvailableIterator<'static, Ipv6Addr>, @@ -51,17 +60,17 @@ pub(super) struct BuilderExternalNetworking<'a> { available_external_dns_ips: BTreeSet, // Allocator for external IPs for service zones - external_ip_alloc: ExternalIpAllocator<'a>, + external_ip_alloc: ExternalIpAllocator, // Iterator of available MAC addresses in the system address range - available_system_macs: AvailableIterator<'a, MacAddr>, + available_system_macs: AvailableIterator<'static, MacAddr>, } -impl<'a> BuilderExternalNetworking<'a> { +impl ExternalNetworkingAllocator { pub(super) fn new<'b>( running_omicron_zones: impl Iterator, expunged_omicron_zones: impl Iterator, - service_ip_pool_ranges: &'a [IpRange], + service_ip_pool_ranges: Vec, ) -> anyhow::Result { // Scan through the running zones and build several sets of "used // resources". When adding new control plane zones to a sled, we may @@ -259,20 +268,24 @@ impl<'a> BuilderExternalNetworking<'a> { pub(super) fn for_new_nexus( &mut self, - ) -> Result { + ) -> Result { let external_ip = self.external_ip_alloc.claim_next_exclusive_ip()?; let (nic_ip, nic_subnet) = match external_ip { IpAddr::V4(_) => ( self.nexus_v4_ips .next() - .ok_or(Error::ExhaustedOpteIps { kind: ZoneKind::Nexus })? + .ok_or(ExternalNetworkingError::ExhaustedOpteIps { + kind: ZoneKind::Nexus, + })? .into(), IpNet::from(*NEXUS_OPTE_IPV4_SUBNET), ), IpAddr::V6(_) => ( self.nexus_v6_ips .next() - .ok_or(Error::ExhaustedOpteIps { kind: ZoneKind::Nexus })? + .ok_or(ExternalNetworkingError::ExhaustedOpteIps { + kind: ZoneKind::Nexus, + })? .into(), IpNet::from(*NEXUS_OPTE_IPV6_SUBNET), ), @@ -280,7 +293,7 @@ impl<'a> BuilderExternalNetworking<'a> { let nic_mac = self .available_system_macs .next() - .ok_or(Error::NoSystemMacAddressAvailable)?; + .ok_or(ExternalNetworkingError::NoSystemMacAddressAvailable)?; Ok(ExternalNetworkingChoice { external_ip, @@ -292,13 +305,13 @@ impl<'a> BuilderExternalNetworking<'a> { pub(super) fn for_new_boundary_ntp( &mut self, - ) -> Result { + ) -> Result { let snat_cfg = self.external_ip_alloc.claim_next_snat_ip()?; let (nic_ip, nic_subnet) = match snat_cfg.ip { IpAddr::V4(_) => ( self.boundary_ntp_v4_ips .next() - .ok_or(Error::ExhaustedOpteIps { + .ok_or(ExternalNetworkingError::ExhaustedOpteIps { kind: ZoneKind::BoundaryNtp, })? .into(), @@ -307,7 +320,7 @@ impl<'a> BuilderExternalNetworking<'a> { IpAddr::V6(_) => ( self.boundary_ntp_v6_ips .next() - .ok_or(Error::ExhaustedOpteIps { + .ok_or(ExternalNetworkingError::ExhaustedOpteIps { kind: ZoneKind::BoundaryNtp, })? .into(), @@ -317,7 +330,7 @@ impl<'a> BuilderExternalNetworking<'a> { let nic_mac = self .available_system_macs .next() - .ok_or(Error::NoSystemMacAddressAvailable)?; + .ok_or(ExternalNetworkingError::NoSystemMacAddressAvailable)?; Ok(ExternalSnatNetworkingChoice { snat_cfg, @@ -329,17 +342,17 @@ impl<'a> BuilderExternalNetworking<'a> { pub(super) fn for_new_external_dns( &mut self, - ) -> Result { + ) -> Result { let external_ip = self .available_external_dns_ips .pop_first() - .ok_or(Error::NoExternalDnsIpAvailable)?; + .ok_or(ExternalNetworkingError::NoExternalDnsIpAvailable)?; let (nic_ip, nic_subnet) = match external_ip { IpAddr::V4(_) => ( self.external_dns_v4_ips .next() - .ok_or(Error::ExhaustedOpteIps { + .ok_or(ExternalNetworkingError::ExhaustedOpteIps { kind: ZoneKind::ExternalDns, })? 
.into(), @@ -348,7 +361,7 @@ impl<'a> BuilderExternalNetworking<'a> { IpAddr::V6(_) => ( self.external_dns_v6_ips .next() - .ok_or(Error::ExhaustedOpteIps { + .ok_or(ExternalNetworkingError::ExhaustedOpteIps { kind: ZoneKind::ExternalDns, })? .into(), @@ -358,7 +371,7 @@ impl<'a> BuilderExternalNetworking<'a> { let nic_mac = self .available_system_macs .next() - .ok_or(Error::NoSystemMacAddressAvailable)?; + .ok_or(ExternalNetworkingError::NoSystemMacAddressAvailable)?; Ok(ExternalNetworkingChoice { external_ip, @@ -375,181 +388,31 @@ impl<'a> BuilderExternalNetworking<'a> { pub(crate) fn add_external_dns_ip( &mut self, addr: IpAddr, - ) -> Result<(), Error> { - if self.available_external_dns_ips.contains(&addr) { - return Err(Error::Planner(anyhow!( - "external DNS IP address already in use: {addr}" - ))); - } - - self.available_external_dns_ips.insert(addr); - Ok(()) - } -} - -// Helper to validate that the system hasn't gone off the rails. There should -// never be any external networking resources in the planning input (which is -// derived from the contents of CRDB) that we don't know about from the parent -// blueprint. It's possible a given planning iteration could see such a state -// there have been intermediate changes made by other Nexus instances; e.g., -// -// 1. Nexus A generates a `PlanningInput` by reading from CRDB -// 2. Nexus B executes on a target blueprint that removes IPs/NICs from -// CRDB -// 3. Nexus B regenerates a new blueprint and prunes the zone(s) associated -// with the IPs/NICs from step 2 -// 4. Nexus B makes this new blueprint the target -// 5. Nexus A attempts to run planning with its `PlanningInput` from step 1 but -// the target blueprint from step 4; this will fail the following checks -// because the input contains records that were removed in step 3 -// -// We do not need to handle this class of error; it's a transient failure that -// will clear itself up when Nexus A repeats its planning loop from the top and -// generates a new `PlanningInput`. -// -// There may still be database records corresponding to _expunged_ zones, but -// that's okay: it just means we haven't yet realized a blueprint where those -// zones are expunged. And those should should still be in the blueprint (not -// pruned) until their database records are cleaned up. -// -// It's also possible that there may be networking records in the database -// assigned to zones that have been expunged, and our parent blueprint uses -// those same records for new zones. This is also fine and expected, and is a -// similar case to the previous paragraph: a zone with networking resources was -// expunged, the database doesn't realize it yet, but can still move forward and -// make planning decisions that reuse those resources for new zones. -pub(super) fn ensure_input_networking_records_appear_in_parent_blueprint( - parent_blueprint: &Blueprint, - input: &PlanningInput, -) -> anyhow::Result<()> { - let mut all_macs: HashSet = HashSet::new(); - let mut all_nexus_nic_ips: HashSet = HashSet::new(); - let mut all_boundary_ntp_nic_ips: HashSet = HashSet::new(); - let mut all_external_dns_nic_ips: HashSet = HashSet::new(); - let mut all_external_ips: HashSet = HashSet::new(); - - // Unlike the construction of the external IP allocator and existing IPs - // constructed above in `BuilderExternalNetworking::new()`, we do not - // check for duplicates here: we could very well see reuse of IPs - // between expunged zones or between expunged -> running zones. 
- for (_, z) in parent_blueprint.all_omicron_zones(BlueprintZoneFilter::All) { - let zone_type = &z.zone_type; - match zone_type { - BlueprintZoneType::BoundaryNtp(ntp) => { - all_boundary_ntp_nic_ips.insert(ntp.nic.ip); - } - BlueprintZoneType::Nexus(nexus) => { - all_nexus_nic_ips.insert(nexus.nic.ip); - } - BlueprintZoneType::ExternalDns(dns) => { - all_external_dns_nic_ips.insert(dns.nic.ip); - } - _ => (), - } - - if let Some((external_ip, nic)) = zone_type.external_networking() { - // As above, ignore localhost (used by the test suite). - if !external_ip.ip().is_loopback() { - all_external_ips.insert(external_ip); - } - all_macs.insert(nic.mac); - } - } - for external_ip_entry in - input.network_resources().omicron_zone_external_ips() - { - // As above, ignore localhost (used by the test suite). - if external_ip_entry.ip.ip().is_loopback() { - continue; - } - if !all_external_ips.contains(&external_ip_entry.ip) { - bail!( - "planning input contains unexpected external IP \ - (IP not found in parent blueprint): {external_ip_entry:?}" - ); - } - } - for nic_entry in input.network_resources().omicron_zone_nics() { - if !all_macs.contains(&nic_entry.nic.mac) { - bail!( - "planning input contains unexpected NIC \ - (MAC not found in parent blueprint): {nic_entry:?}" - ); - } - match nic_entry.nic.ip { - IpAddr::V4(ip) if NEXUS_OPTE_IPV4_SUBNET.contains(ip) => { - if !all_nexus_nic_ips.contains(&ip.into()) { - bail!( - "planning input contains unexpected NIC \ - (IP not found in parent blueprint): {nic_entry:?}" - ); - } - } - IpAddr::V4(ip) if NTP_OPTE_IPV4_SUBNET.contains(ip) => { - if !all_boundary_ntp_nic_ips.contains(&ip.into()) { - bail!( - "planning input contains unexpected NIC \ - (IP not found in parent blueprint): {nic_entry:?}" - ); - } - } - IpAddr::V4(ip) if DNS_OPTE_IPV4_SUBNET.contains(ip) => { - if !all_external_dns_nic_ips.contains(&ip.into()) { - bail!( - "planning input contains unexpected NIC \ - (IP not found in parent blueprint): {nic_entry:?}" - ); - } - } - IpAddr::V6(ip) if NEXUS_OPTE_IPV6_SUBNET.contains(ip) => { - if !all_nexus_nic_ips.contains(&ip.into()) { - bail!( - "planning input contains unexpected NIC \ - (IP not found in parent blueprint): {nic_entry:?}" - ); - } - } - IpAddr::V6(ip) if NTP_OPTE_IPV6_SUBNET.contains(ip) => { - if !all_boundary_ntp_nic_ips.contains(&ip.into()) { - bail!( - "planning input contains unexpected NIC \ - (IP not found in parent blueprint): {nic_entry:?}" - ); - } - } - IpAddr::V6(ip) if DNS_OPTE_IPV6_SUBNET.contains(ip) => { - if !all_external_dns_nic_ips.contains(&ip.into()) { - bail!( - "planning input contains unexpected NIC \ - (IP not found in parent blueprint): {nic_entry:?}" - ); - } - } - _ => { - bail!( - "planning input contains unexpected NIC \ - (IP not contained in known OPTE subnet): {nic_entry:?}" - ) - } + ) -> Result<(), ExternalNetworkingError> { + if self.available_external_dns_ips.insert(addr) { + Ok(()) + } else { + return Err(ExternalNetworkingError::AddDuplicateExternalDnsIp { + ip: addr, + }); } } - Ok(()) } #[derive(Debug, Clone, Copy)] -pub(super) struct ExternalNetworkingChoice { - pub(super) external_ip: IpAddr, - pub(super) nic_ip: IpAddr, - pub(super) nic_subnet: IpNet, - pub(super) nic_mac: MacAddr, +pub(crate) struct ExternalNetworkingChoice { + pub(crate) external_ip: IpAddr, + pub(crate) nic_ip: IpAddr, + pub(crate) nic_subnet: IpNet, + pub(crate) nic_mac: MacAddr, } #[derive(Debug, Clone, Copy)] -pub(super) struct ExternalSnatNetworkingChoice { - pub(super) snat_cfg: SourceNatConfig, - 
pub(super) nic_ip: IpAddr, - pub(super) nic_subnet: IpNet, - pub(super) nic_mac: MacAddr, +pub(crate) struct ExternalSnatNetworkingChoice { + pub(crate) snat_cfg: SourceNatConfig, + pub(crate) nic_ip: IpAddr, + pub(crate) nic_subnet: IpNet, + pub(crate) nic_mac: MacAddr, } /// Combines a base iterator with an `in_use` set, filtering out any elements @@ -592,24 +455,24 @@ impl Iterator for AvailableIterator<'_, T> { // This struct keeps track of both kinds of IPs used by blueprints, allowing // allocation of either kind. #[derive(Debug)] -pub(super) struct ExternalIpAllocator<'a> { - service_ip_pool_ips: DebugIgnore + 'a>>, +struct ExternalIpAllocator { + service_ip_pool_ips: DebugIgnore>>, used_exclusive_ips: BTreeSet, used_snat_ips: BTreeMap>, } #[derive(Debug, thiserror::Error)] -pub(super) enum ExternalIpAllocatorError { +enum ExternalIpAllocatorError { #[error("duplicate external IP: {0:?}")] DuplicateExternalIp(OmicronZoneExternalIp), #[error("invalid SNAT port range")] InvalidSnatPortRange(#[source] anyhow::Error), } -impl<'a> ExternalIpAllocator<'a> { - pub fn new(service_pool_ranges: &'a [IpRange]) -> Self { +impl ExternalIpAllocator { + fn new(service_pool_ranges: Vec) -> Self { let service_ip_pool_ips = - service_pool_ranges.iter().flat_map(|r| r.iter()); + service_pool_ranges.into_iter().flat_map(|r| r.iter()); Self { service_ip_pool_ips: DebugIgnore(Box::new(service_ip_pool_ips)), used_exclusive_ips: BTreeSet::new(), @@ -617,7 +480,7 @@ impl<'a> ExternalIpAllocator<'a> { } } - pub fn mark_ip_used( + fn mark_ip_used( &mut self, external_ip: &OmicronZoneExternalIp, ) -> Result<(), ExternalIpAllocatorError> { @@ -693,7 +556,9 @@ impl<'a> ExternalIpAllocator<'a> { } } - fn claim_next_exclusive_ip(&mut self) -> Result { + fn claim_next_exclusive_ip( + &mut self, + ) -> Result { for ip in &mut *self.service_ip_pool_ips { if !self.used_snat_ips.contains_key(&ip) && self.used_exclusive_ips.insert(ip) @@ -702,10 +567,12 @@ impl<'a> ExternalIpAllocator<'a> { } } - Err(Error::NoExternalServiceIpAvailable) + Err(ExternalNetworkingError::NoExternalServiceIpAvailable) } - fn claim_next_snat_ip(&mut self) -> Result { + fn claim_next_snat_ip( + &mut self, + ) -> Result { // Prefer reusing an existing SNAT IP, if we still have port ranges // available on that ip. for (ip, used_port_ranges) in self.used_snat_ips.iter_mut() { @@ -732,7 +599,7 @@ impl<'a> ExternalIpAllocator<'a> { } } - Err(Error::NoExternalServiceIpAvailable) + Err(ExternalNetworkingError::NoExternalServiceIpAvailable) } } @@ -903,7 +770,7 @@ pub mod test { }; // Build up the allocator and mark all used IPs. - let mut allocator = ExternalIpAllocator::new(&ip_pool_ranges); + let mut allocator = ExternalIpAllocator::new(ip_pool_ranges); for &ip in &used_exclusive { allocator .mark_ip_used(&as_floating(ip)) @@ -1064,10 +931,10 @@ pub mod test { // Construct a builder; ask for external DNS IPs first (we should get IP // 1 then "none available") then Nexus IPs (we should get IP 2 then // "none available"). 
- let mut builder = BuilderExternalNetworking::new( + let mut builder = ExternalNetworkingAllocator::new( [&running_external_dns].iter().copied(), [&expunged_external_dns].iter().copied(), - std::slice::from_ref(&service_ip_pool), + vec![service_ip_pool], ) .expect("constructed builder"); @@ -1081,7 +948,7 @@ pub mod test { ); let err = builder.for_new_external_dns().expect_err("no DNS IPs left"); assert!( - matches!(err, Error::NoExternalDnsIpAvailable), + matches!(err, ExternalNetworkingError::NoExternalDnsIpAvailable), "unexpected error: {}", InlineErrorChain::new(&err), ); @@ -1093,7 +960,10 @@ pub mod test { ); let err = builder.for_new_nexus().expect_err("no Nexus IPs left"); assert!( - matches!(err, Error::NoExternalServiceIpAvailable), + matches!( + err, + ExternalNetworkingError::NoExternalServiceIpAvailable + ), "unexpected error: {}", InlineErrorChain::new(&err), ); @@ -1101,10 +971,10 @@ pub mod test { // Repeat the above test, but ask for IPs in the reverse order (Nexus // then external DNS). The outcome should be identical, as the IPs are // partitioned off and do not depend on request ordering. - let mut builder = BuilderExternalNetworking::new( + let mut builder = ExternalNetworkingAllocator::new( [&running_external_dns].iter().copied(), [&expunged_external_dns].iter().copied(), - std::slice::from_ref(&service_ip_pool), + vec![service_ip_pool], ) .expect("constructed builder"); @@ -1115,7 +985,10 @@ pub mod test { ); let err = builder.for_new_nexus().expect_err("no Nexus IPs left"); assert!( - matches!(err, Error::NoExternalServiceIpAvailable), + matches!( + err, + ExternalNetworkingError::NoExternalServiceIpAvailable + ), "unexpected error: {}", InlineErrorChain::new(&err), ); @@ -1130,7 +1003,7 @@ pub mod test { ); let err = builder.for_new_external_dns().expect_err("no DNS IPs left"); assert!( - matches!(err, Error::NoExternalDnsIpAvailable), + matches!(err, ExternalNetworkingError::NoExternalDnsIpAvailable), "unexpected error: {}", InlineErrorChain::new(&err), ); @@ -1141,10 +1014,10 @@ pub mod test { // same time). 
let expunged_external_dns2 = make_external_dns(1, BlueprintZoneDisposition::Expunged); - let mut builder = BuilderExternalNetworking::new( + let mut builder = ExternalNetworkingAllocator::new( [&running_external_dns].iter().copied(), [&expunged_external_dns, &expunged_external_dns2].iter().copied(), - std::slice::from_ref(&service_ip_pool), + vec![service_ip_pool], ) .expect("constructed builder"); @@ -1155,7 +1028,10 @@ pub mod test { ); let err = builder.for_new_nexus().expect_err("no Nexus IPs left"); assert!( - matches!(err, Error::NoExternalServiceIpAvailable), + matches!( + err, + ExternalNetworkingError::NoExternalServiceIpAvailable + ), "unexpected error: {}", InlineErrorChain::new(&err), ); @@ -1170,7 +1046,7 @@ pub mod test { ); let err = builder.for_new_external_dns().expect_err("no DNS IPs left"); assert!( - matches!(err, Error::NoExternalDnsIpAvailable), + matches!(err, ExternalNetworkingError::NoExternalDnsIpAvailable), "unexpected error: {}", InlineErrorChain::new(&err), ); diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/internal_dns.rs b/nexus/reconfigurator/planning/src/blueprint_editor/allocators/internal_dns.rs similarity index 87% rename from nexus/reconfigurator/planning/src/blueprint_builder/internal_dns.rs rename to nexus/reconfigurator/planning/src/blueprint_editor/allocators/internal_dns.rs index d0d849ac492..98073095c08 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/internal_dns.rs +++ b/nexus/reconfigurator/planning/src/blueprint_editor/allocators/internal_dns.rs @@ -2,31 +2,41 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use super::Error; use nexus_types::deployment::blueprint_zone_type::InternalDns; use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::BlueprintZoneType; -use nexus_types::deployment::PlanningInput; use omicron_common::address::DnsSubnet; use omicron_common::address::ReservedRackSubnet; use omicron_common::policy::INTERNAL_DNS_REDUNDANCY; use std::collections::BTreeSet; +#[derive(Debug, thiserror::Error)] +pub enum InternalDnsInputError { + #[error("can only have {INTERNAL_DNS_REDUNDANCY} internal DNS servers")] + TooManyDnsServers, +} + +#[derive(Debug, thiserror::Error)] +pub enum InternalDnsError { + #[error("no reserved subnets available for internal DNS")] + NoAvailableDnsSubnets, +} + /// Internal DNS zones are not allocated an address in the sled's subnet. /// Instead, they get a /64 subnet of the "reserved" rack subnet (so that /// it's routable with IPv6), and use the first address in that. There may /// be at most `INTERNAL_DNS_REDUNDANCY` subnets (and so servers) /// allocated. This structure tracks which subnets are currently allocated. 
#[derive(Debug)] -pub struct DnsSubnetAllocator { +pub struct InternalDnsSubnetAllocator { in_use: BTreeSet, } -impl DnsSubnetAllocator { +impl InternalDnsSubnetAllocator { pub fn new<'a>( running_omicron_zones: impl Iterator, - input: &'a PlanningInput, - ) -> Result { + target_redundancy: usize, + ) -> Result { let in_use = running_omicron_zones .filter_map(|zone_config| match zone_config.zone_type { BlueprintZoneType::InternalDns(InternalDns { @@ -37,9 +47,11 @@ impl DnsSubnetAllocator { }) .collect::>(); - let redundancy = input.target_internal_dns_zone_count(); - if redundancy > INTERNAL_DNS_REDUNDANCY { - return Err(Error::TooManyDnsServers); + // TODO-cleanup This is the only reason we take `target_redundancy`; do + // we have another use for it in the future, or should someone else do + // this check? + if target_redundancy > INTERNAL_DNS_REDUNDANCY { + return Err(InternalDnsInputError::TooManyDnsServers); } Ok(Self { in_use }) @@ -53,7 +65,7 @@ impl DnsSubnetAllocator { pub fn alloc( &mut self, rack_subnet: ReservedRackSubnet, - ) -> Result { + ) -> Result { let new = if let Some(first) = self.in_use.first() { // Take the first available DNS subnet. We currently generate // all `INTERNAL_DNS_REDUNDANCY` subnets and subtract any @@ -66,7 +78,7 @@ impl DnsSubnetAllocator { if let Some(first) = avail.next() { *first } else { - return Err(Error::NoAvailableDnsSubnets); + return Err(InternalDnsError::NoAvailableDnsSubnets); } } else { rack_subnet.get_dns_subnet(1) @@ -150,11 +162,11 @@ pub mod test { verify_blueprint(&blueprint1); // Create an allocator. - let mut allocator = DnsSubnetAllocator::new( + let mut allocator = InternalDnsSubnetAllocator::new( blueprint1 .all_omicron_zones(BlueprintZoneFilter::ShouldBeRunning) .map(|(_sled_id, zone_config)| zone_config), - &example.input, + example.input.target_internal_dns_zone_count(), ) .expect("can't create allocator"); diff --git a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs index 7cd09ef962f..8ba56613692 100644 --- a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs +++ b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs @@ -20,15 +20,21 @@ use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::BlueprintZonesConfig; use nexus_types::deployment::DiskFilter; use nexus_types::external_api::views::SledState; +use omicron_common::address::Ipv6Subnet; +use omicron_common::address::SLED_PREFIX; use omicron_common::disk::DatasetKind; use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::ZpoolUuid; +use std::iter; use std::mem; +use std::net::Ipv6Addr; +use underlay_ip_allocator::SledUnderlayIpAllocator; mod datasets; mod disks; +mod underlay_ip_allocator; mod zones; pub use self::datasets::DatasetsEditError; @@ -102,6 +108,8 @@ pub enum SledEditError { zpool ({zpool}) is not present in this sled's disks" )] ZoneOnNonexistentZpool { zone_id: OmicronZoneUuid, zpool: ZpoolName }, + #[error("ran out of underlay IP addresses")] + OutOfUnderlayIps, } #[derive(Debug)] @@ -118,32 +126,33 @@ enum InnerSledEditor { } impl SledEditor { - pub fn for_existing( - state: SledState, + pub fn for_existing_active( + subnet: Ipv6Subnet, zones: BlueprintZonesConfig, disks: BlueprintPhysicalDisksConfig, datasets: BlueprintDatasetsConfig, ) -> Result { - match state { - SledState::Active => { - let inner = ActiveSledEditor::new(zones, 
disks, datasets)?; - Ok(Self(InnerSledEditor::Active(inner))) - } - SledState::Decommissioned => { - let inner = EditedSled { - state, - zones, - disks, - datasets, - edit_counts: SledEditCounts::zeroes(), - }; - Ok(Self(InnerSledEditor::Decommissioned(inner))) - } - } + let inner = ActiveSledEditor::new(subnet, zones, disks, datasets)?; + Ok(Self(InnerSledEditor::Active(inner))) + } + + pub fn for_existing_decommissioned( + zones: BlueprintZonesConfig, + disks: BlueprintPhysicalDisksConfig, + datasets: BlueprintDatasetsConfig, + ) -> Result { + let inner = EditedSled { + state: SledState::Decommissioned, + zones, + disks, + datasets, + edit_counts: SledEditCounts::zeroes(), + }; + Ok(Self(InnerSledEditor::Decommissioned(inner))) } - pub fn for_new_active() -> Self { - Self(InnerSledEditor::Active(ActiveSledEditor::new_empty())) + pub fn for_new_active(subnet: Ipv6Subnet) -> Self { + Self(InnerSledEditor::Active(ActiveSledEditor::new_empty(subnet))) } pub fn finalize(self) -> EditedSled { @@ -179,7 +188,9 @@ impl SledEditor { // below, we'll be left in the active state with an empty sled // editor), but omicron in general is not panic safe and aborts // on panic. Plus `finalize()` should never panic. - let mut stolen = ActiveSledEditor::new_empty(); + let mut stolen = ActiveSledEditor::new_empty(Ipv6Subnet::new( + Ipv6Addr::LOCALHOST, + )); mem::swap(editor, &mut stolen); let mut finalized = stolen.finalize(); @@ -192,6 +203,12 @@ impl SledEditor { Ok(()) } + pub fn alloc_underlay_ip(&mut self) -> Result { + self.as_active_mut()? + .alloc_underlay_ip() + .ok_or(SledEditError::OutOfUnderlayIps) + } + pub fn disks( &self, filter: DiskFilter, @@ -283,6 +300,7 @@ impl SledEditor { #[derive(Debug)] struct ActiveSledEditor { + underlay_ip_allocator: SledUnderlayIpAllocator, zones: ZonesEditor, disks: DisksEditor, datasets: DatasetsEditor, @@ -299,19 +317,40 @@ pub(crate) struct EditedSled { impl ActiveSledEditor { pub fn new( + subnet: Ipv6Subnet, zones: BlueprintZonesConfig, disks: BlueprintPhysicalDisksConfig, datasets: BlueprintDatasetsConfig, ) -> Result { + let zones = ZonesEditor::from(zones); + + // We never reuse underlay IPs within a sled, regardless of zone + // dispositions. If a zone has been fully removed from the blueprint + // some time after expungement, we may reuse its IP; reconfigurator must + // know that's safe prior to pruning the expunged zone. + let zone_ips = + zones.zones(BlueprintZoneFilter::All).map(|z| z.underlay_ip()); + Ok(Self { - zones: zones.into(), + underlay_ip_allocator: SledUnderlayIpAllocator::new( + subnet, zone_ips, + ), + zones, disks: disks.try_into()?, datasets: DatasetsEditor::new(datasets)?, }) } - pub fn new_empty() -> Self { + pub fn new_empty(subnet: Ipv6Subnet) -> Self { + // Creating the underlay IP allocator can only fail if we have a zone + // with an IP outside the sled subnet, but we don't have any zones at + // all, so this can't fail. Match explicitly to guard against this error + // turning into an enum and getting new variants we'd need to check. 
+ let underlay_ip_allocator = + SledUnderlayIpAllocator::new(subnet, iter::empty()); + Self { + underlay_ip_allocator, zones: ZonesEditor::empty(), disks: DisksEditor::empty(), datasets: DatasetsEditor::empty(), @@ -361,6 +400,10 @@ impl ActiveSledEditor { } } + pub fn alloc_underlay_ip(&mut self) -> Option { + self.underlay_ip_allocator.alloc() + } + pub fn disks( &self, filter: DiskFilter, @@ -415,6 +458,11 @@ impl ActiveSledEditor { // Ensure we can construct the configs for the datasets for this zone. let datasets = ZoneDatasetConfigs::new(&self.disks, &zone)?; + // This zone's underlay IP should have come from us (via + // `alloc_underlay_ip()`), but in case it wasn't, ensure we don't hand + // out this IP again later. + self.underlay_ip_allocator.mark_as_allocated(zone.underlay_ip()); + // Actually add the zone and its datasets. self.zones.add_zone(zone)?; datasets.ensure_in_service(&mut self.datasets, rng); diff --git a/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/underlay_ip_allocator.rs b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/underlay_ip_allocator.rs new file mode 100644 index 00000000000..9586497d443 --- /dev/null +++ b/nexus/reconfigurator/planning/src/blueprint_editor/sled_editor/underlay_ip_allocator.rs @@ -0,0 +1,156 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Allocator for zone underlay IP addresses with a single sled's subnet. + +use ipnet::IpAdd; +use omicron_common::address::get_sled_address; +use omicron_common::address::get_switch_zone_address; +use omicron_common::address::Ipv6Subnet; +use omicron_common::address::CP_SERVICES_RESERVED_ADDRESSES; +use omicron_common::address::SLED_PREFIX; +use omicron_common::address::SLED_RESERVED_ADDRESSES; +use std::net::Ipv6Addr; + +/// Very simple allocator for picking addresses from a sled's subnet +/// +/// The current implementation takes the max address seen so far and uses the +/// next one. This will never reuse old IPs. That avoids a bunch of +/// operational issues. It does mean we will eventually run out of IPs. But we +/// do have a big space right now (2^16). +// This overlaps with the bump allocator that's used in RSS. That one is not +// general enough to use here, though this one could potentially be used there. +#[derive(Debug)] +pub(crate) struct SledUnderlayIpAllocator { + last: Ipv6Addr, + maximum: Ipv6Addr, +} + +impl SledUnderlayIpAllocator { + /// Create a new allocator for the given sled subnet that reserves all the + /// specified IPs. + /// + /// Fails if any of the specified IPs are not part of the sled subnet. + pub fn new(sled_subnet: Ipv6Subnet, in_use_ips: I) -> Self + where + I: Iterator, + { + let sled_subnet_addr = sled_subnet.net().prefix(); + let minimum = sled_subnet_addr + .saturating_add(u128::from(SLED_RESERVED_ADDRESSES)); + let maximum = sled_subnet_addr + .saturating_add(u128::from(CP_SERVICES_RESERVED_ADDRESSES)); + assert!(maximum > minimum); + + // We shouldn't need to explicitly reserve the sled's global + // zone and switch addresses because they should be out of our + // range, but we do so just to be sure. 
+ let sled_gz_addr = *get_sled_address(sled_subnet).ip(); + assert!(sled_subnet.net().contains(sled_gz_addr)); + assert!(minimum > sled_gz_addr); + let switch_zone_addr = get_switch_zone_address(sled_subnet); + assert!(sled_subnet.net().contains(switch_zone_addr)); + assert!(minimum > switch_zone_addr); + assert!(sled_subnet.net().contains(minimum)); + assert!(sled_subnet.net().contains(maximum)); + + let mut slf = Self { last: minimum, maximum }; + for ip in in_use_ips { + slf.mark_as_allocated(ip); + } + assert!(minimum <= slf.last); + assert!(slf.last < slf.maximum); + + slf + } + + /// Mark an address as used. + /// + /// Marking an address that has already been handed out by this allocator + /// (or could have been handed out by this allocator) is allowed and does + /// nothing. + /// + /// Marking an address that is outside the range of this sled does nothing. + /// E.g., RSS currently allocates IPs from within the + /// `SLED_RESERVED_ADDRESSES` range, and internal DNS zone IPs are outside + /// the sled subnet entirely. IPs from these unexpected ranges are ignored. + pub fn mark_as_allocated(&mut self, ip: Ipv6Addr) { + if ip < self.maximum && ip > self.last { + self.last = ip; + } + } + + /// Allocate an unused address from this allocator's range + pub fn alloc(&mut self) -> Option { + let next = self.last.saturating_add(1); + if next == self.last { + // We ran out of the entire IPv6 address space. + return None; + } + + if next >= self.maximum { + // We ran out of our allotted range. + return None; + } + + self.last = next; + Some(next) + } +} + +#[cfg(test)] +mod test { + use std::collections::BTreeSet; + + use super::*; + + #[test] + fn test_basic() { + let sled_subnet = Ipv6Subnet::new("fd00::d0".parse().unwrap()); + let reserved: Vec = vec![ + "fd00::50".parse().unwrap(), + "fd00::d3".parse().unwrap(), + "fd00::d7".parse().unwrap(), + ]; + let reserved_ips = reserved.iter().copied().collect::>(); + + let mut allocator = + SledUnderlayIpAllocator::new(sled_subnet, reserved.iter().copied()); + + let mut allocated = Vec::new(); + for _ in 0..16 { + let addr = allocator.alloc().expect("allocated IP"); + println!("allocated: {addr}"); + assert!(!reserved_ips.contains(&addr)); + assert!(!allocated.contains(&addr)); + allocated.push(addr); + } + + assert_eq!( + allocated, + [ + // Because fd00::d7 was reserved, everything up to it is also + // skipped. It doesn't have to work that way, but it currently + // does. + "fd00::d8".parse::().unwrap(), + "fd00::d9".parse().unwrap(), + "fd00::da".parse().unwrap(), + "fd00::db".parse().unwrap(), + "fd00::dc".parse().unwrap(), + "fd00::dd".parse().unwrap(), + "fd00::de".parse().unwrap(), + "fd00::df".parse().unwrap(), + "fd00::e0".parse().unwrap(), + "fd00::e1".parse().unwrap(), + "fd00::e2".parse().unwrap(), + "fd00::e3".parse().unwrap(), + "fd00::e4".parse().unwrap(), + "fd00::e5".parse().unwrap(), + "fd00::e6".parse().unwrap(), + "fd00::e7".parse().unwrap(), + ] + .to_vec() + ); + } +} diff --git a/nexus/reconfigurator/planning/src/example.rs b/nexus/reconfigurator/planning/src/example.rs index 2b235d85499..0431d356d3d 100644 --- a/nexus/reconfigurator/planning/src/example.rs +++ b/nexus/reconfigurator/planning/src/example.rs @@ -405,7 +405,7 @@ impl ExampleSystemBuilder { // pick addresses in the TEST-NET-2 (RFC 5737) range. 
for i in 0..self.external_dns_count.0 { builder - .add_external_dns_ip(IpAddr::V4(Ipv4Addr::new( + .inject_untracked_external_dns_ip(IpAddr::V4(Ipv4Addr::new( 198, 51, 100, diff --git a/nexus/reconfigurator/planning/src/ip_allocator.rs b/nexus/reconfigurator/planning/src/ip_allocator.rs deleted file mode 100644 index a32fe936af8..00000000000 --- a/nexus/reconfigurator/planning/src/ip_allocator.rs +++ /dev/null @@ -1,120 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Very simple allocator for picking addresses from a sled's subnet - -use ipnet::IpAdd; -use std::net::Ipv6Addr; - -/// Very simple allocator for picking addresses from a sled's subnet -/// -/// The current implementation takes the max address seen so far and uses the -/// next one. This will never reuse old IPs. That avoids a bunch of -/// operational issues. It does mean we will eventually run out of IPs. But we -/// do have a big space right now (2^16). -// This overlaps with the bump allocator that's used in RSS. That one is not -// general enough to use here, though this one could potentially be used there. -pub struct IpAllocator { - last: Ipv6Addr, - maximum: Ipv6Addr, -} - -impl IpAllocator { - /// Make an allocator that allocates addresses from the range `(minimum, - /// maximum)` (exclusive). - pub fn new(minimum: Ipv6Addr, maximum: Ipv6Addr) -> IpAllocator { - IpAllocator { last: minimum, maximum } - } - - /// Mark the given address reserved so that it will never be returned by - /// `alloc()`. - /// - /// The given address can be outside the range provided to - /// `IpAllocator::new()`, in which case this reservation will be ignored. - pub fn reserve(&mut self, addr: Ipv6Addr) { - if addr < self.maximum && addr > self.last { - self.last = addr; - } - } - - /// Allocate an unused address from this allocator's range - pub fn alloc(&mut self) -> Option { - let next = self.last.saturating_add(1); - if next == self.last { - // We ran out of the entire IPv6 address space. - return None; - } - - if next >= self.maximum { - // We ran out of our allotted range. - return None; - } - - self.last = next; - Some(next) - } -} - -#[cfg(test)] -mod test { - use super::IpAllocator; - use std::collections::BTreeSet; - use std::net::Ipv6Addr; - - #[test] - fn test_basic() { - let range_start: Ipv6Addr = "fd00::d0".parse().unwrap(); - let range_end: Ipv6Addr = "fd00::e8".parse().unwrap(); - let reserved: BTreeSet = [ - // These first two are deliberately out of range. - "fd00::ff".parse().unwrap(), - "fd00::c0".parse().unwrap(), - "fd00::d3".parse().unwrap(), - "fd00::d7".parse().unwrap(), - ] - .iter() - .copied() - .collect(); - - let mut allocator = IpAllocator::new(range_start, range_end); - for r in &reserved { - allocator.reserve(*r); - } - - let mut allocated = BTreeSet::new(); - while let Some(addr) = allocator.alloc() { - println!("allocated: {}", addr); - assert!(!reserved.contains(&addr)); - assert!(!allocated.contains(&addr)); - allocated.insert(addr); - } - - assert_eq!( - allocated, - [ - // Because d7 was reserved, everything up to it is also skipped. - // It doesn't have to work that way, but it currently does. 
- "fd00::d8".parse().unwrap(), - "fd00::d9".parse().unwrap(), - "fd00::da".parse().unwrap(), - "fd00::db".parse().unwrap(), - "fd00::dc".parse().unwrap(), - "fd00::dd".parse().unwrap(), - "fd00::de".parse().unwrap(), - "fd00::df".parse().unwrap(), - "fd00::e0".parse().unwrap(), - "fd00::e1".parse().unwrap(), - "fd00::e2".parse().unwrap(), - "fd00::e3".parse().unwrap(), - "fd00::e4".parse().unwrap(), - "fd00::e5".parse().unwrap(), - "fd00::e6".parse().unwrap(), - "fd00::e7".parse().unwrap(), - ] - .iter() - .copied() - .collect() - ); - } -} diff --git a/nexus/reconfigurator/planning/src/lib.rs b/nexus/reconfigurator/planning/src/lib.rs index f6c521c0f82..8262380075c 100644 --- a/nexus/reconfigurator/planning/src/lib.rs +++ b/nexus/reconfigurator/planning/src/lib.rs @@ -9,6 +9,5 @@ pub mod blueprint_builder; pub mod blueprint_editor; pub mod example; -mod ip_allocator; pub mod planner; pub mod system; diff --git a/nexus/reconfigurator/planning/src/planner.rs b/nexus/reconfigurator/planning/src/planner.rs index f2084671e53..a6965e4d6dd 100644 --- a/nexus/reconfigurator/planning/src/planner.rs +++ b/nexus/reconfigurator/planning/src/planner.rs @@ -825,6 +825,7 @@ mod test { use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::ZpoolUuid; + use slog_error_chain::InlineErrorChain; use std::collections::BTreeSet; use std::collections::HashMap; use std::net::IpAddr; @@ -1240,7 +1241,7 @@ mod test { { Ok(_) => panic!("unexpected success"), Err(err) => { - let err = format!("{err:#}"); + let err = InlineErrorChain::new(&err).to_string(); assert!( err.contains("can only have ") && err.contains(" internal DNS servers"), @@ -1474,7 +1475,7 @@ mod test { .expect("can't parse external DNS IP address") }); for addr in external_dns_ips { - blueprint_builder.add_external_dns_ip(addr).unwrap(); + blueprint_builder.inject_untracked_external_dns_ip(addr).unwrap(); } // Now we can add external DNS zones. We'll add two to the first @@ -2102,6 +2103,16 @@ mod test { zone.disposition = BlueprintZoneDisposition::Expunged; } + // Similarly, a sled can only have gotten into the `Decommissioned` + // state via blueprints. If the database says the sled is + // decommissioned but the parent blueprint says it's still active, + // that's an invalid state that the planner will reject. 
+ *blueprint1 + .sled_state + .get_mut(sled_id) + .expect("found state in parent blueprint") = + SledState::Decommissioned; + *sled_id }; println!("1 -> 2: decommissioned {decommissioned_sled_id}"); diff --git a/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_1_2.txt b/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_1_2.txt index c2458505254..e7357e5bbfa 100644 --- a/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_1_2.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_1_2.txt @@ -200,7 +200,7 @@ to: blueprint 9f71f5d3-a272-4382-9154-6ea2e171a6c6 └─ + expunged - sled 68d24ac5-f341-49ea-a92a-0381b52ab387 (active): + sled 68d24ac5-f341-49ea-a92a-0381b52ab387 (decommissioned): physical disks from generation 2: ---------------------------------------------------------------------- diff --git a/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_2_2a.txt b/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_2_2a.txt index fbba79e9afe..f070fc076cd 100644 --- a/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_2_2a.txt +++ b/nexus/reconfigurator/planning/tests/output/planner_nonprovisionable_2_2a.txt @@ -185,7 +185,7 @@ to: blueprint 9f71f5d3-a272-4382-9154-6ea2e171a6c6 REMOVED SLEDS: - sled 68d24ac5-f341-49ea-a92a-0381b52ab387 (was active): + sled 68d24ac5-f341-49ea-a92a-0381b52ab387 (state was unknown): omicron zones from generation 2: --------------------------------------------------------------------------------------------- From 26d8e5007e39fb530ce67ebd44621d2d6ff54ae9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Feb 2025 23:05:33 +0000 Subject: [PATCH 4/8] Bump openssl from 0.10.66 to 0.10.70 (#7464) --- Cargo.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 41dacdffcb2..82c9c792eb5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7717,9 +7717,9 @@ dependencies = [ [[package]] name = "openssl" -version = "0.10.66" +version = "0.10.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1" +checksum = "61cfb4e166a8bb8c9b55c500bc2308550148ece889be90f609377e58140f42c6" dependencies = [ "bitflags 2.6.0", "cfg-if", @@ -7749,9 +7749,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.104" +version = "0.9.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45abf306cbf99debc8195b66b7346498d7b10c210de50418b5ccd7ceba08c741" +checksum = "8b22d5b84be05a8d6947c7cb71f7c849aa0f112acd4bf51c2a7c1c988ac0a9dc" dependencies = [ "cc", "libc", From 2624bae900274812f25b3296e675252b402eb105 Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Mon, 3 Feb 2025 15:25:03 -0800 Subject: [PATCH 5/8] releng: allow pinning Helios (#7469) Closes #7332. This defines a new file, **tools/pins.toml**, and an associated dev-tools crate, **omicron-pins**, which is intended to eventually replace the various commit/checksum entries scattered across the tools directory. 
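For reference, on a release branch the pin would be expressed by uncommenting the `[helios]` table in tools/pins.toml. A minimal sketch of that form is shown below; the commit hash and incorporation version are placeholders for illustration only, not real pins:

```toml
# tools/pins.toml -- placeholder values, not a real pin
[helios]
# Commit of helios.git that the releng preflight check requires the checkout to match
commit = "0000000000000000000000000000000000000000"
# Version of the omicron-release-incorporation package passed to the image build
incorporation = "1.0.99999"
```

Both keys map directly onto the `Helios` struct deserialized by the new omicron-pins crate: `commit` is compared against `git rev-parse HEAD` of the Helios checkout during preflight, and `incorporation` is appended to the image command's `extra_packages` override.
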
TOML is selected for its trivial Serde deserializability, Rust support for automatic editing of the file without destroying comments, and its built-in support in the Nix language, so we can easily continue supporting our download tools and the Nix development environment flake as we transition those pins over to pins.toml. I expect `Pins::read_from_dir` will grow functionality for reading some commit refs from Cargo.toml or package-manifest.toml to avoid needing to update commits in several places where applicable. That is not relevant for Helios but will help for other tools we pin. --- Cargo.lock | 14 ++++ Cargo.toml | 3 + dev-tools/pins/Cargo.toml | 17 +++++ dev-tools/pins/src/lib.rs | 38 ++++++++++ dev-tools/releng/Cargo.toml | 1 + dev-tools/releng/src/main.rs | 130 +++++++++++++++++++++++++---------- tools/pins.toml | 4 ++ 7 files changed, 170 insertions(+), 37 deletions(-) create mode 100644 dev-tools/pins/Cargo.toml create mode 100644 dev-tools/pins/src/lib.rs create mode 100644 tools/pins.toml diff --git a/Cargo.lock b/Cargo.lock index 82c9c792eb5..d06b2ee5320 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7255,6 +7255,19 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "omicron-pins" +version = "0.1.0" +dependencies = [ + "anyhow", + "camino", + "cargo_metadata", + "fs-err", + "omicron-workspace-hack", + "serde", + "toml_edit 0.22.22", +] + [[package]] name = "omicron-releng" version = "0.1.0" @@ -7269,6 +7282,7 @@ dependencies = [ "futures", "hex", "omicron-common", + "omicron-pins", "omicron-workspace-hack", "omicron-zone-package", "once_cell", diff --git a/Cargo.toml b/Cargo.toml index b0505ae695e..7fa49f6298f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,6 +42,7 @@ members = [ "dev-tools/openapi-manager", "dev-tools/openapi-manager/types", "dev-tools/oxlog", + "dev-tools/pins", "dev-tools/reconfigurator-cli", "dev-tools/releng", "dev-tools/xtask", @@ -177,6 +178,7 @@ default-members = [ "dev-tools/openapi-manager", "dev-tools/openapi-manager/types", "dev-tools/oxlog", + "dev-tools/pins", "dev-tools/reconfigurator-cli", "dev-tools/releng", # Do not include xtask in the list of default members, because this causes @@ -506,6 +508,7 @@ omicron-nexus = { path = "nexus" } omicron-omdb = { path = "dev-tools/omdb" } omicron-package = { path = "package" } omicron-passwords = { path = "passwords" } +omicron-pins = { path = "dev-tools/pins" } omicron-rpaths = { path = "rpaths" } omicron-sled-agent = { path = "sled-agent" } omicron-test-utils = { path = "test-utils" } diff --git a/dev-tools/pins/Cargo.toml b/dev-tools/pins/Cargo.toml new file mode 100644 index 00000000000..227c7f1b281 --- /dev/null +++ b/dev-tools/pins/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "omicron-pins" +version = "0.1.0" +edition = "2021" +license = "MPL-2.0" + +[dependencies] +anyhow.workspace = true +camino.workspace = true +cargo_metadata.workspace = true +fs-err.workspace = true +omicron-workspace-hack.workspace = true +serde.workspace = true +toml_edit.workspace = true + +[lints] +workspace = true diff --git a/dev-tools/pins/src/lib.rs b/dev-tools/pins/src/lib.rs new file mode 100644 index 00000000000..a82c1141378 --- /dev/null +++ b/dev-tools/pins/src/lib.rs @@ -0,0 +1,38 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +use anyhow::{Context, Result}; +use camino::Utf8Path; +use serde::Deserialize; + +#[derive(Debug, Clone, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct Pins { + pub helios: Option, +} + +#[derive(Debug, Clone, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct Helios { + pub commit: String, + pub incorporation: String, +} + +impl Pins { + pub fn read() -> Result { + let workspace_root = cargo_metadata::MetadataCommand::new() + .no_deps() + .exec()? + .workspace_root; + Pins::read_from_dir(&workspace_root) + } + + pub fn read_from_dir(workspace_dir: &Utf8Path) -> Result { + let pins_path = workspace_dir.join("tools").join("pins.toml"); + let pins_text = fs_err::read(&pins_path)?; + let pins: Pins = toml_edit::de::from_slice(&pins_text) + .with_context(|| format!("while reading {pins_path}"))?; + Ok(pins) + } +} diff --git a/dev-tools/releng/Cargo.toml b/dev-tools/releng/Cargo.toml index 19ede6c24db..6aad69ae37b 100644 --- a/dev-tools/releng/Cargo.toml +++ b/dev-tools/releng/Cargo.toml @@ -15,6 +15,7 @@ fs-err = { workspace = true, features = ["tokio"] } futures.workspace = true hex.workspace = true omicron-common.workspace = true +omicron-pins.workspace = true omicron-workspace-hack.workspace = true omicron-zone-package.workspace = true once_cell.workspace = true diff --git a/dev-tools/releng/src/main.rs b/dev-tools/releng/src/main.rs index ba71d8f8c03..a7d98a299c8 100644 --- a/dev-tools/releng/src/main.rs +++ b/dev-tools/releng/src/main.rs @@ -189,11 +189,11 @@ async fn main() -> Result<()> { .context("failed to change working directory to workspace root")?; // Determine the target directory. - let target_dir = cargo_metadata::MetadataCommand::new() + let metadata = cargo_metadata::MetadataCommand::new() .no_deps() .exec() - .context("failed to get cargo metadata")? - .target_directory; + .context("failed to get cargo metadata")?; + let target_dir = metadata.target_directory; // We build everything in Omicron with $CARGO, but we need to use the rustup // proxy for Cargo when outside Omicron. @@ -209,6 +209,9 @@ async fn main() -> Result<()> { None => rustup_cargo.clone(), }; + // Read pins.toml. + let pins = omicron_pins::Pins::read_from_dir(&metadata.workspace_root)?; + let permits = Arc::new(Semaphore::new( std::thread::available_parallelism() .context("couldn't get available parallelism")? @@ -270,34 +273,66 @@ async fn main() -> Result<()> { } } - // Ensure the Helios checkout exists - if args.helios_dir.exists() { + // Ensure the Helios checkout exists. If the directory exists and is + // non-empty, check that the commit is correct. 
+ if args.helios_dir.exists() + && fs::read_dir(&args.helios_dir).await?.next_entry().await?.is_some() + { if !args.ignore_helios_origin { - // check that our helios clone is up to date - Command::new(&args.git_bin) - .arg("-C") - .arg(&args.helios_dir) - .args(["fetch", "--no-write-fetch-head", "origin", "master"]) - .ensure_success(&logger) - .await?; - let stdout = Command::new(&args.git_bin) - .arg("-C") - .arg(&args.helios_dir) - .args(["rev-parse", "HEAD", "origin/master"]) - .ensure_stdout(&logger) - .await?; - let mut lines = stdout.lines(); - let first = - lines.next().context("git-rev-parse output was empty")?; - if !lines.all(|line| line == first) { - error!( - logger, - "helios checkout at {0} is out-of-date; run \ - `git pull -C {0}`, or run omicron-releng with \ - --ignore-helios-origin or --helios-dir", - shell_words::quote(args.helios_dir.as_str()) - ); - preflight_ok = false; + if let Some(helios) = &pins.helios { + let stdout = Command::new(&args.git_bin) + .arg("-C") + .arg(&args.helios_dir) + .args(["rev-parse", "HEAD"]) + .ensure_stdout(&logger) + .await?; + let line = stdout + .lines() + .next() + .context("git-rev-parse output was empty")?; + if line != helios.commit { + error!( + logger, + "helios checkout at {0} is not at pinned commit {1}; \ + checkout the correct commit, or run omicron-releng \ + with --ignore-helios-origin or --helios-dir", + shell_words::quote(args.helios_dir.as_str()), + shell_words::quote(&helios.commit), + ); + preflight_ok = false; + } + } else { + // check that our helios clone is up to date + Command::new(&args.git_bin) + .arg("-C") + .arg(&args.helios_dir) + .args([ + "fetch", + "--no-write-fetch-head", + "origin", + "master", + ]) + .ensure_success(&logger) + .await?; + let stdout = Command::new(&args.git_bin) + .arg("-C") + .arg(&args.helios_dir) + .args(["rev-parse", "HEAD", "origin/master"]) + .ensure_stdout(&logger) + .await?; + let mut lines = stdout.lines(); + let first = + lines.next().context("git-rev-parse output was empty")?; + if !lines.all(|line| line == first) { + error!( + logger, + "helios checkout at {0} is out-of-date; run \ + `git pull -C {0}`, or run omicron-releng with \ + --ignore-helios-origin or --helios-dir", + shell_words::quote(args.helios_dir.as_str()) + ); + preflight_ok = false; + } } } } else { @@ -307,14 +342,19 @@ async fn main() -> Result<()> { .arg(&args.helios_dir) .ensure_success(&logger) .await?; + if let Some(helios) = &pins.helios { + info!( + logger, + "checking out {} to {}", args.helios_dir, helios.commit, + ); + Command::new(&args.git_bin) + .arg("-C") + .arg(&args.helios_dir) + .args(["checkout", &helios.commit]) + .ensure_success(&logger) + .await?; + } } - // Record the branch and commit in the output - Command::new(&args.git_bin) - .arg("-C") - .arg(&args.helios_dir) - .args(["status", "--branch", "--porcelain=2"]) - .ensure_success(&logger) - .await?; // Check that the omicron1 brand is installed if !Command::new("pkg") @@ -363,6 +403,14 @@ async fn main() -> Result<()> { .context("failed to create temporary directory")?; let mut jobs = Jobs::new(&logger, permits.clone(), &args.output_dir); + // Record the branch and commit of helios.git in the output + Command::new(&args.git_bin) + .arg("-C") + .arg(&args.helios_dir) + .args(["status", "--branch", "--porcelain=2"]) + .ensure_success(&logger) + .await?; + jobs.push_command( "helios-setup", Command::new("ptime") @@ -535,6 +583,14 @@ async fn main() -> Result<()> { .env_remove("CARGO") .env_remove("RUSTUP_TOOLCHAIN"); + if let Some(helios) = 
&pins.helios { + image_cmd = image_cmd.arg("-F").arg(format!( + "extra_packages+=\ + /consolidation/oxide/omicron-release-incorporation@{}", + helios.incorporation + )); + } + if !args.helios_local { image_cmd = image_cmd .arg("-p") // use an external package repository diff --git a/tools/pins.toml b/tools/pins.toml new file mode 100644 index 00000000000..c9c1a19bb96 --- /dev/null +++ b/tools/pins.toml @@ -0,0 +1,4 @@ +# Helios should *not* be pinned except on release branches. +#[helios] +#commit = "" +#incorporation = "" From 8980bfb8668854b5f70e8c103eccf74c4fbeeb90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karen=20C=C3=A1rcamo?= Date: Mon, 3 Feb 2025 19:41:26 -0800 Subject: [PATCH 6/8] [reconfigurator] Reject clickhouse configurations from old generations (#7347) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Overview This commit adds functionality to clickhouse-admin to keep track of the blueprint generation number. There is also a new validation check where if reconfigurator attempts to generate a configuration file from a previous generation, clickhouse-admin will not generate such configuration file, and exit with an error. Additionally, there's been a small clean up of the clickhouse-admin code. ## Manual testing In a local omicron deployment first tell reconfigurator to deploy a clickhouse policy `both` with the default number of replicas and keepers. ```console root@oxz_switch:~# omdb nexus blueprints diff target d6a6c153-76aa-4933-98bd-1009d95f03d2 note: Nexus URL not specified. Will pick one from DNS. note: using DNS server for subnet fd00:1122:3344::/48 note: (if this is not right, use --dns-server to specify an alternate DNS server) note: using Nexus URL http://[fd00:1122:3344:101::c]:12221 from: blueprint fb9d6881-3c8a-44e2-b9f3-b8222ebdae99 to: blueprint d6a6c153-76aa-4933-98bd-1009d95f03d2 <...> CLICKHOUSE CLUSTER CONFIG: + generation::::::::::::::::::::::::::::::::::::: 2 + max used server id::::::::::::::::::::::::::::: 3 + max used keeper id::::::::::::::::::::::::::::: 5 + cluster name::::::::::::::::::::::::::::::::::: oximeter_cluster + cluster secret::::::::::::::::::::::::::::::::: 750a492f-1c3d-430c-8d18-c74596fd2ec8 + highest seen keeper leader committed log index: 0 clickhouse keepers at generation 2: ------------------------------------------------ zone id keeper id ------------------------------------------------ + 13c665e9-d7bd-43a5-b780-47acf8326feb 1 + 325e3ac5-6cc8-4aec-9ac0-ea8d9a60c40f 2 + 37e41e42-3b0c-49a6-8403-99fe66e84897 3 + 4e65bf56-c7d6-485d-9b7f-8513a55838f9 4 + 8a5df7fa-8633-4bf9-a7fa-567d5e62ffbf 5 clickhouse servers at generation 2: ------------------------------------------------ zone id server id ------------------------------------------------ + 45af8162-253a-494c-992e-137d2bd5f350 1 + 676772e0-d0c4-425b-a0d1-f6df46e4d10c 2 + 84d249d1-9c13-460a-9c7c-08a979471246 3 ``` We can see keepers and servers are at generation 2. Now we zlogin into a keeper zone to check we have recorded that information and that the node has joined the quorum. 
```console root@oxz_clickhouse_keeper_37e41e42:~# curl http://[fd00:1122:3344:101::23]:8888/generation 2 root@oxz_clickhouse_keeper_37e41e42:~# head -n 1 /opt/oxide/clickhouse_keeper/keeper_config.xml root@oxz_clickhouse_keeper_37e41e42:~# curl http://[fd00:1122:3344:101::23]:8888/4lw-lgif {"first_log_idx":1,"first_log_term":1,"last_log_idx":7123,"last_log_term":1,"last_committed_log_idx":7123,"leader_committed_log_idx":7123,"target_committed_log_idx":7123,"last_snapshot_idx":0} ``` We zlogin into a replica zone and check we have recorded that information, and the database contains the expected oximeter table and fields. ```console root@oxz_clickhouse_server_676772e0:~# curl http://[fd00:1122:3344:101::28]:8888/generation 2 root@oxz_clickhouse_server_676772e0:~# head -n 1 /opt/oxide/clickhouse_server/config.d/replica-server-config.xml root@oxz_clickhouse_server_676772e0:~# /opt/oxide/clickhouse_server/clickhouse client --host fd00:1122:3344:101::28 ClickHouse client version 23.8.7.1. Connecting to fd00:1122:3344:101::28:9000 as user default. Connected to ClickHouse server version 23.8.7 revision 54465. oximeter_cluster_2 :) show tables in oximeter SHOW TABLES FROM oximeter Query id: 1baa160b-3332-4fa4-a91d-0032fd917a96 ┌─name─────────────────────────────┐ │ fields_bool │ │ fields_bool_local │ │ fields_i16 │ │ fields_i16_local │ │ <...> │ │ version │ └──────────────────────────────────┘ 81 rows in set. Elapsed: 0.009 sec. ``` No we want to force a new generation number, so we set a clickhouse policy with an additional server and keeper ```console root@oxz_switch:~# omdb nexus blueprints diff target a598ce1b-1413-47d6-bc8c-7b63b6d09158 note: Nexus URL not specified. Will pick one from DNS. note: using DNS server for subnet fd00:1122:3344::/48 note: (if this is not right, use --dns-server to specify an alternate DNS server) note: using Nexus URL http://[fd00:1122:3344:101::c]:12221 from: blueprint d6a6c153-76aa-4933-98bd-1009d95f03d2 to: blueprint a598ce1b-1413-47d6-bc8c-7b63b6d09158 <...> CLICKHOUSE CLUSTER CONFIG: * generation::::::::::::::::::::::::::::::::::::: 2 -> 3 * max used server id::::::::::::::::::::::::::::: 3 -> 4 * max used keeper id::::::::::::::::::::::::::::: 5 -> 6 cluster name::::::::::::::::::::::::::::::::::: oximeter_cluster (unchanged) cluster secret::::::::::::::::::::::::::::::::: 750a492f-1c3d-430c-8d18-c74596fd2ec8 (unchanged) * highest seen keeper leader committed log index: 0 -> 13409 clickhouse keepers generation 2 -> 3: ------------------------------------------------ zone id keeper id ------------------------------------------------ 13c665e9-d7bd-43a5-b780-47acf8326feb 1 325e3ac5-6cc8-4aec-9ac0-ea8d9a60c40f 2 37e41e42-3b0c-49a6-8403-99fe66e84897 3 4e65bf56-c7d6-485d-9b7f-8513a55838f9 4 8a5df7fa-8633-4bf9-a7fa-567d5e62ffbf 5 + ccb1b5cf-7ca8-4c78-b9bc-970d156e6109 6 clickhouse servers generation 2 -> 3: ------------------------------------------------ zone id server id ------------------------------------------------ 45af8162-253a-494c-992e-137d2bd5f350 1 + 497f4829-f3fe-4c94-86b2-dbd4e814cc90 4 676772e0-d0c4-425b-a0d1-f6df46e4d10c 2 84d249d1-9c13-460a-9c7c-08a979471246 3 ``` We deploy it and do the same checks on the same zones we checked previously and in the new zones Old keeper zone: ```console root@oxz_clickhouse_keeper_37e41e42:~# curl http://[fd00:1122:3344:101::23]:8888/generation 3 root@oxz_clickhouse_keeper_37e41e42:~# head -n 1 /opt/oxide/clickhouse_keeper/keeper_config.xml root@oxz_clickhouse_keeper_37e41e42:~# curl 
http://[fd00:1122:3344:101::23]:8888/4lw-lgif {"first_log_idx":1,"first_log_term":1,"last_log_idx":25198,"last_log_term":1,"last_committed_log_idx":25198,"leader_committed_log_idx":25198,"target_committed_log_idx":25198,"last_snapshot_idx":0} ``` New keeper zone: ```console root@oxz_clickhouse_keeper_ccb1b5cf:~# curl http://[fd00:1122:3344:101::29]:8888/generation 3 root@oxz_clickhouse_keeper_ccb1b5cf:~# head -n 1 /opt/oxide/clickhouse_keeper/keeper_config.xml root@oxz_clickhouse_keeper_ccb1b5cf:~# curl http://[fd00:1122:3344:101::29]:8888/4lw-lgif {"first_log_idx":1,"first_log_term":1,"last_log_idx":35857,"last_log_term":1,"last_committed_log_idx":35853,"leader_committed_log_idx":35853,"target_committed_log_idx":35853,"last_snapshot_idx":0} ``` Old replica zone: ```console root@oxz_clickhouse_server_676772e0:~# curl http://[fd00:1122:3344:101::28]:8888/generation 3 root@oxz_clickhouse_server_676772e0:~# head -n 1 /opt/oxide/clickhouse_server/config.d/replica-server-config.xml root@oxz_clickhouse_server_676772e0:~# /opt/oxide/clickhouse_server/clickhouse client --host fd00:1122:3344:101::28 ClickHouse client version 23.8.7.1. Connecting to fd00:1122:3344:101::28:9000 as user default. Connected to ClickHouse server version 23.8.7 revision 54465. oximeter_cluster_2 :) show tables in oximeter SHOW TABLES FROM oximeter Query id: d4500915-d5b5-452f-a404-35e1e172b8f8 ┌─name─────────────────────────────┐ │ fields_bool │ │ fields_bool_local │ │ fields_i16 │ │ fields_i16_local │ │ <...> │ │ version │ └──────────────────────────────────┘ 81 rows in set. Elapsed: 0.002 sec. ``` New replica zone: ```console root@oxz_clickhouse_server_497f4829:~# curl http://[fd00:1122:3344:101::2a]:8888/generation 3 root@oxz_clickhouse_server_497f4829:~# head -n 1 /opt/oxide/clickhouse_server/config.d/replica-server-config.xml root@oxz_clickhouse_server_497f4829:~# /opt/oxide/clickhouse_server/clickhouse client --host fd00:1122:3344:101::2a ClickHouse client version 23.8.7.1. Connecting to fd00:1122:3344:101::2a:9000 as user default. Connected to ClickHouse server version 23.8.7 revision 54465. oximeter_cluster_4 :) show tables in oximeter SHOW TABLES FROM oximeter Query id: 9e02b839-e938-44ef-8b2e-a61d0b8c25af ┌─name─────────────────────────────┐ │ fields_bool │ │ fields_bool_local │ │ fields_i16 │ │ fields_i16_local │ │ <...> │ │ version │ └──────────────────────────────────┘ 81 rows in set. Elapsed: 0.014 sec. 
``` To verify clickhouse-admin exits with an error if the incoming generation number is lower than the current one, I tested by runing clickhouse-admin against a local clickward deployment: ```console # clickhouse-admin-server karcar@ixchel:~/src/omicron$ curl http://[::1]:8888/generation 34 karcar@ixchel:~/src/omicron$ curl --header "Content-Type: application/json" --request PUT "http://[::1]:8888/config" -d ' > { > "generation": 3, > "settings": { > "config_dir": "/tmp/ch-dir/", > "id": 1, > "datastore_path": "/tmp/ch-dir/", > "listen_addr": "::1", > "keepers": [{"ipv6": "::1"}], > "remote_servers": [{"ipv6": "::1"}] > } > }' { "request_id": "01809997-b9da-4e9c-837f-11413a6254b7", "error_code": "Internal", "message": "Internal Server Error" } # From the logs {"msg":"request completed","v":0,"name":"clickhouse-admin-server","level":30,"time":"2025-01-21T01:08:24.946465Z","hostname":"ixchel","pid":58943,"uri":"/config","method":"PUT","req_id":"01809997-b9da-4e9c-837f-11413a6254b7","remote_addr":"[::1]:54628","local_addr":"[::1]:8888","component":"dropshot","file":"/Users/karcar/.cargo/registry/src/index.crates.io-6f17d22bba15001f/dropshot-0.13.0/src/server.rs:851","error_message_external":"Internal Server Error","error_message_internal":"current generation is greater than incoming generation","latency_us":227,"response_code":"500"} # clickhouse-admin-keeper karcar@ixchel:~/src/omicron$ curl http://[::1]:8888/generation 23 karcar@ixchel:~/src/omicron$ curl --header "Content-Type: application/json" --request PUT "http://[::1]:8888/config" -d ' { "generation": 2, "settings": { "config_dir": "/tmp/ch-dir/", "id": 1, "datastore_path": "/tmp/ch-dir/", "listen_addr": "::1", "raft_servers": [ { "id": 1, "host": {"ipv6": "::1"} } ] } }' { "request_id": "e6b66ca9-10fa-421b-ac46-0e470d8e5512", "error_code": "Internal", "message": "Internal Server Error" # From the logs {"msg":"request completed","v":0,"name":"clickhouse-admin-keeper","level":30,"time":"2025-01-21T02:28:12.925343Z","hostname":"ixchel","pid":59371,"uri":"/config","method":"PUT","req_id":"e6b66ca9-10fa-421b-ac46-0e470d8e5512","remote_addr":"[::1]:64494","local_addr":"[::1]:8888","component":"dropshot","file":"/Users/karcar/.cargo/registry/src/index.crates.io-6f17d22bba15001f/dropshot-0.13.0/src/server.rs:851","error_message_external":"Internal Server Error","error_message_internal":"current generation is greater than incoming generation","latency_us":180,"response_code":"500"} ``` Closes: https://github.com/oxidecomputer/omicron/issues/7137 --- Cargo.lock | 2 + clickhouse-admin/Cargo.toml | 1 + clickhouse-admin/api/src/lib.rs | 29 +- clickhouse-admin/src/clickhouse_cli.rs | 3 +- clickhouse-admin/src/clickward.rs | 9 +- clickhouse-admin/src/context.rs | 558 +++++++++++++++++- clickhouse-admin/src/http_entrypoints.rs | 147 ++--- clickhouse-admin/src/lib.rs | 26 +- clickhouse-admin/tests/integration_test.rs | 8 +- clickhouse-admin/types/src/config.rs | 23 +- clickhouse-admin/types/src/lib.rs | 242 +++++--- .../types/testutils/keeper_config.xml | 2 +- .../types/testutils/malformed_1.xml | 10 + .../types/testutils/malformed_2.xml | 11 + .../types/testutils/malformed_3.xml | 11 + .../types/testutils/replica-server-config.xml | 2 +- .../execution/src/clickhouse.rs | 5 +- openapi/clickhouse-admin-keeper.json | 247 +++++++- openapi/clickhouse-admin-server.json | 212 ++++++- package-manifest.toml | 2 - sled-agent/Cargo.toml | 1 + sled-agent/src/services.rs | 26 +- smf/clickhouse_keeper/manifest.xml | 6 +- smf/clickhouse_keeper/method_script.sh | 
9 - smf/clickhouse_server/manifest.xml | 6 +- smf/clickhouse_server/method_script.sh | 9 - 26 files changed, 1322 insertions(+), 285 deletions(-) create mode 100644 clickhouse-admin/types/testutils/malformed_1.xml create mode 100644 clickhouse-admin/types/testutils/malformed_2.xml create mode 100644 clickhouse-admin/types/testutils/malformed_3.xml delete mode 100755 smf/clickhouse_keeper/method_script.sh delete mode 100755 smf/clickhouse_server/method_script.sh diff --git a/Cargo.lock b/Cargo.lock index d06b2ee5320..602ae6a21fa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6713,6 +6713,7 @@ dependencies = [ "clickward", "dropshot 0.15.1", "expectorate", + "flume", "http", "illumos-utils", "omicron-common", @@ -7325,6 +7326,7 @@ dependencies = [ "cfg-if", "chrono", "clap", + "clickhouse-admin-types", "crucible-agent-client", "derive_more", "display-error-chain", diff --git a/clickhouse-admin/Cargo.toml b/clickhouse-admin/Cargo.toml index 65ceb3fdbf8..02e31eb61ff 100644 --- a/clickhouse-admin/Cargo.toml +++ b/clickhouse-admin/Cargo.toml @@ -12,6 +12,7 @@ clap.workspace = true clickhouse-admin-api.workspace = true clickhouse-admin-types.workspace = true dropshot.workspace = true +flume = {workspace = true, features = ["async"]} http.workspace = true illumos-utils.workspace = true omicron-common.workspace = true diff --git a/clickhouse-admin/api/src/lib.rs b/clickhouse-admin/api/src/lib.rs index d83cfd4549e..fbd8a78a0a5 100644 --- a/clickhouse-admin/api/src/lib.rs +++ b/clickhouse-admin/api/src/lib.rs @@ -3,15 +3,16 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use clickhouse_admin_types::{ - ClickhouseKeeperClusterMembership, DistributedDdlQueue, KeeperConf, - KeeperConfig, KeeperConfigurableSettings, Lgif, MetricInfoPath, RaftConfig, - ReplicaConfig, ServerConfigurableSettings, SystemTimeSeries, + ClickhouseKeeperClusterMembership, DistributedDdlQueue, + GenerateConfigResult, KeeperConf, KeeperConfigurableSettings, Lgif, + MetricInfoPath, RaftConfig, ServerConfigurableSettings, SystemTimeSeries, TimeSeriesSettingsQuery, }; use dropshot::{ HttpError, HttpResponseCreated, HttpResponseOk, HttpResponseUpdatedNoContent, Path, Query, RequestContext, TypedBody, }; +use omicron_common::api::external::Generation; /// API interface for our clickhouse-admin-keeper server /// @@ -40,7 +41,16 @@ pub trait ClickhouseAdminKeeperApi { async fn generate_config_and_enable_svc( rqctx: RequestContext, body: TypedBody, - ) -> Result, HttpError>; + ) -> Result, HttpError>; + + /// Retrieve the generation number of a configuration + #[endpoint { + method = GET, + path = "/generation", + }] + async fn generation( + rqctx: RequestContext, + ) -> Result, HttpError>; /// Retrieve a logically grouped information file from a keeper node. /// This information is used internally by ZooKeeper to manage snapshots @@ -106,7 +116,16 @@ pub trait ClickhouseAdminServerApi { async fn generate_config_and_enable_svc( rqctx: RequestContext, body: TypedBody, - ) -> Result, HttpError>; + ) -> Result, HttpError>; + + /// Retrieve the generation number of a configuration + #[endpoint { + method = GET, + path = "/generation", + }] + async fn generation( + rqctx: RequestContext, + ) -> Result, HttpError>; /// Contains information about distributed ddl queries (ON CLUSTER clause) /// that were executed on a cluster. 
diff --git a/clickhouse-admin/src/clickhouse_cli.rs b/clickhouse-admin/src/clickhouse_cli.rs index 34ea746a0ee..a6e3c8b91c3 100644 --- a/clickhouse-admin/src/clickhouse_cli.rs +++ b/clickhouse-admin/src/clickhouse_cli.rs @@ -95,8 +95,9 @@ impl ClickhouseCli { pub fn new( binary_path: Utf8PathBuf, listen_address: SocketAddrV6, - log: Logger, + log: &Logger, ) -> Self { + let log = log.new(slog::o!("component" => "ClickhouseCli")); Self { binary_path, listen_address, log } } diff --git a/clickhouse-admin/src/clickward.rs b/clickhouse-admin/src/clickward.rs index a6339c3888e..b9355dc2c30 100644 --- a/clickhouse-admin/src/clickward.rs +++ b/clickhouse-admin/src/clickward.rs @@ -3,7 +3,8 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use clickhouse_admin_types::{ - KeeperConfig, KeeperSettings, ReplicaConfig, ServerSettings, + KeeperConfig, KeeperConfigurableSettings, ReplicaConfig, + ServerConfigurableSettings, }; use dropshot::HttpError; use slog_error_chain::{InlineErrorChain, SlogInlineError}; @@ -35,7 +36,7 @@ impl From for HttpError { } } -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] pub struct Clickward {} impl Clickward { @@ -45,7 +46,7 @@ impl Clickward { pub fn generate_server_config( &self, - settings: ServerSettings, + settings: &ServerConfigurableSettings, ) -> Result { let replica_config = settings .generate_xml_file() @@ -56,7 +57,7 @@ impl Clickward { pub fn generate_keeper_config( &self, - settings: KeeperSettings, + settings: &KeeperConfigurableSettings, ) -> Result { let keeper_config = settings .generate_xml_file() diff --git a/clickhouse-admin/src/context.rs b/clickhouse-admin/src/context.rs index 702c423130f..aca948614c9 100644 --- a/clickhouse-admin/src/context.rs +++ b/clickhouse-admin/src/context.rs @@ -3,84 +3,578 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
use crate::{ClickhouseCli, Clickward}; + +use anyhow::{anyhow, bail, Context, Result}; +use camino::Utf8PathBuf; +use clickhouse_admin_types::{ + GenerateConfigResult, KeeperConfigurableSettings, + ServerConfigurableSettings, CLICKHOUSE_KEEPER_CONFIG_DIR, + CLICKHOUSE_KEEPER_CONFIG_FILE, CLICKHOUSE_SERVER_CONFIG_DIR, + CLICKHOUSE_SERVER_CONFIG_FILE, +}; +use dropshot::HttpError; +use flume::{Receiver, Sender, TrySendError}; +use http::StatusCode; +use illumos_utils::svcadm::Svcadm; use omicron_common::address::CLICKHOUSE_TCP_PORT; +use omicron_common::api::external::Generation; use oximeter_db::Client as OximeterClient; +use oximeter_db::OXIMETER_VERSION; use slog::Logger; +use slog::{error, info}; +use slog_error_chain::InlineErrorChain; +use std::fs::File; +use std::io::{BufRead, BufReader}; use std::net::SocketAddrV6; -use std::sync::Arc; -use tokio::sync::Mutex; +use tokio::sync::{oneshot, watch}; pub struct KeeperServerContext { clickward: Clickward, clickhouse_cli: ClickhouseCli, log: Logger, + generate_config_tx: Sender, + generation_rx: watch::Receiver>, } impl KeeperServerContext { - pub fn new(clickhouse_cli: ClickhouseCli) -> Self { - let log = clickhouse_cli - .log - .new(slog::o!("component" => "KeeperServerContext")); + pub fn new( + log: &Logger, + binary_path: Utf8PathBuf, + listen_address: SocketAddrV6, + ) -> Result { + let clickhouse_cli = + ClickhouseCli::new(binary_path, listen_address, log); + let log = log.new(slog::o!("component" => "KeeperServerContext")); let clickward = Clickward::new(); - Self { clickward, clickhouse_cli, log } + let config_path = Utf8PathBuf::from(CLICKHOUSE_KEEPER_CONFIG_DIR) + .join(CLICKHOUSE_KEEPER_CONFIG_FILE); + + // If there is already a configuration file with a generation number we'll + // use that. Otherwise, we set the generation number to None. + let gen = read_generation_from_file(config_path)?; + let (generation_tx, generation_rx) = watch::channel(gen); + + // We only want to handle one in flight request at a time. Reconfigurator execution will retry + // again later anyway. We use flume bounded channels with a size of 0 to act as a rendezvous channel. 
+ let (generate_config_tx, generate_config_rx) = flume::bounded(0); + tokio::spawn(long_running_generate_config_task( + generate_config_rx, + generation_tx, + )); + + Ok(Self { + clickward, + clickhouse_cli, + log, + generate_config_tx, + generation_rx, + }) } - pub fn clickward(&self) -> &Clickward { - &self.clickward + pub async fn generate_config_and_enable_svc( + &self, + keeper_settings: KeeperConfigurableSettings, + ) -> Result { + let clickward = self.clickward(); + let log = self.log(); + let node_settings = NodeSettings::Keeper { + settings: keeper_settings, + fmri: "svc:/oxide/clickhouse_keeper:default".to_string(), + }; + + generate_config_request( + &self.generate_config_tx, + node_settings, + clickward, + log, + ) + .await + } + + pub fn clickward(&self) -> Clickward { + self.clickward } pub fn clickhouse_cli(&self) -> &ClickhouseCli { &self.clickhouse_cli } - pub fn log(&self) -> &Logger { - &self.log + pub fn log(&self) -> Logger { + self.log.clone() + } + + pub fn generation(&self) -> Option { + *self.generation_rx.borrow() } } pub struct ServerContext { clickhouse_cli: ClickhouseCli, clickward: Clickward, - oximeter_client: OximeterClient, - initialization_lock: Arc>, + clickhouse_address: SocketAddrV6, log: Logger, + generate_config_tx: Sender, + generation_rx: watch::Receiver>, + db_init_tx: Sender, } impl ServerContext { - pub fn new(clickhouse_cli: ClickhouseCli) -> Self { - let ip = clickhouse_cli.listen_address.ip(); - let address = SocketAddrV6::new(*ip, CLICKHOUSE_TCP_PORT, 0, 0); - let oximeter_client = - OximeterClient::new(address.into(), &clickhouse_cli.log); + pub fn new( + log: &Logger, + binary_path: Utf8PathBuf, + listen_address: SocketAddrV6, + ) -> Result { + let clickhouse_cli = + ClickhouseCli::new(binary_path, listen_address, log); + + let ip = listen_address.ip(); + let clickhouse_address = + SocketAddrV6::new(*ip, CLICKHOUSE_TCP_PORT, 0, 0); let clickward = Clickward::new(); - let log = - clickhouse_cli.log.new(slog::o!("component" => "ServerContext")); - Self { + let log = log.new(slog::o!("component" => "ServerContext")); + + let config_path = Utf8PathBuf::from(CLICKHOUSE_SERVER_CONFIG_DIR) + .join(CLICKHOUSE_SERVER_CONFIG_FILE); + + // If there is already a configuration file with a generation number we'll + // use that. Otherwise, we set the generation number to None. + let gen = read_generation_from_file(config_path)?; + let (generation_tx, generation_rx) = watch::channel(gen); + + // We only want to handle one in flight request at a time. Reconfigurator execution will retry + // again later anyway. We use flume bounded channels with a size of 0 to act as a rendezvous channel. 
+ let (generate_config_tx, generate_config_rx) = flume::bounded(0); + tokio::spawn(long_running_generate_config_task( + generate_config_rx, + generation_tx, + )); + + let (db_init_tx, db_init_rx) = flume::bounded(0); + tokio::spawn(long_running_db_init_task(db_init_rx)); + + Ok(Self { clickhouse_cli, clickward, - oximeter_client, - initialization_lock: Arc::new(Mutex::new(())), + clickhouse_address, log, - } + generate_config_tx, + generation_rx, + db_init_tx, + }) + } + + pub async fn generate_config_and_enable_svc( + &self, + replica_settings: ServerConfigurableSettings, + ) -> Result { + let clickward = self.clickward(); + let log = self.log(); + let node_settings = NodeSettings::Replica { + settings: replica_settings, + fmri: "svc:/oxide/clickhouse_server:default".to_string(), + }; + generate_config_request( + &self.generate_config_tx, + node_settings, + clickward, + log, + ) + .await + } + + pub async fn init_db(&self, replicated: bool) -> Result<(), HttpError> { + let log = self.log(); + let clickhouse_address = self.clickhouse_address(); + + let (response_tx, response_rx) = oneshot::channel(); + self.db_init_tx + .try_send(DbInitRequest::DbInit { + clickhouse_address, + log, + replicated, + response: response_tx, + }) + .map_err(|e| match e { + TrySendError::Full(_) => HttpError::for_unavail( + None, + "channel full: another config request is still running" + .to_string(), + ), + TrySendError::Disconnected(_) => HttpError::for_internal_error( + "long-running generate-config task died".to_string(), + ), + })?; + response_rx.await.map_err(|e| { + HttpError::for_internal_error(format!( + "failure to receive response; long-running task died before it could respond: {e}" + )) + })? } pub fn clickhouse_cli(&self) -> &ClickhouseCli { &self.clickhouse_cli } - pub fn clickward(&self) -> &Clickward { - &self.clickward + pub fn clickward(&self) -> Clickward { + self.clickward + } + + pub fn clickhouse_address(&self) -> SocketAddrV6 { + self.clickhouse_address } - pub fn oximeter_client(&self) -> &OximeterClient { - &self.oximeter_client + pub fn log(&self) -> Logger { + self.log.clone() } - pub fn initialization_lock(&self) -> Arc> { - self.initialization_lock.clone() + pub fn generation(&self) -> Option { + *self.generation_rx.borrow() + } +} + +pub enum DbInitRequest { + /// Initiliases the oximeter database on either a single node + /// ClickHouse installation or a replicated cluster. 
+ DbInit { + clickhouse_address: SocketAddrV6, + log: Logger, + replicated: bool, + response: oneshot::Sender>, + }, +} + +async fn long_running_db_init_task(incoming: Receiver) { + while let Ok(request) = incoming.recv_async().await { + match request { + DbInitRequest::DbInit { + clickhouse_address, + log, + replicated, + response, + } => { + let result = + init_db(clickhouse_address, log.clone(), replicated).await; + if let Err(e) = response.send(result) { + error!( + log, + "failed to send value from database initialization to channel: {e:?}" + ); + }; + } + } + } +} + +pub enum GenerateConfigRequest { + /// Generates a configuration file for a server or keeper node + GenerateConfig { + clickward: Clickward, + log: Logger, + node_settings: NodeSettings, + response: oneshot::Sender>, + }, +} + +async fn long_running_generate_config_task( + incoming: Receiver, + generation_tx: watch::Sender>, +) { + while let Ok(request) = incoming.recv_async().await { + match request { + GenerateConfigRequest::GenerateConfig { + clickward, + log, + node_settings, + response, + } => { + let result = generate_config_and_enable_svc( + generation_tx.clone(), + clickward, + node_settings, + ); + if let Err(e) = response.send(result) { + error!( + &log, + "failed to send value from configuration generation to channel: {e:?}" + ); + }; + } + } + } +} + +#[derive(Debug)] +pub enum NodeSettings { + Replica { settings: ServerConfigurableSettings, fmri: String }, + Keeper { settings: KeeperConfigurableSettings, fmri: String }, +} + +impl NodeSettings { + fn fmri(self) -> String { + match self { + NodeSettings::Replica { fmri, .. } => fmri, + NodeSettings::Keeper { fmri, .. } => fmri, + } + } + + fn generation(&self) -> Generation { + match self { + NodeSettings::Replica { settings, .. } => settings.generation(), + NodeSettings::Keeper { settings, .. } => settings.generation(), + } } +} + +fn generate_config_and_enable_svc( + generation_tx: watch::Sender>, + clickward: Clickward, + node_settings: NodeSettings, +) -> Result { + let incoming_generation = node_settings.generation(); + let current_generation = *generation_tx.borrow(); + + // If the incoming generation number is lower, then we have a problem. + // We should return an error instead of silently skipping the configuration + // file generation. + if let Some(current) = current_generation { + if current > incoming_generation { + return Err(HttpError::for_client_error( + Some(String::from("Conflict")), + StatusCode::CONFLICT, + format!( + "current generation '{}' is greater than incoming generation '{}'", + current, + incoming_generation, + ) + )); + } + }; + + let output = match &node_settings { + NodeSettings::Replica { settings, .. } => { + GenerateConfigResult::Replica( + clickward.generate_server_config(settings)?, + ) + } + NodeSettings::Keeper { settings, .. } => GenerateConfigResult::Keeper( + clickward.generate_keeper_config(settings)?, + ), + }; + + // We want to update the generation number only if the config file has been + // generated successfully. 
+ generation_tx.send_replace(Some(incoming_generation)); + + // Once we have generated the client we can safely enable the clickhouse_server service + Svcadm::enable_service(node_settings.fmri())?; + + Ok(output) +} + +async fn generate_config_request( + generate_config_tx: &Sender, + node_settings: NodeSettings, + clickward: Clickward, + log: Logger, +) -> Result { + let (response_tx, response_rx) = oneshot::channel(); + generate_config_tx + .try_send(GenerateConfigRequest::GenerateConfig { + clickward, + log, + node_settings, + response: response_tx, + }) + .map_err(|e| match e { + TrySendError::Full(_) => HttpError::for_unavail( + None, + "channel full: another config request is still running" + .to_string(), + ), + TrySendError::Disconnected(_) => HttpError::for_internal_error( + "long-running generate-config task died".to_string(), + ), + })?; + response_rx.await.map_err(|e| { + HttpError::for_internal_error(format!( + "failure to receive response; long-running task died before it could respond: {e}" + )) + })? +} + +async fn init_db( + clickhouse_address: SocketAddrV6, + log: Logger, + replicated: bool, +) -> Result<(), HttpError> { + // We initialise the Oximeter client here instead of keeping it as part of the context as + // this client is not cloneable, copyable by design. + let client = OximeterClient::new(clickhouse_address.into(), &log); + + // Initialize the database only if it was not previously initialized. + // TODO: Migrate schema to newer version without wiping data. + let version = client.read_latest_version().await.map_err(|e| { + HttpError::for_internal_error(format!( + "can't read ClickHouse version: {}", + InlineErrorChain::new(&e), + )) + })?; + if version == 0 { + info!( + log, + "initializing oximeter database to version {OXIMETER_VERSION}" + ); + client + .initialize_db_with_version(replicated, OXIMETER_VERSION) + .await + .map_err(|e| { + HttpError::for_internal_error(format!( + "can't initialize oximeter database \ + to version {OXIMETER_VERSION}: {}", + InlineErrorChain::new(&e), + )) + })?; + } else { + info!( + log, + "skipping initialization of oximeter database at version {version}" + ); + } + + Ok(()) +} + +fn read_generation_from_file(path: Utf8PathBuf) -> Result> { + // When the configuration file does not exist yet, this means it's a new server. + // It won't have a running clickhouse server and no generation number yet. + if !path.exists() { + return Ok(None); + } + + let file = + File::open(&path).with_context(|| format!("failed to open {path}"))?; + let reader = BufReader::new(file); + // We know the generation number is on the top of the file so we only + // need the first line. + let first_line = match reader.lines().next() { + Some(g) => g?, + // When the clickhouse configuration file exists but has no contents, + // it means something went wrong when creating the file earlier. + // We should return because something is definitely broken. + None => bail!( + "clickhouse configuration file exists at {}, but is empty", + path + ), + }; + + let line_parts: Vec<&str> = first_line.rsplit(':').collect(); + if line_parts.len() != 2 { + bail!( + "first line of configuration file '{}' is malformed: {}", + path, + first_line + ); + } + + // It's safe to unwrap since we already know `line_parts` contains two items. 
+ let line_end_part: Vec<&str> = + line_parts.first().unwrap().split_terminator(" -->").collect(); + if line_end_part.len() != 1 { + bail!( + "first line of configuration file '{}' is malformed: {}", + path, + first_line + ); + } + + // It's safe to unwrap since we already know `line_end_part` contains an item. + let gen_u64: u64 = line_end_part.first().unwrap().parse().map_err(|e| { + anyhow!( + concat!( + "first line of configuration file '{}' is malformed: {}; ", + "error = {}", + ), + path, + first_line, + e + ) + })?; + + let gen = Generation::try_from(gen_u64)?; + + Ok(Some(gen)) +} + +#[cfg(test)] +mod tests { + use super::read_generation_from_file; + use camino::Utf8PathBuf; + use clickhouse_admin_types::CLICKHOUSE_SERVER_CONFIG_FILE; + use omicron_common::api::external::Generation; + use std::str::FromStr; + + #[test] + fn test_read_generation_from_file_success() { + let dir = Utf8PathBuf::from_str("types/testutils") + .unwrap() + .join(CLICKHOUSE_SERVER_CONFIG_FILE); + let generation = read_generation_from_file(dir).unwrap().unwrap(); + + assert_eq!(Generation::from(1), generation); + } + + #[test] + fn test_read_generation_from_file_none() { + let dir = Utf8PathBuf::from_str("types/testutils") + .unwrap() + .join("i-dont-exist.xml"); + let generation = read_generation_from_file(dir).unwrap(); + + assert_eq!(None, generation); + } + + #[test] + fn test_read_generation_from_file_malformed_1() { + let dir = Utf8PathBuf::from_str("types/testutils") + .unwrap() + .join("malformed_1.xml"); + let result = read_generation_from_file(dir); + let error = result.unwrap_err(); + let root_cause = error.root_cause(); + + assert_eq!( + format!("{}", root_cause), + "first line of configuration file 'types/testutils/malformed_1.xml' is malformed: " + ); + } + + #[test] + fn test_read_generation_from_file_malformed_2() { + let dir = Utf8PathBuf::from_str("types/testutils") + .unwrap() + .join("malformed_2.xml"); + let result = read_generation_from_file(dir); + let error = result.unwrap_err(); + let root_cause = error.root_cause(); + + assert_eq!( + format!("{}", root_cause), + "first line of configuration file 'types/testutils/malformed_2.xml' is malformed: ; error = invalid digit found in string" + ); + } + + #[test] + fn test_read_generation_from_file_malformed_3() { + let dir = Utf8PathBuf::from_str("types/testutils") + .unwrap() + .join("malformed_3.xml"); + let result = read_generation_from_file(dir); + let error = result.unwrap_err(); + let root_cause = error.root_cause(); - pub fn log(&self) -> &Logger { - &self.log + assert_eq!( + format!("{}", root_cause), + "first line of configuration file 'types/testutils/malformed_3.xml' is malformed: -->" + ); } } diff --git a/clickhouse-admin/src/http_entrypoints.rs b/clickhouse-admin/src/http_entrypoints.rs index cd8523de472..754266d4021 100644 --- a/clickhouse-admin/src/http_entrypoints.rs +++ b/clickhouse-admin/src/http_entrypoints.rs @@ -5,18 +5,17 @@ use crate::context::{KeeperServerContext, ServerContext}; use clickhouse_admin_api::*; use clickhouse_admin_types::{ - ClickhouseKeeperClusterMembership, DistributedDdlQueue, KeeperConf, - KeeperConfig, KeeperConfigurableSettings, Lgif, MetricInfoPath, RaftConfig, - ReplicaConfig, ServerConfigurableSettings, SystemTimeSeries, + ClickhouseKeeperClusterMembership, DistributedDdlQueue, + GenerateConfigResult, KeeperConf, KeeperConfigurableSettings, Lgif, + MetricInfoPath, RaftConfig, ServerConfigurableSettings, SystemTimeSeries, SystemTimeSeriesSettings, TimeSeriesSettingsQuery, }; use dropshot::{ 
ApiDescription, HttpError, HttpResponseCreated, HttpResponseOk, HttpResponseUpdatedNoContent, Path, Query, RequestContext, TypedBody, }; -use illumos_utils::svcadm::Svcadm; -use oximeter_db::OXIMETER_VERSION; -use slog::info; +use http::StatusCode; +use omicron_common::api::external::Generation; use std::sync::Arc; pub fn clickhouse_admin_server_api() -> ApiDescription> { @@ -43,17 +42,29 @@ impl ClickhouseAdminServerApi for ClickhouseAdminServerImpl { async fn generate_config_and_enable_svc( rqctx: RequestContext, body: TypedBody, - ) -> Result, HttpError> { + ) -> Result, HttpError> { let ctx = rqctx.context(); - let replica_server = body.into_inner(); - let output = - ctx.clickward().generate_server_config(replica_server.settings)?; - - // Once we have generated the client we can safely enable the clickhouse_server service - let fmri = "svc:/oxide/clickhouse_server:default".to_string(); - Svcadm::enable_service(fmri)?; + let replica_settings = body.into_inner(); + let result = + ctx.generate_config_and_enable_svc(replica_settings).await?; + Ok(HttpResponseCreated(result)) + } - Ok(HttpResponseCreated(output)) + async fn generation( + rqctx: RequestContext, + ) -> Result, HttpError> { + let ctx = rqctx.context(); + let gen = match ctx.generation() { + Some(g) => g, + None => { + return Err(HttpError::for_client_error( + Some(String::from("ObjectNotFound")), + StatusCode::NOT_FOUND, + "no generation number found".to_string(), + )) + } + }; + Ok(HttpResponseOk(gen)) } async fn distributed_ddl_queue( @@ -83,43 +94,8 @@ impl ClickhouseAdminServerApi for ClickhouseAdminServerImpl { rqctx: RequestContext, ) -> Result { let ctx = rqctx.context(); - let log = ctx.log(); - - // Database initialization is idempotent, but not concurrency-safe. - // Use a mutex to serialize requests. - let lock = ctx.initialization_lock(); - let _guard = lock.lock().await; - - // Initialize the database only if it was not previously initialized. - // TODO: Migrate schema to newer version without wiping data. 
- let client = ctx.oximeter_client(); - let version = client.read_latest_version().await.map_err(|e| { - HttpError::for_internal_error(format!( - "can't read ClickHouse version: {e}", - )) - })?; - if version == 0 { - info!( - log, - "initializing replicated ClickHouse cluster to version {OXIMETER_VERSION}" - ); - let replicated = true; - ctx.oximeter_client() - .initialize_db_with_version(replicated, OXIMETER_VERSION) - .await - .map_err(|e| { - HttpError::for_internal_error(format!( - "can't initialize replicated ClickHouse cluster \ - to version {OXIMETER_VERSION}: {e}", - )) - })?; - } else { - info!( - log, - "skipping initialization of replicated ClickHouse cluster at version {version}" - ); - } - + let replicated = true; + ctx.init_db(replicated).await?; Ok(HttpResponseUpdatedNoContent()) } } @@ -132,16 +108,29 @@ impl ClickhouseAdminKeeperApi for ClickhouseAdminKeeperImpl { async fn generate_config_and_enable_svc( rqctx: RequestContext, body: TypedBody, - ) -> Result, HttpError> { + ) -> Result, HttpError> { let ctx = rqctx.context(); - let keeper = body.into_inner(); - let output = ctx.clickward().generate_keeper_config(keeper.settings)?; - - // Once we have generated the client we can safely enable the clickhouse_keeper service - let fmri = "svc:/oxide/clickhouse_keeper:default".to_string(); - Svcadm::enable_service(fmri)?; + let keeper_settings = body.into_inner(); + let result = + ctx.generate_config_and_enable_svc(keeper_settings).await?; + Ok(HttpResponseCreated(result)) + } - Ok(HttpResponseCreated(output)) + async fn generation( + rqctx: RequestContext, + ) -> Result, HttpError> { + let ctx = rqctx.context(); + let gen = match ctx.generation() { + Some(g) => g, + None => { + return Err(HttpError::for_client_error( + Some(String::from("ObjectNotFound")), + StatusCode::NOT_FOUND, + "no generation number found".to_string(), + )) + } + }; + Ok(HttpResponseOk(gen)) } async fn lgif( @@ -187,42 +176,8 @@ impl ClickhouseAdminSingleApi for ClickhouseAdminSingleImpl { rqctx: RequestContext, ) -> Result { let ctx = rqctx.context(); - let log = ctx.log(); - - // Database initialization is idempotent, but not concurrency-safe. - // Use a mutex to serialize requests. - let lock = ctx.initialization_lock(); - let _guard = lock.lock().await; - - // Initialize the database only if it was not previously initialized. - // TODO: Migrate schema to newer version without wiping data. 
- let client = ctx.oximeter_client(); - let version = client.read_latest_version().await.map_err(|e| { - HttpError::for_internal_error(format!( - "can't read ClickHouse version: {e}", - )) - })?; - if version == 0 { - info!( - log, - "initializing single-node ClickHouse to version {OXIMETER_VERSION}" - ); - ctx.oximeter_client() - .initialize_db_with_version(false, OXIMETER_VERSION) - .await - .map_err(|e| { - HttpError::for_internal_error(format!( - "can't initialize single-node ClickHouse \ - to version {OXIMETER_VERSION}: {e}", - )) - })?; - } else { - info!( - log, - "skipping initialization of single-node ClickHouse at version {version}" - ); - } - + let replicated = false; + ctx.init_db(replicated).await?; Ok(HttpResponseUpdatedNoContent()) } diff --git a/clickhouse-admin/src/lib.rs b/clickhouse-admin/src/lib.rs index d6a6f1e366d..5f1ac6f0f57 100644 --- a/clickhouse-admin/src/lib.rs +++ b/clickhouse-admin/src/lib.rs @@ -31,6 +31,8 @@ pub enum StartError { RegisterDtraceProbes(String), #[error("failed to initialize HTTP server")] InitializeHttpServer(#[source] dropshot::BuildError), + #[error("failed to initialize context")] + InitializeContext(#[source] anyhow::Error), } /// Start the dropshot server for `clickhouse-admin-server` which @@ -58,12 +60,8 @@ pub async fn start_server_admin_server( } } - let clickhouse_cli = ClickhouseCli::new( - binary_path, - listen_address, - log.new(slog::o!("component" => "ClickhouseCli")), - ); - let context = ServerContext::new(clickhouse_cli); + let context = ServerContext::new(&log, binary_path, listen_address) + .map_err(StartError::InitializeContext)?; dropshot::ServerBuilder::new( http_entrypoints::clickhouse_admin_server_api(), Arc::new(context), @@ -99,12 +97,8 @@ pub async fn start_keeper_admin_server( } } - let clickhouse_cli = ClickhouseCli::new( - binary_path, - listen_address, - log.new(slog::o!("component" => "ClickhouseCli")), - ); - let context = KeeperServerContext::new(clickhouse_cli); + let context = KeeperServerContext::new(&log, binary_path, listen_address) + .map_err(StartError::InitializeContext)?; dropshot::ServerBuilder::new( http_entrypoints::clickhouse_admin_keeper_api(), Arc::new(context), @@ -140,12 +134,8 @@ pub async fn start_single_admin_server( } } - let clickhouse_cli = ClickhouseCli::new( - binary_path, - listen_address, - log.new(slog::o!("component" => "ClickhouseCli")), - ); - let context = ServerContext::new(clickhouse_cli); + let context = ServerContext::new(&log, binary_path, listen_address) + .map_err(StartError::InitializeContext)?; dropshot::ServerBuilder::new( http_entrypoints::clickhouse_admin_single_api(), Arc::new(context), diff --git a/clickhouse-admin/tests/integration_test.rs b/clickhouse-admin/tests/integration_test.rs index 2715afd9cf0..b6d56618f7e 100644 --- a/clickhouse-admin/tests/integration_test.rs +++ b/clickhouse-admin/tests/integration_test.rs @@ -50,7 +50,7 @@ async fn test_lgif_parsing() -> anyhow::Result<()> { 0, 0, ), - log(), + &log(), ); let lgif = clickhouse_cli.lgif().await.unwrap(); @@ -71,7 +71,7 @@ async fn test_raft_config_parsing() -> anyhow::Result<()> { 0, 0, ), - log(), + &log(), ); let raft_config = clickhouse_cli.raft_config().await.unwrap(); @@ -107,7 +107,7 @@ async fn test_keeper_conf_parsing() -> anyhow::Result<()> { 0, 0, ), - log(), + &log(), ); let conf = clickhouse_cli.keeper_conf().await.unwrap(); @@ -127,7 +127,7 @@ async fn test_keeper_cluster_membership() -> anyhow::Result<()> { 0, 0, ), - log(), + &log(), ); let keeper_cluster_membership = diff --git 
a/clickhouse-admin/types/src/config.rs b/clickhouse-admin/types/src/config.rs index 120ff323128..0b1c7ffe1be 100644 --- a/clickhouse-admin/types/src/config.rs +++ b/clickhouse-admin/types/src/config.rs @@ -10,11 +10,20 @@ use omicron_common::address::{ CLICKHOUSE_KEEPER_RAFT_PORT, CLICKHOUSE_KEEPER_TCP_PORT, CLICKHOUSE_TCP_PORT, }; +use omicron_common::api::external::Generation; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use std::net::{Ipv4Addr, Ipv6Addr}; use std::{fmt::Display, str::FromStr}; +/// Result after generating a configuration file +#[derive(Debug, Clone, PartialEq, Eq, JsonSchema, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum GenerateConfigResult { + Replica(ReplicaConfig), + Keeper(KeeperConfig), +} + /// Configuration for a ClickHouse replica server #[derive(Debug, Clone, PartialEq, Eq, JsonSchema, Serialize, Deserialize)] pub struct ReplicaConfig { @@ -38,6 +47,8 @@ pub struct ReplicaConfig { /// Directory for all files generated by ClickHouse itself #[schemars(schema_with = "path_schema")] pub data_path: Utf8PathBuf, + /// A unique identifier for the configuration generation. + pub generation: Generation, } impl ReplicaConfig { @@ -49,6 +60,7 @@ impl ReplicaConfig { remote_servers: Vec, keepers: Vec, path: Utf8PathBuf, + generation: Generation, ) -> Self { let data_path = path.join("data"); let remote_servers = RemoteServers::new(remote_servers); @@ -64,6 +76,7 @@ impl ReplicaConfig { remote_servers, keepers, data_path, + generation, } } @@ -78,6 +91,7 @@ impl ReplicaConfig { remote_servers, keepers, data_path, + generation, } = self; let logger = logger.to_xml(); let cluster = macros.cluster.clone(); @@ -89,7 +103,7 @@ impl ReplicaConfig { let temp_files_path = data_path.clone().join("tmp"); let format_schema_path = data_path.clone().join("format_schemas"); format!( - " + " {logger} {data_path} @@ -541,6 +555,8 @@ pub struct KeeperConfig { /// Directory for all files generated by ClickHouse itself #[schemars(schema_with = "path_schema")] pub datastore_path: Utf8PathBuf, + /// A unique identifier for the configuration generation. 
+ pub generation: Generation, } impl KeeperConfig { @@ -551,6 +567,7 @@ impl KeeperConfig { server_id: KeeperId, datastore_path: Utf8PathBuf, raft_config: RaftServers, + generation: Generation, ) -> Self { let coordination_path = datastore_path.join("coordination"); let log_storage_path = coordination_path.join("log"); @@ -566,6 +583,7 @@ impl KeeperConfig { coordination_settings, raft_config, datastore_path, + generation, } } @@ -580,6 +598,7 @@ impl KeeperConfig { coordination_settings, raft_config, datastore_path, + generation, } = self; let logger = logger.to_xml(); let KeeperCoordinationSettings { @@ -589,7 +608,7 @@ impl KeeperConfig { } = coordination_settings; let raft_servers = raft_config.to_xml(); format!( - " + " {logger} {listen_host} diff --git a/clickhouse-admin/types/src/lib.rs b/clickhouse-admin/types/src/lib.rs index b4dd21652c2..5797d396969 100644 --- a/clickhouse-admin/types/src/lib.rs +++ b/clickhouse-admin/types/src/lib.rs @@ -26,11 +26,19 @@ use std::str::FromStr; mod config; pub use config::{ - ClickhouseHost, KeeperConfig, KeeperConfigsForReplica, KeeperNodeConfig, - LogConfig, LogLevel, Macros, NodeType, RaftServerConfig, - RaftServerSettings, RaftServers, ReplicaConfig, ServerNodeConfig, + ClickhouseHost, GenerateConfigResult, KeeperConfig, + KeeperConfigsForReplica, KeeperNodeConfig, LogConfig, LogLevel, Macros, + NodeType, RaftServerConfig, RaftServerSettings, RaftServers, ReplicaConfig, + ServerNodeConfig, }; +pub const CLICKHOUSE_SERVER_CONFIG_DIR: &str = + "/opt/oxide/clickhouse_server/config.d"; +pub const CLICKHOUSE_SERVER_CONFIG_FILE: &str = "replica-server-config.xml"; +pub const CLICKHOUSE_KEEPER_CONFIG_DIR: &str = "/opt/oxide/clickhouse_keeper"; +pub const CLICKHOUSE_KEEPER_CONFIG_FILE: &str = "keeper_config.xml"; +pub const OXIMETER_CLUSTER: &str = "oximeter_cluster"; + // Used for schemars to be able to be used with camino: // See https://github.com/camino-rs/camino/issues/91#issuecomment-2027908513 pub fn path_schema(gen: &mut SchemaGenerator) -> Schema { @@ -39,8 +47,6 @@ pub fn path_schema(gen: &mut SchemaGenerator) -> Schema { schema.into() } -pub const OXIMETER_CLUSTER: &str = "oximeter_cluster"; - /// A unique ID for a ClickHouse Keeper #[derive( Debug, @@ -91,6 +97,69 @@ pub struct ServerConfigurableSettings { pub settings: ServerSettings, } +impl ServerConfigurableSettings { + /// Generate a configuration file for a replica server node + pub fn generate_xml_file(&self) -> Result { + let logger = LogConfig::new( + self.settings.datastore_path.clone(), + NodeType::Server, + ); + let macros = Macros::new(self.settings.id); + + let keepers: Vec = self + .settings + .keepers + .iter() + .map(|host| KeeperNodeConfig::new(host.clone())) + .collect(); + + let servers: Vec = self + .settings + .remote_servers + .iter() + .map(|host| ServerNodeConfig::new(host.clone())) + .collect(); + + let config = ReplicaConfig::new( + logger, + macros, + self.listen_addr(), + servers.clone(), + keepers.clone(), + self.datastore_path(), + self.generation(), + ); + + match create_dir(self.settings.config_dir.clone()) { + Ok(_) => (), + Err(e) if e.kind() == ErrorKind::AlreadyExists => (), + Err(e) => return Err(e.into()), + }; + + let path = self.settings.config_dir.join("replica-server-config.xml"); + AtomicFile::new( + path.clone(), + atomicwrites::OverwriteBehavior::AllowOverwrite, + ) + .write(|f| f.write_all(config.to_xml().as_bytes())) + .with_context(|| format!("failed to write to `{}`", path))?; + + Ok(config) + } + + pub fn generation(&self) -> Generation { 
+ self.generation + } + + fn listen_addr(&self) -> Ipv6Addr { + self.settings.listen_addr + } + + fn datastore_path(&self) -> Utf8PathBuf { + self.settings.datastore_path.clone() + } +} + /// The top most type for configuring clickhouse-servers via /// clickhouse-admin-keeper-api #[derive(Debug, Serialize, Deserialize, JsonSchema)] @@ -101,6 +170,65 @@ pub struct KeeperConfigurableSettings { pub settings: KeeperSettings, } +impl KeeperConfigurableSettings { + /// Generate a configuration file for a keeper node + pub fn generate_xml_file(&self) -> Result { + let logger = LogConfig::new( + self.settings.datastore_path.clone(), + NodeType::Keeper, + ); + + let raft_servers = self + .settings + .raft_servers + .iter() + .map(|settings| RaftServerConfig::new(settings.clone())) + .collect(); + let raft_config = RaftServers::new(raft_servers); + + let config = KeeperConfig::new( + logger, + self.listen_addr(), + self.id(), + self.datastore_path(), + raft_config, + self.generation(), + ); + + match create_dir(self.settings.config_dir.clone()) { + Ok(_) => (), + Err(e) if e.kind() == ErrorKind::AlreadyExists => (), + Err(e) => return Err(e.into()), + }; + + let path = self.settings.config_dir.join("keeper_config.xml"); + AtomicFile::new( + path.clone(), + atomicwrites::OverwriteBehavior::AllowOverwrite, + ) + .write(|f| f.write_all(config.to_xml().as_bytes())) + .with_context(|| format!("failed to write to `{}`", path))?; + + Ok(config) + } + + pub fn generation(&self) -> Generation { + self.generation + } + + fn listen_addr(&self) -> Ipv6Addr { + self.settings.listen_addr + } + + fn id(&self) -> KeeperId { + self.settings.id + } + + fn datastore_path(&self) -> Utf8PathBuf { + self.settings.datastore_path.clone() + } +} + /// Configurable settings for a ClickHouse replica server node. #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] #[serde(rename_all = "snake_case")] @@ -139,50 +267,6 @@ impl ServerSettings { remote_servers, } } - - /// Generate a configuration file for a replica server node - pub fn generate_xml_file(&self) -> Result { - let logger = - LogConfig::new(self.datastore_path.clone(), NodeType::Server); - let macros = Macros::new(self.id); - - let keepers: Vec = self - .keepers - .iter() - .map(|host| KeeperNodeConfig::new(host.clone())) - .collect(); - - let servers: Vec = self - .remote_servers - .iter() - .map(|host| ServerNodeConfig::new(host.clone())) - .collect(); - - let config = ReplicaConfig::new( - logger, - macros, - self.listen_addr, - servers.clone(), - keepers.clone(), - self.datastore_path.clone(), - ); - - match create_dir(self.config_dir.clone()) { - Ok(_) => (), - Err(e) if e.kind() == ErrorKind::AlreadyExists => (), - Err(e) => return Err(e.into()), - }; - - let path = self.config_dir.join("replica-server-config.xml"); - AtomicFile::new( - path.clone(), - atomicwrites::OverwriteBehavior::AllowOverwrite, - ) - .write(|f| f.write_all(config.to_xml().as_bytes())) - .with_context(|| format!("failed to write to `{}`", path))?; - - Ok(config) - } } /// Configurable settings for a ClickHouse keeper node. 
@@ -213,43 +297,6 @@ impl KeeperSettings { ) -> Self { Self { config_dir, id, raft_servers, datastore_path, listen_addr } } - - /// Generate a configuration file for a keeper node - pub fn generate_xml_file(&self) -> Result { - let logger = - LogConfig::new(self.datastore_path.clone(), NodeType::Keeper); - - let raft_servers = self - .raft_servers - .iter() - .map(|settings| RaftServerConfig::new(settings.clone())) - .collect(); - let raft_config = RaftServers::new(raft_servers); - - let config = KeeperConfig::new( - logger, - self.listen_addr, - self.id, - self.datastore_path.clone(), - raft_config, - ); - - match create_dir(self.config_dir.clone()) { - Ok(_) => (), - Err(e) if e.kind() == ErrorKind::AlreadyExists => (), - Err(e) => return Err(e.into()), - }; - - let path = self.config_dir.join("keeper_config.xml"); - AtomicFile::new( - path.clone(), - atomicwrites::OverwriteBehavior::AllowOverwrite, - ) - .write(|f| f.write_all(config.to_xml().as_bytes())) - .with_context(|| format!("failed to write to `{}`", path))?; - - Ok(config) - } } #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] @@ -1240,6 +1287,7 @@ mod tests { use camino::Utf8PathBuf; use camino_tempfile::Builder; use chrono::{DateTime, Utc}; + use omicron_common::api::external::Generation; use slog::{o, Drain}; use slog_term::{FullFormat, PlainDecorator, TestStdoutWriter}; use std::collections::BTreeMap; @@ -1247,10 +1295,11 @@ mod tests { use std::str::FromStr; use crate::{ - ClickhouseHost, DistributedDdlQueue, KeeperConf, KeeperId, - KeeperServerInfo, KeeperServerType, KeeperSettings, Lgif, LogLevel, - RaftConfig, RaftServerSettings, ServerId, ServerSettings, - SystemTimeSeries, + ClickhouseHost, DistributedDdlQueue, KeeperConf, + KeeperConfigurableSettings, KeeperId, KeeperServerInfo, + KeeperServerType, KeeperSettings, Lgif, LogLevel, RaftConfig, + RaftServerSettings, ServerConfigurableSettings, ServerId, + ServerSettings, SystemTimeSeries, }; fn log() -> slog::Logger { @@ -1287,7 +1336,7 @@ mod tests { }, ]; - let config = KeeperSettings::new( + let settings = KeeperSettings::new( Utf8PathBuf::from(config_dir.path()), KeeperId(1), keepers, @@ -1295,6 +1344,11 @@ mod tests { Ipv6Addr::from_str("ff::08").unwrap(), ); + let config = KeeperConfigurableSettings { + generation: Generation::new(), + settings, + }; + config.generate_xml_file().unwrap(); let expected_file = Utf8PathBuf::from_str("./testutils") @@ -1327,7 +1381,7 @@ mod tests { ClickhouseHost::DomainName("ohai.com".to_string()), ]; - let config = ServerSettings::new( + let settings = ServerSettings::new( Utf8PathBuf::from(config_dir.path()), ServerId(1), Utf8PathBuf::from_str("./").unwrap(), @@ -1336,6 +1390,10 @@ mod tests { servers, ); + let config = ServerConfigurableSettings { + settings, + generation: Generation::new(), + }; config.generate_xml_file().unwrap(); let expected_file = Utf8PathBuf::from_str("./testutils") diff --git a/clickhouse-admin/types/testutils/keeper_config.xml b/clickhouse-admin/types/testutils/keeper_config.xml index 72def3887e9..690b174a6c5 100644 --- a/clickhouse-admin/types/testutils/keeper_config.xml +++ b/clickhouse-admin/types/testutils/keeper_config.xml @@ -1,4 +1,4 @@ - + diff --git a/clickhouse-admin/types/testutils/malformed_1.xml b/clickhouse-admin/types/testutils/malformed_1.xml new file mode 100644 index 00000000000..a26423ba678 --- /dev/null +++ b/clickhouse-admin/types/testutils/malformed_1.xml @@ -0,0 +1,10 @@ + + + + trace + ./log/clickhouse-keeper.log + ./log/clickhouse-keeper.err.log + 100M + 1 
+ + \ No newline at end of file diff --git a/clickhouse-admin/types/testutils/malformed_2.xml b/clickhouse-admin/types/testutils/malformed_2.xml new file mode 100644 index 00000000000..aa2acbbafd9 --- /dev/null +++ b/clickhouse-admin/types/testutils/malformed_2.xml @@ -0,0 +1,11 @@ + + + + + trace + ./log/clickhouse-keeper.log + ./log/clickhouse-keeper.err.log + 100M + 1 + + \ No newline at end of file diff --git a/clickhouse-admin/types/testutils/malformed_3.xml b/clickhouse-admin/types/testutils/malformed_3.xml new file mode 100644 index 00000000000..c4a424cf88e --- /dev/null +++ b/clickhouse-admin/types/testutils/malformed_3.xml @@ -0,0 +1,11 @@ + --> + + + + trace + ./log/clickhouse-keeper.log + ./log/clickhouse-keeper.err.log + 100M + 1 + + \ No newline at end of file diff --git a/clickhouse-admin/types/testutils/replica-server-config.xml b/clickhouse-admin/types/testutils/replica-server-config.xml index 8a0687e9afd..121ea072ebd 100644 --- a/clickhouse-admin/types/testutils/replica-server-config.xml +++ b/clickhouse-admin/types/testutils/replica-server-config.xml @@ -1,4 +1,4 @@ - + diff --git a/nexus/reconfigurator/execution/src/clickhouse.rs b/nexus/reconfigurator/execution/src/clickhouse.rs index 05da6de064d..a5642015c47 100644 --- a/nexus/reconfigurator/execution/src/clickhouse.rs +++ b/nexus/reconfigurator/execution/src/clickhouse.rs @@ -16,6 +16,8 @@ use clickhouse_admin_types::KeeperSettings; use clickhouse_admin_types::RaftServerSettings; use clickhouse_admin_types::ServerConfigurableSettings; use clickhouse_admin_types::ServerSettings; +use clickhouse_admin_types::CLICKHOUSE_KEEPER_CONFIG_DIR; +use clickhouse_admin_types::CLICKHOUSE_SERVER_CONFIG_DIR; use futures::future::Either; use futures::stream::FuturesUnordered; use futures::stream::StreamExt; @@ -36,9 +38,6 @@ use std::net::SocketAddr; use std::net::SocketAddrV6; use std::str::FromStr; -const CLICKHOUSE_SERVER_CONFIG_DIR: &str = - "/opt/oxide/clickhouse_server/config.d"; -const CLICKHOUSE_KEEPER_CONFIG_DIR: &str = "/opt/oxide/clickhouse_keeper"; const CLICKHOUSE_DATA_DIR: &str = "/data"; pub(crate) async fn deploy_nodes( diff --git a/openapi/clickhouse-admin-keeper.json b/openapi/clickhouse-admin-keeper.json index a9539f4c16b..5099e2fe595 100644 --- a/openapi/clickhouse-admin-keeper.json +++ b/openapi/clickhouse-admin-keeper.json @@ -104,7 +104,31 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/KeeperConfig" + "$ref": "#/components/schemas/GenerateConfigResult" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/generation": { + "get": { + "summary": "Retrieve the generation number of a configuration", + "operationId": "generation", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Generation" } } } @@ -240,6 +264,35 @@ "request_id" ] }, + "GenerateConfigResult": { + "description": "Result after generating a configuration file", + "oneOf": [ + { + "type": "object", + "properties": { + "replica": { + "$ref": "#/components/schemas/ReplicaConfig" + } + }, + "required": [ + "replica" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "keeper": { + "$ref": "#/components/schemas/KeeperConfig" + } + }, + "required": [ + "keeper" + ], + "additionalProperties": false + } + ] + }, "Generation": { "description": "Generation numbers stored in the database, used for 
optimistic concurrency control", "type": "integer", @@ -500,6 +553,14 @@ "type": "string", "format": "Utf8PathBuf" }, + "generation": { + "description": "A unique identifier for the configuration generation.", + "allOf": [ + { + "$ref": "#/components/schemas/Generation" + } + ] + }, "listen_host": { "description": "Address the keeper is listening on", "type": "string", @@ -549,6 +610,7 @@ "required": [ "coordination_settings", "datastore_path", + "generation", "listen_host", "log_storage_path", "logger", @@ -558,6 +620,20 @@ "tcp_port" ] }, + "KeeperConfigsForReplica": { + "type": "object", + "properties": { + "nodes": { + "type": "array", + "items": { + "$ref": "#/components/schemas/KeeperNodeConfig" + } + } + }, + "required": [ + "nodes" + ] + }, "KeeperConfigurableSettings": { "description": "The top most type for configuring clickhouse-servers via clickhouse-admin-keeper-api", "type": "object", @@ -613,6 +689,23 @@ "format": "uint64", "minimum": 0 }, + "KeeperNodeConfig": { + "type": "object", + "properties": { + "host": { + "$ref": "#/components/schemas/ClickhouseHost" + }, + "port": { + "type": "integer", + "format": "uint16", + "minimum": 0 + } + }, + "required": [ + "host", + "port" + ] + }, "KeeperServerInfo": { "type": "object", "properties": { @@ -815,6 +908,27 @@ "debug" ] }, + "Macros": { + "type": "object", + "properties": { + "cluster": { + "type": "string" + }, + "replica": { + "$ref": "#/components/schemas/ServerId" + }, + "shard": { + "type": "integer", + "format": "uint64", + "minimum": 0 + } + }, + "required": [ + "cluster", + "replica", + "shard" + ] + }, "RaftConfig": { "description": "Keeper raft configuration information", "type": "object", @@ -880,6 +994,137 @@ "required": [ "servers" ] + }, + "RemoteServers": { + "type": "object", + "properties": { + "cluster": { + "type": "string" + }, + "replicas": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ServerNodeConfig" + } + }, + "secret": { + "type": "string" + } + }, + "required": [ + "cluster", + "replicas", + "secret" + ] + }, + "ReplicaConfig": { + "description": "Configuration for a ClickHouse replica server", + "type": "object", + "properties": { + "data_path": { + "description": "Directory for all files generated by ClickHouse itself", + "type": "string", + "format": "Utf8PathBuf" + }, + "generation": { + "description": "A unique identifier for the configuration generation.", + "allOf": [ + { + "$ref": "#/components/schemas/Generation" + } + ] + }, + "http_port": { + "description": "Port for HTTP connections", + "type": "integer", + "format": "uint16", + "minimum": 0 + }, + "interserver_http_port": { + "description": "Port for interserver HTTP connections", + "type": "integer", + "format": "uint16", + "minimum": 0 + }, + "keepers": { + "description": "Contains settings that allow ClickHouse servers to interact with a Keeper cluster", + "allOf": [ + { + "$ref": "#/components/schemas/KeeperConfigsForReplica" + } + ] + }, + "listen_host": { + "description": "Address the server is listening on", + "type": "string", + "format": "ipv6" + }, + "logger": { + "description": "Logging settings", + "allOf": [ + { + "$ref": "#/components/schemas/LogConfig" + } + ] + }, + "macros": { + "description": "Parameter substitutions for replicated tables", + "allOf": [ + { + "$ref": "#/components/schemas/Macros" + } + ] + }, + "remote_servers": { + "description": "Configuration of clusters used by the Distributed table engine and bythe cluster table function", + "allOf": [ + { + "$ref": 
"#/components/schemas/RemoteServers" + } + ] + }, + "tcp_port": { + "description": "Port for TCP connections", + "type": "integer", + "format": "uint16", + "minimum": 0 + } + }, + "required": [ + "data_path", + "generation", + "http_port", + "interserver_http_port", + "keepers", + "listen_host", + "logger", + "macros", + "remote_servers", + "tcp_port" + ] + }, + "ServerId": { + "description": "A unique ID for a Clickhouse Server", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, + "ServerNodeConfig": { + "type": "object", + "properties": { + "host": { + "$ref": "#/components/schemas/ClickhouseHost" + }, + "port": { + "type": "integer", + "format": "uint16", + "minimum": 0 + } + }, + "required": [ + "host", + "port" + ] } }, "responses": { diff --git a/openapi/clickhouse-admin-server.json b/openapi/clickhouse-admin-server.json index 5da8a53838d..5b63767bd23 100644 --- a/openapi/clickhouse-admin-server.json +++ b/openapi/clickhouse-admin-server.json @@ -31,7 +31,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ReplicaConfig" + "$ref": "#/components/schemas/GenerateConfigResult" } } } @@ -74,6 +74,30 @@ } } }, + "/generation": { + "get": { + "summary": "Retrieve the generation number of a configuration", + "operationId": "generation", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Generation" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/init": { "put": { "summary": "Idempotently initialize a replicated ClickHouse cluster database.", @@ -331,12 +355,125 @@ "request_id" ] }, + "GenerateConfigResult": { + "description": "Result after generating a configuration file", + "oneOf": [ + { + "type": "object", + "properties": { + "replica": { + "$ref": "#/components/schemas/ReplicaConfig" + } + }, + "required": [ + "replica" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "keeper": { + "$ref": "#/components/schemas/KeeperConfig" + } + }, + "required": [ + "keeper" + ], + "additionalProperties": false + } + ] + }, "Generation": { "description": "Generation numbers stored in the database, used for optimistic concurrency control", "type": "integer", "format": "uint64", "minimum": 0 }, + "KeeperConfig": { + "description": "Configuration for a ClickHouse keeper", + "type": "object", + "properties": { + "coordination_settings": { + "description": "Internal coordination settings", + "allOf": [ + { + "$ref": "#/components/schemas/KeeperCoordinationSettings" + } + ] + }, + "datastore_path": { + "description": "Directory for all files generated by ClickHouse itself", + "type": "string", + "format": "Utf8PathBuf" + }, + "generation": { + "description": "A unique identifier for the configuration generation.", + "allOf": [ + { + "$ref": "#/components/schemas/Generation" + } + ] + }, + "listen_host": { + "description": "Address the keeper is listening on", + "type": "string", + "format": "ipv6" + }, + "log_storage_path": { + "description": "Directory for coordination logs", + "type": "string", + "format": "Utf8PathBuf" + }, + "logger": { + "description": "Logging settings", + "allOf": [ + { + "$ref": "#/components/schemas/LogConfig" + } + ] + }, + "raft_config": { + "description": "Settings for each server in the keeper cluster", + "allOf": [ + { + "$ref": "#/components/schemas/RaftServers" + } + ] + }, + "server_id": { + 
"description": "Unique ID for this keeper node", + "allOf": [ + { + "$ref": "#/components/schemas/KeeperId" + } + ] + }, + "snapshot_storage_path": { + "description": "Directory for coordination snapshot storage", + "type": "string", + "format": "Utf8PathBuf" + }, + "tcp_port": { + "description": "Port for TCP connections", + "type": "integer", + "format": "uint16", + "minimum": 0 + } + }, + "required": [ + "coordination_settings", + "datastore_path", + "generation", + "listen_host", + "log_storage_path", + "logger", + "raft_config", + "server_id", + "snapshot_storage_path", + "tcp_port" + ] + }, "KeeperConfigsForReplica": { "type": "object", "properties": { @@ -351,6 +488,35 @@ "nodes" ] }, + "KeeperCoordinationSettings": { + "type": "object", + "properties": { + "operation_timeout_ms": { + "type": "integer", + "format": "uint32", + "minimum": 0 + }, + "raft_logs_level": { + "$ref": "#/components/schemas/LogLevel" + }, + "session_timeout_ms": { + "type": "integer", + "format": "uint32", + "minimum": 0 + } + }, + "required": [ + "operation_timeout_ms", + "raft_logs_level", + "session_timeout_ms" + ] + }, + "KeeperId": { + "description": "A unique ID for a ClickHouse Keeper", + "type": "integer", + "format": "uint64", + "minimum": 0 + }, "KeeperNodeConfig": { "type": "object", "properties": { @@ -429,6 +595,41 @@ "shard" ] }, + "RaftServerConfig": { + "type": "object", + "properties": { + "hostname": { + "$ref": "#/components/schemas/ClickhouseHost" + }, + "id": { + "$ref": "#/components/schemas/KeeperId" + }, + "port": { + "type": "integer", + "format": "uint16", + "minimum": 0 + } + }, + "required": [ + "hostname", + "id", + "port" + ] + }, + "RaftServers": { + "type": "object", + "properties": { + "servers": { + "type": "array", + "items": { + "$ref": "#/components/schemas/RaftServerConfig" + } + } + }, + "required": [ + "servers" + ] + }, "RemoteServers": { "type": "object", "properties": { @@ -460,6 +661,14 @@ "type": "string", "format": "Utf8PathBuf" }, + "generation": { + "description": "A unique identifier for the configuration generation.", + "allOf": [ + { + "$ref": "#/components/schemas/Generation" + } + ] + }, "http_port": { "description": "Port for HTTP connections", "type": "integer", @@ -518,6 +727,7 @@ }, "required": [ "data_path", + "generation", "http_port", "interserver_http_port", "keepers", diff --git a/package-manifest.toml b/package-manifest.toml index 918ea4ed96c..2afe01b8d7c 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -210,7 +210,6 @@ source.type = "local" source.paths = [ { from = "out/clickhouse", to = "/opt/oxide/clickhouse_server" }, { from = "smf/clickhouse_server/manifest.xml", to = "/var/svc/manifest/site/clickhouse_server/manifest.xml" }, - { from = "smf/clickhouse_server/method_script.sh", to = "/opt/oxide/lib/svc/manifest/clickhouse_server.sh" }, { from = "smf/clickhouse-admin-server", to = "/var/svc/manifest/site/clickhouse-admin-server" }, ] output.type = "zone" @@ -240,7 +239,6 @@ source.type = "local" source.paths = [ { from = "out/clickhouse", to = "/opt/oxide/clickhouse_keeper" }, { from = "smf/clickhouse_keeper/manifest.xml", to = "/var/svc/manifest/site/clickhouse_keeper/manifest.xml" }, - { from = "smf/clickhouse_keeper/method_script.sh", to = "/opt/oxide/lib/svc/manifest/clickhouse_keeper.sh" }, { from = "smf/clickhouse-admin-keeper", to = "/var/svc/manifest/site/clickhouse-admin-keeper" }, ] output.type = "zone" diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index fa437cb6ba3..ffef7c6597e 100644 --- 
a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -22,6 +22,7 @@ cancel-safe-futures.workspace = true cfg-if.workspace = true chrono.workspace = true clap.workspace = true +clickhouse-admin-types.workspace = true # Only used by the simulated sled agent. crucible-agent-client.workspace = true derive_more.workspace = true diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index bac21cbe504..4eba7e13c2e 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -37,6 +37,10 @@ use crate::zone_bundle::BundleError; use crate::zone_bundle::ZoneBundler; use anyhow::anyhow; use camino::{Utf8Path, Utf8PathBuf}; +use clickhouse_admin_types::CLICKHOUSE_KEEPER_CONFIG_DIR; +use clickhouse_admin_types::CLICKHOUSE_KEEPER_CONFIG_FILE; +use clickhouse_admin_types::CLICKHOUSE_SERVER_CONFIG_DIR; +use clickhouse_admin_types::CLICKHOUSE_SERVER_CONFIG_FILE; use dpd_client::{types as DpdTypes, Client as DpdClient, Error as DpdError}; use dropshot::HandlerTaskMode; use illumos_utils::addrobj::AddrObject; @@ -1641,10 +1645,19 @@ impl ServiceManager { let dns_service = Self::dns_install(info, None, None).await?; + let clickhouse_server_config = + PropertyGroupBuilder::new("config") + .add_property( + "config_path", + "astring", + format!("{CLICKHOUSE_SERVER_CONFIG_DIR}/{CLICKHOUSE_SERVER_CONFIG_FILE}"), + ); let disabled_clickhouse_server_service = ServiceBuilder::new("oxide/clickhouse_server") .add_instance( - ServiceInstanceBuilder::new("default").disable(), + ServiceInstanceBuilder::new("default") + .disable() + .add_property_group(clickhouse_server_config), ); // We shouldn't need to hardcode a port here: @@ -1722,10 +1735,19 @@ impl ServiceManager { let dns_service = Self::dns_install(info, None, None).await?; + let clickhouse_keeper_config = + PropertyGroupBuilder::new("config") + .add_property( + "config_path", + "astring", + format!("{CLICKHOUSE_KEEPER_CONFIG_DIR}/{CLICKHOUSE_KEEPER_CONFIG_FILE}"), + ); let disaled_clickhouse_keeper_service = ServiceBuilder::new("oxide/clickhouse_keeper") .add_instance( - ServiceInstanceBuilder::new("default").disable(), + ServiceInstanceBuilder::new("default") + .disable() + .add_property_group(clickhouse_keeper_config), ); // We shouldn't need to hardcode a port here: diff --git a/smf/clickhouse_keeper/manifest.xml b/smf/clickhouse_keeper/manifest.xml index 8af4dba3f20..775f40b0be7 100644 --- a/smf/clickhouse_keeper/manifest.xml +++ b/smf/clickhouse_keeper/manifest.xml @@ -22,10 +22,14 @@ + + + + diff --git a/smf/clickhouse_keeper/method_script.sh b/smf/clickhouse_keeper/method_script.sh deleted file mode 100755 index cf8a5d629cf..00000000000 --- a/smf/clickhouse_keeper/method_script.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -set -x -set -o errexit -set -o pipefail - -. /lib/svc/share/smf_include.sh - -exec /opt/oxide/clickhouse_keeper/clickhouse keeper --config /opt/oxide/clickhouse_keeper/keeper_config.xml & diff --git a/smf/clickhouse_server/manifest.xml b/smf/clickhouse_server/manifest.xml index c0eff8aae31..12ba074ce58 100644 --- a/smf/clickhouse_server/manifest.xml +++ b/smf/clickhouse_server/manifest.xml @@ -22,10 +22,14 @@ + + + + diff --git a/smf/clickhouse_server/method_script.sh b/smf/clickhouse_server/method_script.sh deleted file mode 100755 index 46ed0bbdacc..00000000000 --- a/smf/clickhouse_server/method_script.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -set -x -set -o errexit -set -o pipefail - -. 
/lib/svc/share/smf_include.sh - -exec /opt/oxide/clickhouse_server/clickhouse server --config /opt/oxide/clickhouse_server/config.d/replica-server-config.xml & From b1b3dc1516cd3dbff6b43bfd6bc0f2cb638bcf84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karen=20C=C3=A1rcamo?= Date: Mon, 3 Feb 2025 22:53:47 -0800 Subject: [PATCH 7/8] Fix borked CI (#7473) I I broke things here https://github.com/oxidecomputer/omicron/pull/7347 --- clickhouse-admin/src/context.rs | 5 ++--- clickhouse-admin/src/http_entrypoints.rs | 10 +++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/clickhouse-admin/src/context.rs b/clickhouse-admin/src/context.rs index aca948614c9..96f5a18f07a 100644 --- a/clickhouse-admin/src/context.rs +++ b/clickhouse-admin/src/context.rs @@ -12,9 +12,8 @@ use clickhouse_admin_types::{ CLICKHOUSE_KEEPER_CONFIG_FILE, CLICKHOUSE_SERVER_CONFIG_DIR, CLICKHOUSE_SERVER_CONFIG_FILE, }; -use dropshot::HttpError; +use dropshot::{ClientErrorStatusCode, HttpError}; use flume::{Receiver, Sender, TrySendError}; -use http::StatusCode; use illumos_utils::svcadm::Svcadm; use omicron_common::address::CLICKHOUSE_TCP_PORT; use omicron_common::api::external::Generation; @@ -340,7 +339,7 @@ fn generate_config_and_enable_svc( if current > incoming_generation { return Err(HttpError::for_client_error( Some(String::from("Conflict")), - StatusCode::CONFLICT, + ClientErrorStatusCode::CONFLICT, format!( "current generation '{}' is greater than incoming generation '{}'", current, diff --git a/clickhouse-admin/src/http_entrypoints.rs b/clickhouse-admin/src/http_entrypoints.rs index 754266d4021..15d025c17bc 100644 --- a/clickhouse-admin/src/http_entrypoints.rs +++ b/clickhouse-admin/src/http_entrypoints.rs @@ -11,10 +11,10 @@ use clickhouse_admin_types::{ SystemTimeSeriesSettings, TimeSeriesSettingsQuery, }; use dropshot::{ - ApiDescription, HttpError, HttpResponseCreated, HttpResponseOk, - HttpResponseUpdatedNoContent, Path, Query, RequestContext, TypedBody, + ApiDescription, ClientErrorStatusCode, HttpError, HttpResponseCreated, + HttpResponseOk, HttpResponseUpdatedNoContent, Path, Query, RequestContext, + TypedBody, }; -use http::StatusCode; use omicron_common::api::external::Generation; use std::sync::Arc; @@ -59,7 +59,7 @@ impl ClickhouseAdminServerApi for ClickhouseAdminServerImpl { None => { return Err(HttpError::for_client_error( Some(String::from("ObjectNotFound")), - StatusCode::NOT_FOUND, + ClientErrorStatusCode::NOT_FOUND, "no generation number found".to_string(), )) } @@ -125,7 +125,7 @@ impl ClickhouseAdminKeeperApi for ClickhouseAdminKeeperImpl { None => { return Err(HttpError::for_client_error( Some(String::from("ObjectNotFound")), - StatusCode::NOT_FOUND, + ClientErrorStatusCode::NOT_FOUND, "no generation number found".to_string(), )) } From a997c826ed6bf0ac5c96ebdc38ebf6cb4d6d09cf Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 4 Feb 2025 14:32:15 -0500 Subject: [PATCH 8/8] [oximeter] Fix self_stat count test flakes (#7456) #7262 tried to fix the off-by-one test flake #7255, but it's still around. I think based on discussion on that PR (https://github.com/oxidecomputer/omicron/pull/7262/files#r1890548730), the assertion in it was adding one to the wrong side: it's possible the fake server may have seen one more request than the agent has finished noting, which means `server_count` may be equal to either `count` or `count + 1`, not `count` or `count - 1`. 
The change from `assert_eq` to `assert` also caused the new flakes to not log the actual values; this puts them back into the assertion message in case this fix is still not quite right. --- oximeter/collector/src/agent.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/oximeter/collector/src/agent.rs b/oximeter/collector/src/agent.rs index 7e28831fa0e..cb1eccc5213 100644 --- a/oximeter/collector/src/agent.rs +++ b/oximeter/collector/src/agent.rs @@ -747,10 +747,10 @@ mod tests { assert!(count != 0); let server_count = collection_count.load(Ordering::SeqCst); assert!( - count == server_count || count - 1 == server_count, + count == server_count || count + 1 == server_count, "number of collections reported by the collection \ - task differs from the number reported by the empty \ - producer server itself" + task ({count}) differs from the number reported by the empty \ + producer server itself ({server_count})" ); assert!(stats.failed_collections.is_empty()); logctx.cleanup_successful(); @@ -901,10 +901,10 @@ mod tests { // server. let server_count = collection_count.load(Ordering::SeqCst); assert!( - count == server_count || count - 1 == server_count, + count == server_count || count + 1 == server_count, "number of collections reported by the collection \ - task differs from the number reported by the always-ded \ - producer server itself" + task ({count}) differs from the number reported by the always-ded \ + producer server itself ({server_count})" ); assert_eq!(stats.failed_collections.len(), 1); logctx.cleanup_successful();
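Aside: the assertion change in this patch encodes a one-directional tolerance. As the message above explains, the fake producer server can have seen one request that the collection task has not yet finished recording, but never the other way around, so `server_count` may equal `count` or `count + 1`. A minimal, self-contained Rust sketch of that shape (illustrative names only, not code from this patch):

    // Illustrative helper; `task_count` stands in for `count` in the tests
    // and `server_count` for the fake producer server's request counter.
    fn counts_agree(task_count: u64, server_count: u64) -> bool {
        // The server may have served one request that the collection task
        // has not yet recorded, so it can lead by at most one.
        server_count == task_count || server_count == task_count + 1
    }

    fn main() {
        assert!(counts_agree(5, 5));  // task and server fully in sync
        assert!(counts_agree(5, 6));  // server saw one extra request
        assert!(!counts_agree(5, 4)); // a lagging server would be a real bug
        println!("off-by-one tolerance holds in the expected direction");
    }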