Expunge and Decommission disks in planner #7286

Merged
merged 38 commits on Feb 22, 2025

Changes from 22 commits

Commits (38)
900319b
WIP: Expunge and Decommission disks in planner
andrewjstone Dec 10, 2024
a78777c
Merge branch 'main' into expunge-and-decommission-disks-in-planner
andrewjstone Feb 5, 2025
34e1215
planner tests run again
andrewjstone Feb 5, 2025
31f19c6
Filter disks for blippy
andrewjstone Feb 6, 2025
14e3d35
Show disposition and state for disks tables
andrewjstone Feb 6, 2025
af64c4c
update some expected disk output
andrewjstone Feb 6, 2025
5f46bd1
Merge branch 'main' into expunge-and-decommission-disks-in-planner
andrewjstone Feb 6, 2025
8af63fc
fix build
andrewjstone Feb 6, 2025
953657e
Fix planner tests
andrewjstone Feb 6, 2025
5178c08
allow execution steps to fail - still need to fix expunge/decommissio…
andrewjstone Feb 7, 2025
fb4b45f
Merge branch 'main' into expunge-and-decommission-disks-in-planner
andrewjstone Feb 7, 2025
89596c1
Decommission disks based on blueprint
andrewjstone Feb 7, 2025
f479f29
decommission sleds properly
andrewjstone Feb 7, 2025
e1b7fb1
remove stale code
andrewjstone Feb 7, 2025
9825039
fix planner bug
andrewjstone Feb 7, 2025
729723c
planner tests pass again
andrewjstone Feb 7, 2025
816a5e4
schema metadata
andrewjstone Feb 8, 2025
22f9f63
Merge branch 'main' into expunge-and-decommission-disks-in-planner
andrewjstone Feb 10, 2025
de4a080
review fixes
andrewjstone Feb 10, 2025
1822a2b
Merge branch 'main' into expunge-and-decommission-disks-in-planner
andrewjstone Feb 20, 2025
7be692d
Use expunged disposition fields rather than state
andrewjstone Feb 20, 2025
01cbf78
self review fixes
andrewjstone Feb 20, 2025
e2e2cd5
schema migration
andrewjstone Feb 20, 2025
a718ce4
Merge branch 'main' into expunge-and-decommission-disks-in-planner
andrewjstone Feb 20, 2025
7d67be7
expectorate
andrewjstone Feb 20, 2025
9ca1a2d
Make `physical_disk_decommission` idempotent
andrewjstone Feb 20, 2025
55a65f8
review fixes
andrewjstone Feb 20, 2025
b209435
Merge branch 'main' into expunge-and-decommission-disks-in-planner
andrewjstone Feb 20, 2025
2531105
Review fixes
andrewjstone Feb 21, 2025
4bb5048
fix tests
andrewjstone Feb 21, 2025
f256467
Fix disk diffs
andrewjstone Feb 21, 2025
a65ac3b
fix typo
andrewjstone Feb 21, 2025
c9b4441
remove a4x2 detritus
andrewjstone Feb 21, 2025
a46493a
sort modified disks in diffs
andrewjstone Feb 21, 2025
ec69035
Merge branch 'main' into expunge-and-decommission-disks-in-planner
andrewjstone Feb 21, 2025
7ab7957
cargo fmt
andrewjstone Feb 21, 2025
1b5a681
clippy
andrewjstone Feb 21, 2025
7999177
Merge branch 'main' into expunge-and-decommission-disks-in-planner
andrewjstone Feb 22, 2025
107 changes: 80 additions & 27 deletions nexus/db-model/src/deployment.rs
@@ -167,30 +167,66 @@ impl_enum_type!(
Expunged => b"expunged"
);

/// Converts a [`BlueprintPhysicalDiskDisposition`] to a version that can be inserted
/// into a database.
pub fn to_db_bp_physical_disk_disposition(
disposition: BlueprintPhysicalDiskDisposition,
) -> DbBpPhysicalDiskDisposition {
match disposition {
BlueprintPhysicalDiskDisposition::InService => {
DbBpPhysicalDiskDisposition::InService
}
BlueprintPhysicalDiskDisposition::Expunged => {
DbBpPhysicalDiskDisposition::Expunged
struct DbBpPhysicalDiskDispositionColumns {
disposition: DbBpPhysicalDiskDisposition,
expunged_as_of_generation: Option<Generation>,
expunged_ready_for_cleanup: bool,
}

impl From<BlueprintPhysicalDiskDisposition>
for DbBpPhysicalDiskDispositionColumns
{
fn from(value: BlueprintPhysicalDiskDisposition) -> Self {
let (
disposition,
disposition_expunged_as_of_generation,
disposition_expunged_ready_for_cleanup,
) = match value {
BlueprintPhysicalDiskDisposition::InService => {
(DbBpPhysicalDiskDisposition::InService, None, false)
}
BlueprintPhysicalDiskDisposition::Expunged {
as_of_generation,
ready_for_cleanup,
} => (
DbBpPhysicalDiskDisposition::Expunged,
Some(Generation(as_of_generation)),
ready_for_cleanup,
),
};
Self {
disposition,
expunged_as_of_generation: disposition_expunged_as_of_generation,
expunged_ready_for_cleanup: disposition_expunged_ready_for_cleanup,
}
}
}

impl From<DbBpPhysicalDiskDisposition> for BlueprintPhysicalDiskDisposition {
fn from(disposition: DbBpPhysicalDiskDisposition) -> Self {
match disposition {
DbBpPhysicalDiskDisposition::InService => {
BlueprintPhysicalDiskDisposition::InService
impl TryFrom<DbBpPhysicalDiskDispositionColumns>
for BlueprintPhysicalDiskDisposition
{
type Error = anyhow::Error;

fn try_from(
value: DbBpPhysicalDiskDispositionColumns,
) -> Result<Self, Self::Error> {
match (value.disposition, value.expunged_as_of_generation) {
(DbBpPhysicalDiskDisposition::InService, None) => {
Ok(Self::InService)
}
DbBpPhysicalDiskDisposition::Expunged => {
BlueprintPhysicalDiskDisposition::Expunged
(DbBpPhysicalDiskDisposition::Expunged, Some(as_of_generation)) => {
Ok(Self::Expunged {
as_of_generation: *as_of_generation,
ready_for_cleanup: value.expunged_ready_for_cleanup,
})
}
(DbBpPhysicalDiskDisposition::InService, Some(_))
| (DbBpPhysicalDiskDisposition::Expunged, None) => Err(anyhow!(
"illegal database state (CHECK constraint broken?!): \
disposition {:?}, disposition_expunged_as_of_generation {:?}",
value.disposition,
value.expunged_as_of_generation,
)),
}
}
}
@@ -232,7 +268,9 @@ pub struct BpOmicronPhysicalDisk {
pub id: DbTypedUuid<PhysicalDiskKind>,
pub pool_id: Uuid,

pub disposition: DbBpPhysicalDiskDisposition,
disposition: DbBpPhysicalDiskDisposition,
disposition_expunged_as_of_generation: Option<Generation>,
disposition_expunged_ready_for_cleanup: bool,
}

impl BpOmicronPhysicalDisk {
@@ -241,6 +279,11 @@ impl BpOmicronPhysicalDisk {
sled_id: SledUuid,
disk_config: &BlueprintPhysicalDiskConfig,
) -> Self {
let DbBpPhysicalDiskDispositionColumns {
disposition,
expunged_as_of_generation: disposition_expunged_as_of_generation,
expunged_ready_for_cleanup: disposition_expunged_ready_for_cleanup,
} = disk_config.disposition.into();
Self {
blueprint_id: blueprint_id.into(),
sled_id: sled_id.into(),
@@ -249,25 +292,35 @@ impl BpOmicronPhysicalDisk {
model: disk_config.identity.model.clone(),
id: disk_config.id.into(),
pool_id: disk_config.pool_id.into_untyped_uuid(),
disposition: to_db_bp_physical_disk_disposition(
disk_config.disposition,
),
disposition,
disposition_expunged_as_of_generation,
disposition_expunged_ready_for_cleanup,
}
}
}

impl From<BpOmicronPhysicalDisk> for BlueprintPhysicalDiskConfig {
fn from(disk: BpOmicronPhysicalDisk) -> Self {
Self {
disposition: disk.disposition.into(),
impl TryFrom<BpOmicronPhysicalDisk> for BlueprintPhysicalDiskConfig {
type Error = anyhow::Error;

fn try_from(disk: BpOmicronPhysicalDisk) -> Result<Self, Self::Error> {
let disposition_cols = DbBpPhysicalDiskDispositionColumns {
disposition: disk.disposition,
expunged_as_of_generation: disk
.disposition_expunged_as_of_generation,
expunged_ready_for_cleanup: disk
.disposition_expunged_ready_for_cleanup,
};

Ok(Self {
disposition: disposition_cols.try_into()?,
identity: DiskIdentity {
vendor: disk.vendor,
serial: disk.serial,
model: disk.model,
},
id: disk.id.into(),
pool_id: ZpoolUuid::from_untyped_uuid(disk.pool_id),
}
})
}
}

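The conversions above split a single Rust enum across three database columns, and the `TryFrom` direction re-validates that those columns agree with each other. A minimal round-trip sketch of that behavior follows; it uses the types exactly as they appear in this diff, but the test module itself, the `Generation::new()` constructor, and `PartialEq` on the disposition type are assumptions for illustration, not part of the change.

```rust
#[cfg(test)]
mod disposition_roundtrip_sketch {
    // Assumes this module lives in the same file as the (private)
    // `DbBpPhysicalDiskDispositionColumns` helper defined above.
    use super::*;
    use nexus_types::deployment::BlueprintPhysicalDiskDisposition;
    use omicron_common::api::external::Generation as ExternalGeneration;

    #[test]
    fn expunged_disposition_round_trips() {
        // Build the same value twice to avoid assuming the type is `Copy`.
        let make = || BlueprintPhysicalDiskDisposition::Expunged {
            as_of_generation: ExternalGeneration::new(),
            ready_for_cleanup: true,
        };

        // Forward direction: one enum value becomes three column values.
        let cols = DbBpPhysicalDiskDispositionColumns::from(make());
        assert!(cols.expunged_as_of_generation.is_some());
        assert!(cols.expunged_ready_for_cleanup);

        // Reverse direction: `try_from` fails only if the columns are
        // mutually inconsistent (e.g. `Expunged` with a NULL generation).
        let parsed = BlueprintPhysicalDiskDisposition::try_from(cols)
            .expect("columns are internally consistent");
        assert_eq!(parsed, make());
    }
}
```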
2 changes: 2 additions & 0 deletions nexus/db-model/src/schema.rs
@@ -1686,6 +1686,8 @@ table! {
pool_id -> Uuid,

disposition -> crate::DbBpPhysicalDiskDispositionEnum,
disposition_expunged_as_of_generation -> Nullable<Int8>,
disposition_expunged_ready_for_cleanup -> Bool,
}
}

3 changes: 2 additions & 1 deletion nexus/db-model/src/schema_versions.rs
@@ -17,7 +17,7 @@ use std::collections::BTreeMap;
///
/// This must be updated when you change the database schema. Refer to
/// schema/crdb/README.adoc in the root of this repository for details.
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(124, 0, 0);
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(125, 0, 0);

/// List of all past database schema versions, in *reverse* order
///
@@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy<Vec<KnownVersion>> = Lazy::new(|| {
// | leaving the first copy as an example for the next person.
// v
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
KnownVersion::new(125, "bp-physical-disk-state"),
KnownVersion::new(124, "support-read-only-region-replacement"),
KnownVersion::new(123, "vpc-subnet-contention"),
KnownVersion::new(122, "tuf-artifact-replication"),
13 changes: 10 additions & 3 deletions nexus/db-queries/src/db/datastore/deployment.rs
@@ -839,7 +839,13 @@ impl DataStore {
d.id, d.sled_id
))
})?;
sled_disks.disks.insert(d.into());
let disk_id = d.id;
sled_disks.disks.insert(d.try_into().map_err(|e| {
Error::internal_error(&format!(
"Cannot convert BpOmicronPhysicalDisk {}: {e}",
disk_id
))
})?);
}
}
}
@@ -2346,12 +2352,12 @@ mod tests {
assert_eq!(
EnsureMultiple::from(
builder
.sled_ensure_disks(
.sled_add_disks(
new_sled_id,
&planning_input
.sled_lookup(SledFilter::Commissioned, new_sled_id)
.unwrap()
.resources,
.resources
)
.unwrap()
.disks
@@ -2360,6 +2366,7 @@
added: 4,
updated: 0,
expunged: 0,
decommissioned: 0,
removed: 0
}
);
21 changes: 14 additions & 7 deletions nexus/db-queries/src/db/datastore/physical_disk.rs
@@ -154,12 +154,13 @@ impl DataStore {
) -> Result<(), Error> {
opctx.authorize(authz::Action::Read, &authz::FLEET).await?;
use db::schema::physical_disk::dsl;
let now = Utc::now();

diesel::update(
dsl::physical_disk.filter(dsl::id.eq(to_db_typed_uuid(id))),
)
.filter(dsl::time_deleted.is_null())
.set(dsl::disk_policy.eq(policy))
.set((dsl::disk_policy.eq(policy), dsl::time_modified.eq(now)))
.execute_async(&*self.pool_connection_authorized(&opctx).await?)
.await
.map_err(|err| public_error_from_diesel(err, ErrorHandler::Server))?;
@@ -174,12 +175,13 @@
) -> Result<(), Error> {
opctx.authorize(authz::Action::Read, &authz::FLEET).await?;
use db::schema::physical_disk::dsl;
let now = Utc::now();

diesel::update(
dsl::physical_disk.filter(dsl::id.eq(to_db_typed_uuid(id))),
)
.filter(dsl::time_deleted.is_null())
.set(dsl::disk_state.eq(state))
.set((dsl::disk_state.eq(state), dsl::time_modified.eq(now)))
.execute_async(&*self.pool_connection_authorized(&opctx).await?)
.await
.map_err(|err| public_error_from_diesel(err, ErrorHandler::Server))?;
@@ -281,19 +283,24 @@ impl DataStore {
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
}

/// Decommissions all expunged disks.
pub async fn physical_disk_decommission_all_expunged(
/// Decommissions a single expunged disk.
pub async fn physical_disk_decommission(
&self,
opctx: &OpContext,
id: PhysicalDiskUuid,
) -> Result<(), Error> {
opctx.authorize(authz::Action::Modify, &authz::FLEET).await?;
use db::schema::physical_disk::dsl;

let now = Utc::now();
let conn = &*self.pool_connection_authorized(&opctx).await?;
diesel::update(dsl::physical_disk)
.filter(dsl::id.eq(to_db_typed_uuid(id)))
.filter(dsl::time_deleted.is_null())
.physical_disk_filter(DiskFilter::ExpungedButActive)
.set(dsl::disk_state.eq(PhysicalDiskState::Decommissioned))
.filter(dsl::disk_policy.eq(PhysicalDiskPolicy::Expunged))
.set((
dsl::disk_state.eq(PhysicalDiskState::Decommissioned),
dsl::time_modified.eq(now),
))
.execute_async(conn)
.await
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
Reviewer (Contributor):

Two questions:

  • Will this keep moving time_modified if called on one disk multiple times? (Do we need to filter on the disk_state too?)
  • Do any callers care whether the disk was actually marked decommissioned? (If this query updates no rows, I think it will still return Ok(()), right?) If not, maybe note in the doc comment that this is a no-op if the disk is already decommissioned?

Reviewer (Contributor):

Bump on these questions

@andrewjstone (Author), Feb 20, 2025:

Ooh, great catch. Sorry for not replying earlier. You are correct here. I added a filter on the disk_state as well in 9ca1a2d, which should prevent the time_modified bump.

I also went ahead and added a comment, since callers should not care whether the disk was actually decommissioned.

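As noted in the thread above, commit 9ca1a2d resolved the first question by also filtering on the disk's state. The sketch below shows roughly what the final method looks like with that guard; it is an approximation assembled from this diff plus the review reply, not the verbatim code from 9ca1a2d, and it assumes the imports and helpers already used in this file (`dsl`, `to_db_typed_uuid`, `public_error_from_diesel`, and the `PhysicalDiskPolicy`/`PhysicalDiskState` enums).

```rust
/// Decommissions a single expunged disk.
///
/// This is a no-op (and does not bump `time_modified`) if the disk is
/// already decommissioned, so callers may retry it freely.
pub async fn physical_disk_decommission(
    &self,
    opctx: &OpContext,
    id: PhysicalDiskUuid,
) -> Result<(), Error> {
    opctx.authorize(authz::Action::Modify, &authz::FLEET).await?;
    use db::schema::physical_disk::dsl;

    let now = Utc::now();
    let conn = &*self.pool_connection_authorized(&opctx).await?;
    diesel::update(dsl::physical_disk)
        .filter(dsl::id.eq(to_db_typed_uuid(id)))
        .filter(dsl::time_deleted.is_null())
        .filter(dsl::disk_policy.eq(PhysicalDiskPolicy::Expunged))
        // The guard added after review: only disks still `Active` match,
        // so repeated calls on an already-decommissioned disk update zero
        // rows and leave `time_modified` alone.
        .filter(dsl::disk_state.eq(PhysicalDiskState::Active))
        .set((
            dsl::disk_state.eq(PhysicalDiskState::Decommissioned),
            dsl::time_modified.eq(now),
        ))
        .execute_async(conn)
        .await
        .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
    Ok(())
}
```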
2 changes: 1 addition & 1 deletion nexus/db-queries/src/db/datastore/vpc.rs
@@ -3198,7 +3198,7 @@ mod tests {
.expect("created blueprint builder");
for &sled_id in &sled_ids {
builder
.sled_ensure_disks(
.sled_add_disks(
sled_id,
&planning_input
.sled_lookup(SledFilter::InService, sled_id)
73 changes: 36 additions & 37 deletions nexus/reconfigurator/blippy/src/checks.rs
@@ -10,6 +10,7 @@ use nexus_sled_agent_shared::inventory::ZoneKind;
use nexus_types::deployment::blueprint_zone_type;
use nexus_types::deployment::BlueprintDatasetConfig;
use nexus_types::deployment::BlueprintDatasetFilter;
use nexus_types::deployment::BlueprintPhysicalDiskDisposition;
use nexus_types::deployment::BlueprintZoneConfig;
use nexus_types::deployment::BlueprintZoneFilter;
use nexus_types::deployment::BlueprintZoneType;
@@ -461,11 +462,13 @@ fn check_datasets(blippy: &mut Blippy<'_>) {
let mut expected_datasets = BTreeSet::new();

// All disks should have debug and zone root datasets.
//
// TODO-correctness We currently only include in-service disks in the
// blueprint; once we include expunged or decommissioned disks too, we
// should filter here to only in-service.
for (&sled_id, disk_config) in &blippy.blueprint().blueprint_disks {
for (sled_id, disk) in blippy
.blueprint()
.all_omicron_disks(BlueprintPhysicalDiskDisposition::is_in_service)
{
// Note: This may be called multiple times per `sled_id`,
// which is somewhat inefficient. However it will still only report
// one error note per `sled_id`.
let Some(sled_datasets) = datasets.get_sled_or_note_missing(
blippy,
sled_id,
@@ -474,41 +477,37 @@
continue;
};

for disk in &disk_config.disks {
let sled_datasets = sled_datasets.get(&disk.pool_id);
let sled_datasets = sled_datasets.get(&disk.pool_id);

match sled_datasets
.and_then(|by_zpool| by_zpool.get(&DatasetKind::Debug))
{
Some(dataset) => {
expected_datasets.insert(dataset.id);
}
None => {
blippy.push_sled_note(
sled_id,
Severity::Fatal,
SledKind::ZpoolMissingDebugDataset {
zpool: disk.pool_id,
},
);
}
match sled_datasets
.and_then(|by_zpool| by_zpool.get(&DatasetKind::Debug))
{
Some(dataset) => {
expected_datasets.insert(dataset.id);
}
None => {
blippy.push_sled_note(
sled_id,
Severity::Fatal,
SledKind::ZpoolMissingDebugDataset { zpool: disk.pool_id },
);
}
}

match sled_datasets.and_then(|by_zpool| {
by_zpool.get(&DatasetKind::TransientZoneRoot)
}) {
Some(dataset) => {
expected_datasets.insert(dataset.id);
}
None => {
blippy.push_sled_note(
sled_id,
Severity::Fatal,
SledKind::ZpoolMissingZoneRootDataset {
zpool: disk.pool_id,
},
);
}
match sled_datasets
.and_then(|by_zpool| by_zpool.get(&DatasetKind::TransientZoneRoot))
{
Some(dataset) => {
expected_datasets.insert(dataset.id);
}
None => {
blippy.push_sled_note(
sled_id,
Severity::Fatal,
SledKind::ZpoolMissingZoneRootDataset {
zpool: disk.pool_id,
},
);
}
}
}
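For readers less familiar with the blueprint API: the blippy change above swaps a per-sled walk of `blueprint_disks` for the blueprint-wide `all_omicron_disks` iterator, filtered by a disposition predicate, so expunged disks no longer trigger the missing-dataset checks. The self-contained toy below illustrates only that filtering pattern; `Disposition`, `Disk`, and `all_disks_matching` are stand-ins, not the real `nexus_types` definitions or the real `Blueprint::all_omicron_disks` signature.

```rust
// Toy stand-ins; not the real nexus_types definitions.
#[derive(Clone, Copy, PartialEq, Debug)]
enum Disposition {
    InService,
    Expunged,
}

impl Disposition {
    fn is_in_service(self) -> bool {
        matches!(self, Disposition::InService)
    }
}

struct Disk {
    pool_id: u32,
    disposition: Disposition,
}

/// Yields every (sled, disk) pair whose disposition passes `filter`,
/// mirroring how `Blueprint::all_omicron_disks` is used in the check above.
fn all_disks_matching<'a>(
    disks_by_sled: &'a [(u32, Vec<Disk>)],
    filter: fn(Disposition) -> bool,
) -> impl Iterator<Item = (u32, &'a Disk)> + 'a {
    disks_by_sled.iter().flat_map(move |(sled_id, disks)| {
        disks
            .iter()
            .filter(move |disk| filter(disk.disposition))
            .map(move |disk| (*sled_id, disk))
    })
}

fn main() {
    let sleds = vec![(
        1,
        vec![
            Disk { pool_id: 10, disposition: Disposition::InService },
            Disk { pool_id: 11, disposition: Disposition::Expunged },
        ],
    )];

    // Only the in-service disk is visited, so an expunged disk's missing
    // Debug / TransientZoneRoot datasets are not reported as errors.
    for (sled_id, disk) in all_disks_matching(&sleds, Disposition::is_in_service)
    {
        println!("sled {sled_id}: checking zpool {}", disk.pool_id);
    }
}
```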