Skip to content

Commit ee1b02e

Browse files
authored
[nexus] Make Instance Deletion actually idempotent (#7556)
Marking an instance deleted, within the "instance deletion saga", invokes `datastore.project_delete_instance`. This API performs the following tasks: (1) it marks the instance record "Destroyed" and detaches all disks; (2) it performs a variety of other cleanup tasks (deleting SSH keys, marking migrations as deleted, and, not yet but soon, cleaning up affinity groups). These operations are not performed in a single transaction, so it's possible for step (1) to succeed while step (2) never happens. In this case, it's critical that a second invocation of this API be able to actually perform the remaining cleanup. This PR adjusts the implementation of `project_delete_instance` to make this actually true.
1 parent 80dac52 commit ee1b02e

File tree

2 files changed

+134
-47
lines changed

2 files changed

+134
-47
lines changed

nexus/db-queries/src/db/datastore/instance.rs

Lines changed: 134 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1304,6 +1304,12 @@ impl DataStore {
13041304
}
13051305
}
13061306

1307+
/// Deletes the provided `authz_instance`, as long as it is eligible for
1308+
/// deletion (in either the [`InstanceState::NoVmm`] or
1309+
/// [`InstanceState::Failed`] state, or it has already started being
1310+
/// deleted successfully).
1311+
///
1312+
/// This function is idempotent, but not atomic.
13071313
pub async fn project_delete_instance(
13081314
&self,
13091315
opctx: &OpContext,
@@ -1321,8 +1327,9 @@ impl DataStore {
13211327
.await
13221328
}
13231329

1324-
/// Delete the provided `authz_instance`, as long as it is in one of the
1325-
/// provided set of [`InstanceState`]s.
1330+
/// Deletes the provided `authz_instance`, as long as it is in one of the
1331+
/// provided set of [`InstanceState`]s. This function is idempotent, but
1332+
/// not atomic.
13261333
///
13271334
/// A.K.A. "[`project_delete_instance`], hard mode". Typically, instances
13281335
/// may only be deleted if they are in the [`InstanceState::NoVmm`] or
@@ -1341,6 +1348,44 @@ impl DataStore {
13411348
opctx: &OpContext,
13421349
authz_instance: &authz::Instance,
13431350
ok_to_delete_instance_states: &'static [InstanceState],
1351+
) -> DeleteResult {
1352+
// First, mark the instance record as destroyed and detach all disks.
1353+
//
1354+
// We do this before other cleanup to prevent concurrent operations from
1355+
// accessing and modifying the instance while it's being torn down.
1356+
self.instance_mark_destroyed_and_detach_disks(
1357+
opctx,
1358+
authz_instance,
1359+
ok_to_delete_instance_states,
1360+
)
1361+
.await?;
1362+
1363+
// Next, delete all other data associated with the instance.
1364+
//
1365+
// Note that due to idempotency of this function, it's possible that
1366+
// "authz_instance.id()" has already been deleted.
1367+
let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id());
1368+
self.instance_ssh_keys_delete(opctx, instance_id).await?;
1369+
self.instance_mark_migrations_deleted(opctx, instance_id).await?;
1370+
1371+
Ok(())
1372+
}
1373+
1374+
// Marks the instance "Destroyed" and detaches disks.
1375+
//
1376+
// This is only one internal part of destroying an instance, see:
1377+
// [`project_delete_instance`] for a more holistic usage. It has been
1378+
// factored out for readability.
1379+
//
1380+
// This function is idempotent, and should return "Ok(())" on repeated
1381+
// invocations.
1382+
//
1383+
// [`project_delete_instance`]: Self::project_delete_instance
1384+
async fn instance_mark_destroyed_and_detach_disks(
1385+
&self,
1386+
opctx: &OpContext,
1387+
authz_instance: &authz::Instance,
1388+
ok_to_delete_instance_states: &'static [InstanceState],
13441389
) -> DeleteResult {
13451390
use db::schema::{disk, instance};
13461391

@@ -1377,44 +1422,43 @@ impl DataStore {
13771422
)),
13781423
);
13791424

1380-
let _instance = stmt
1381-
.detach_and_get_result_async(
1382-
&*self.pool_connection_authorized(opctx).await?,
1383-
)
1384-
.await
1385-
.map_err(|e| match e {
1386-
DetachManyError::CollectionNotFound => Error::not_found_by_id(
1387-
ResourceType::Instance,
1388-
&authz_instance.id(),
1389-
),
1390-
DetachManyError::NoUpdate { collection } => {
1391-
if collection.runtime_state.propolis_id.is_some() {
1392-
return Error::invalid_request(
1425+
stmt.detach_and_get_result_async(
1426+
&*self.pool_connection_authorized(opctx).await?,
1427+
)
1428+
.await
1429+
.map(|_instance| ())
1430+
.or_else(|e| match e {
1431+
// Note that if the instance is not found, we return "Ok" explicitly here.
1432+
//
1433+
// This is important for idempotency - if we crashed after setting the state,
1434+
// but before doing cleanup, we allow other cleanup to make progress.
1435+
//
1436+
// See also: "test_instance_deletion_is_idempotent".
1437+
DetachManyError::CollectionNotFound => Ok(()),
1438+
DetachManyError::NoUpdate { collection } => {
1439+
if collection.runtime_state.propolis_id.is_some() {
1440+
return Err(Error::invalid_request(
13931441
"cannot delete instance: instance is running or has \
13941442
not yet fully stopped",
1395-
);
1396-
}
1397-
let instance_state =
1398-
collection.runtime_state.nexus_state.state();
1399-
match instance_state {
1400-
api::external::InstanceState::Stopped
1401-
| api::external::InstanceState::Failed => {
1402-
Error::internal_error("cannot delete instance")
1403-
}
1404-
_ => Error::invalid_request(&format!(
1405-
"instance cannot be deleted in state \"{}\"",
1406-
instance_state,
1407-
)),
1408-
}
1443+
));
14091444
}
1410-
DetachManyError::DatabaseError(e) => {
1411-
public_error_from_diesel(e, ErrorHandler::Server)
1445+
let instance_state =
1446+
collection.runtime_state.nexus_state.state();
1447+
match instance_state {
1448+
api::external::InstanceState::Stopped
1449+
| api::external::InstanceState::Failed => {
1450+
Err(Error::internal_error("cannot delete instance"))
1451+
}
1452+
_ => Err(Error::invalid_request(&format!(
1453+
"instance cannot be deleted in state \"{}\"",
1454+
instance_state,
1455+
))),
14121456
}
1413-
})?;
1414-
1415-
let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id());
1416-
self.instance_ssh_keys_delete(opctx, instance_id).await?;
1417-
self.instance_mark_migrations_deleted(opctx, instance_id).await?;
1457+
}
1458+
DetachManyError::DatabaseError(e) => {
1459+
Err(public_error_from_diesel(e, ErrorHandler::Server))
1460+
}
1461+
})?;
14181462

14191463
Ok(())
14201464
}
@@ -2382,6 +2426,60 @@ mod tests {
23822426
logctx.cleanup_successful();
23832427
}
23842428

2429+
// Validates idempotency of instance deletion.
2430+
//
2431+
// Instance deletion is invoked from a saga, so it must be idempotent.
2432+
// Additionally, to reduce database contention, we perform the steps of
2433+
// instance deletion non-atomically. However, this means that it must be
2434+
// possible to re-invoke instance deletion repeatedly to ensure that cleanup
2435+
// proceeds.
2436+
#[tokio::test]
2437+
async fn test_instance_deletion_is_idempotent() {
2438+
// Setup
2439+
let logctx =
2440+
dev::test_setup_log("test_instance_deletion_is_idempotent");
2441+
let db = TestDatabase::new_with_datastore(&logctx.log).await;
2442+
let (opctx, datastore) = (db.opctx(), db.datastore());
2443+
let (authz_project, _) = create_test_project(&datastore, &opctx).await;
2444+
let authz_instance = create_test_instance(
2445+
&datastore,
2446+
&opctx,
2447+
&authz_project,
2448+
"my-great-instance",
2449+
)
2450+
.await;
2451+
2452+
// Move the instance into an "okay-to-delete" state...
2453+
datastore
2454+
.instance_update_runtime(
2455+
&InstanceUuid::from_untyped_uuid(authz_instance.id()),
2456+
&InstanceRuntimeState {
2457+
time_updated: Utc::now(),
2458+
r#gen: Generation(external::Generation::from_u32(2)),
2459+
propolis_id: None,
2460+
dst_propolis_id: None,
2461+
migration_id: None,
2462+
nexus_state: InstanceState::NoVmm,
2463+
time_last_auto_restarted: None,
2464+
},
2465+
)
2466+
.await
2467+
.expect("should update state successfully");
2468+
2469+
// This is the first "normal" deletion
2470+
dbg!(datastore.project_delete_instance(&opctx, &authz_instance).await)
2471+
.expect("instance should be deleted");
2472+
2473+
// The next deletion should also succeed, even though the instance has already
2474+
// been marked deleted.
2475+
dbg!(datastore.project_delete_instance(&opctx, &authz_instance).await)
2476+
.expect("instance should remain deleted");
2477+
2478+
// Clean up.
2479+
db.terminate().await;
2480+
logctx.cleanup_successful();
2481+
}
2482+
23852483
#[tokio::test]
23862484
async fn test_unlocking_a_deleted_instance_is_okay() {
23872485
// Setup

nexus/src/app/sagas/instance_delete.rs

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ use super::NexusSaga;
1010
use crate::app::sagas::declare_saga_actions;
1111
use nexus_db_queries::db::lookup::LookupPath;
1212
use nexus_db_queries::{authn, authz, db};
13-
use omicron_common::api::external::{Error, ResourceType};
1413
use omicron_common::api::internal::shared::SwitchLocation;
1514
use serde::Deserialize;
1615
use serde::Serialize;
@@ -84,16 +83,6 @@ async fn sid_delete_instance_record(
8483
.datastore()
8584
.project_delete_instance(&opctx, &params.authz_instance)
8685
.await
87-
.or_else(|err| {
88-
// Necessary for idempotency
89-
match err {
90-
Error::ObjectNotFound {
91-
type_name: ResourceType::Instance,
92-
lookup_type: _,
93-
} => Ok(()),
94-
_ => Err(err),
95-
}
96-
})
9786
.map_err(ActionError::action_failed)?;
9887
Ok(())
9988
}

0 commit comments

Comments
 (0)