From ee1b02e68ad3c6028dbda25c0182c3d99be61d63 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 19 Feb 2025 17:42:22 -0800 Subject: [PATCH] [nexus] Make Instance Deletion actually idempotent (#7556) Marking an instance deleted, within the "instance deletion saga", invokes `datastore.project_delete_instance`. This API performs the following tasks: 1. Marks the instance record "Destroyed", and detaches all disks 2. Performs a variety of other cleanup tasks (deleting SSH keys, not-yet-but-soon cleaning up affinity groups, marking migrations as deleted). These operations are not in a transaction, so it's possible for step (1) to succeed, but step (2) to not happen. In this case, it's critical that a second invocation of this API be able to actually perform cleanup. This PR adjusts the implementation of `project_delete_instance` to make this actually true. --- nexus/db-queries/src/db/datastore/instance.rs | 170 ++++++++++++++---- nexus/src/app/sagas/instance_delete.rs | 11 -- 2 files changed, 134 insertions(+), 47 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 751531bf38e..b2358c0d291 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -1304,6 +1304,12 @@ impl DataStore { } } + /// Deletes the provided `authz_instance`, as long as it is eligible for + /// deletion (in either the [`InstanceState::NoVmm`] or + /// [`InstanceState::Failed`] state, or it has already started being + /// deleted successfully). + /// + /// This function is idempotent, but not atomic. pub async fn project_delete_instance( &self, opctx: &OpContext, @@ -1321,8 +1327,9 @@ impl DataStore { .await } - /// Delete the provided `authz_instance`, as long as it is in one of the - /// provided set of [`InstanceState`]s. + /// Deletes the provided `authz_instance`, as long as it is in one of the + /// provided set of [`InstanceState`]s. This function is idempotent, but + /// not atomic. /// /// A.K.A. "[`project_delete_instance`], hard mode". Typically, instances /// may only be deleted if they are in the [`InstanceState::NoVmm`] or @@ -1341,6 +1348,44 @@ impl DataStore { opctx: &OpContext, authz_instance: &authz::Instance, ok_to_delete_instance_states: &'static [InstanceState], + ) -> DeleteResult { + // First, mark the instance record as destroyed and detach all disks. + // + // We do this before other cleanup to prevent concurrent operations from + // accessing and modifying the instance while it's being torn down. + self.instance_mark_destroyed_and_detach_disks( + opctx, + authz_instance, + ok_to_delete_instance_states, + ) + .await?; + + // Next, delete all other data associated with the instance. + // + // Note that due to idempotency of this function, it's possible that + // "authz_instance.id()" has already been deleted. + let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); + self.instance_ssh_keys_delete(opctx, instance_id).await?; + self.instance_mark_migrations_deleted(opctx, instance_id).await?; + + Ok(()) + } + + // Marks the instance "Destroyed" and detaches disks. + // + // This is only one internal part of destroying an instance, see: + // [`project_delete_instance`] for a more holistic usage. It is has been + // factored out for readability. + // + // This function is idempotent, and should return "Ok(())" on repeated + // invocations. + // + // [`project_delete_instance`]: Self::project_delete_instance + async fn instance_mark_destroyed_and_detach_disks( + &self, + opctx: &OpContext, + authz_instance: &authz::Instance, + ok_to_delete_instance_states: &'static [InstanceState], ) -> DeleteResult { use db::schema::{disk, instance}; @@ -1377,44 +1422,43 @@ impl DataStore { )), ); - let _instance = stmt - .detach_and_get_result_async( - &*self.pool_connection_authorized(opctx).await?, - ) - .await - .map_err(|e| match e { - DetachManyError::CollectionNotFound => Error::not_found_by_id( - ResourceType::Instance, - &authz_instance.id(), - ), - DetachManyError::NoUpdate { collection } => { - if collection.runtime_state.propolis_id.is_some() { - return Error::invalid_request( + stmt.detach_and_get_result_async( + &*self.pool_connection_authorized(opctx).await?, + ) + .await + .map(|_instance| ()) + .or_else(|e| match e { + // Note that if the instance is not found, we return "Ok" explicitly here. + // + // This is important for idempotency - if we crashed after setting the state, + // but before doing cleanup, we allow other cleanup to make progress. + // + // See also: "test_instance_deletion_is_idempotent". + DetachManyError::CollectionNotFound => Ok(()), + DetachManyError::NoUpdate { collection } => { + if collection.runtime_state.propolis_id.is_some() { + return Err(Error::invalid_request( "cannot delete instance: instance is running or has \ not yet fully stopped", - ); - } - let instance_state = - collection.runtime_state.nexus_state.state(); - match instance_state { - api::external::InstanceState::Stopped - | api::external::InstanceState::Failed => { - Error::internal_error("cannot delete instance") - } - _ => Error::invalid_request(&format!( - "instance cannot be deleted in state \"{}\"", - instance_state, - )), - } + )); } - DetachManyError::DatabaseError(e) => { - public_error_from_diesel(e, ErrorHandler::Server) + let instance_state = + collection.runtime_state.nexus_state.state(); + match instance_state { + api::external::InstanceState::Stopped + | api::external::InstanceState::Failed => { + Err(Error::internal_error("cannot delete instance")) + } + _ => Err(Error::invalid_request(&format!( + "instance cannot be deleted in state \"{}\"", + instance_state, + ))), } - })?; - - let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); - self.instance_ssh_keys_delete(opctx, instance_id).await?; - self.instance_mark_migrations_deleted(opctx, instance_id).await?; + } + DetachManyError::DatabaseError(e) => { + Err(public_error_from_diesel(e, ErrorHandler::Server)) + } + })?; Ok(()) } @@ -2382,6 +2426,60 @@ mod tests { logctx.cleanup_successful(); } + // Validates idempotency of instance deletion. + // + // Instance deletion is invoked from a saga, so it must be idempotent. + // Additionally, to reduce database contention, we perform the steps of + // instance deletion non-atomically. However, this means that it must be + // possible to re-invoke instance deletion repeatedly to ensure that cleanup + // proceeds. + #[tokio::test] + async fn test_instance_deletion_is_idempotent() { + // Setup + let logctx = + dev::test_setup_log("test_instance_deletion_is_idempotent"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + let (authz_project, _) = create_test_project(&datastore, &opctx).await; + let authz_instance = create_test_instance( + &datastore, + &opctx, + &authz_project, + "my-great-instance", + ) + .await; + + // Move the instance into an "okay-to-delete" state... + datastore + .instance_update_runtime( + &InstanceUuid::from_untyped_uuid(authz_instance.id()), + &InstanceRuntimeState { + time_updated: Utc::now(), + r#gen: Generation(external::Generation::from_u32(2)), + propolis_id: None, + dst_propolis_id: None, + migration_id: None, + nexus_state: InstanceState::NoVmm, + time_last_auto_restarted: None, + }, + ) + .await + .expect("should update state successfully"); + + // This is the first "normal" deletion + dbg!(datastore.project_delete_instance(&opctx, &authz_instance).await) + .expect("instance should be deleted"); + + // The next deletion should also succeed, even though the instance has already + // been marked deleted. + dbg!(datastore.project_delete_instance(&opctx, &authz_instance).await) + .expect("instance should remain deleted"); + + // Clean up. + db.terminate().await; + logctx.cleanup_successful(); + } + #[tokio::test] async fn test_unlocking_a_deleted_instance_is_okay() { // Setup diff --git a/nexus/src/app/sagas/instance_delete.rs b/nexus/src/app/sagas/instance_delete.rs index c1ff9c0bfbe..780f0407a3b 100644 --- a/nexus/src/app/sagas/instance_delete.rs +++ b/nexus/src/app/sagas/instance_delete.rs @@ -10,7 +10,6 @@ use super::NexusSaga; use crate::app::sagas::declare_saga_actions; use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::{authn, authz, db}; -use omicron_common::api::external::{Error, ResourceType}; use omicron_common::api::internal::shared::SwitchLocation; use serde::Deserialize; use serde::Serialize; @@ -84,16 +83,6 @@ async fn sid_delete_instance_record( .datastore() .project_delete_instance(&opctx, ¶ms.authz_instance) .await - .or_else(|err| { - // Necessary for idempotency - match err { - Error::ObjectNotFound { - type_name: ResourceType::Instance, - lookup_type: _, - } => Ok(()), - _ => Err(err), - } - }) .map_err(ActionError::action_failed)?; Ok(()) }