@@ -1304,6 +1304,12 @@ impl DataStore {
1304
1304
}
1305
1305
}
1306
1306
1307
+ /// Deletes the provided `authz_instance`, as long as it is eligible for
1308
+ /// deletion (in either the [`InstanceState::NoVmm`] or
1309
+ /// [`InstanceState::Failed`] state, or it has already started being
1310
+ /// deleted successfully).
1311
+ ///
1312
+ /// This function is idempotent, but not atomic.
1307
1313
pub async fn project_delete_instance (
1308
1314
& self ,
1309
1315
opctx : & OpContext ,
@@ -1321,8 +1327,9 @@ impl DataStore {
1321
1327
. await
1322
1328
}
1323
1329
1324
- /// Delete the provided `authz_instance`, as long as it is in one of the
1325
- /// provided set of [`InstanceState`]s.
1330
+ /// Deletes the provided `authz_instance`, as long as it is in one of the
1331
+ /// provided set of [`InstanceState`]s. This function is idempotent, but
1332
+ /// not atomic.
1326
1333
///
1327
1334
/// A.K.A. "[`project_delete_instance`], hard mode". Typically, instances
1328
1335
/// may only be deleted if they are in the [`InstanceState::NoVmm`] or
@@ -1341,6 +1348,44 @@ impl DataStore {
1341
1348
opctx : & OpContext ,
1342
1349
authz_instance : & authz:: Instance ,
1343
1350
ok_to_delete_instance_states : & ' static [ InstanceState ] ,
1351
+ ) -> DeleteResult {
1352
+ // First, mark the instance record as destroyed and detach all disks.
1353
+ //
1354
+ // We do this before other cleanup to prevent concurrent operations from
1355
+ // accessing and modifying the instance while it's being torn down.
1356
+ self . instance_mark_destroyed_and_detach_disks (
1357
+ opctx,
1358
+ authz_instance,
1359
+ ok_to_delete_instance_states,
1360
+ )
1361
+ . await ?;
1362
+
1363
+ // Next, delete all other data associated with the instance.
1364
+ //
1365
+ // Note that due to idempotency of this function, it's possible that
1366
+ // "authz_instance.id()" has already been deleted.
1367
+ let instance_id = InstanceUuid :: from_untyped_uuid ( authz_instance. id ( ) ) ;
1368
+ self . instance_ssh_keys_delete ( opctx, instance_id) . await ?;
1369
+ self . instance_mark_migrations_deleted ( opctx, instance_id) . await ?;
1370
+
1371
+ Ok ( ( ) )
1372
+ }
1373
+
1374
+ // Marks the instance "Destroyed" and detaches disks.
1375
+ //
1376
+ // This is only one internal part of destroying an instance, see:
1377
+ // [`project_delete_instance`] for a more holistic usage. It has been
1378
+ // factored out for readability.
1379
+ //
1380
+ // This function is idempotent, and should return "Ok(())" on repeated
1381
+ // invocations.
1382
+ //
1383
+ // [`project_delete_instance`]: Self::project_delete_instance
1384
+ async fn instance_mark_destroyed_and_detach_disks (
1385
+ & self ,
1386
+ opctx : & OpContext ,
1387
+ authz_instance : & authz:: Instance ,
1388
+ ok_to_delete_instance_states : & ' static [ InstanceState ] ,
1344
1389
) -> DeleteResult {
1345
1390
use db:: schema:: { disk, instance} ;
1346
1391
@@ -1377,44 +1422,43 @@ impl DataStore {
1377
1422
) ) ,
1378
1423
) ;
1379
1424
1380
- let _instance = stmt
1381
- . detach_and_get_result_async (
1382
- & * self . pool_connection_authorized ( opctx) . await ?,
1383
- )
1384
- . await
1385
- . map_err ( |e| match e {
1386
- DetachManyError :: CollectionNotFound => Error :: not_found_by_id (
1387
- ResourceType :: Instance ,
1388
- & authz_instance. id ( ) ,
1389
- ) ,
1390
- DetachManyError :: NoUpdate { collection } => {
1391
- if collection. runtime_state . propolis_id . is_some ( ) {
1392
- return Error :: invalid_request (
1425
+ stmt. detach_and_get_result_async (
1426
+ & * self . pool_connection_authorized ( opctx) . await ?,
1427
+ )
1428
+ . await
1429
+ . map ( |_instance| ( ) )
1430
+ . or_else ( |e| match e {
1431
+ // Note that if the instance is not found, we return "Ok" explicitly here.
1432
+ //
1433
+ // This is important for idempotency - if we crashed after setting the state,
1434
+ // but before doing cleanup, we allow other cleanup to make progress.
1435
+ //
1436
+ // See also: "test_instance_deletion_is_idempotent".
1437
+ DetachManyError :: CollectionNotFound => Ok ( ( ) ) ,
1438
+ DetachManyError :: NoUpdate { collection } => {
1439
+ if collection. runtime_state . propolis_id . is_some ( ) {
1440
+ return Err ( Error :: invalid_request (
1393
1441
"cannot delete instance: instance is running or has \
1394
1442
not yet fully stopped",
1395
- ) ;
1396
- }
1397
- let instance_state =
1398
- collection. runtime_state . nexus_state . state ( ) ;
1399
- match instance_state {
1400
- api:: external:: InstanceState :: Stopped
1401
- | api:: external:: InstanceState :: Failed => {
1402
- Error :: internal_error ( "cannot delete instance" )
1403
- }
1404
- _ => Error :: invalid_request ( & format ! (
1405
- "instance cannot be deleted in state \" {}\" " ,
1406
- instance_state,
1407
- ) ) ,
1408
- }
1443
+ ) ) ;
1409
1444
}
1410
- DetachManyError :: DatabaseError ( e) => {
1411
- public_error_from_diesel ( e, ErrorHandler :: Server )
1445
+ let instance_state =
1446
+ collection. runtime_state . nexus_state . state ( ) ;
1447
+ match instance_state {
1448
+ api:: external:: InstanceState :: Stopped
1449
+ | api:: external:: InstanceState :: Failed => {
1450
+ Err ( Error :: internal_error ( "cannot delete instance" ) )
1451
+ }
1452
+ _ => Err ( Error :: invalid_request ( & format ! (
1453
+ "instance cannot be deleted in state \" {}\" " ,
1454
+ instance_state,
1455
+ ) ) ) ,
1412
1456
}
1413
- } ) ? ;
1414
-
1415
- let instance_id = InstanceUuid :: from_untyped_uuid ( authz_instance . id ( ) ) ;
1416
- self . instance_ssh_keys_delete ( opctx , instance_id ) . await ? ;
1417
- self . instance_mark_migrations_deleted ( opctx , instance_id ) . await ?;
1457
+ }
1458
+ DetachManyError :: DatabaseError ( e ) => {
1459
+ Err ( public_error_from_diesel ( e , ErrorHandler :: Server ) )
1460
+ }
1461
+ } ) ?;
1418
1462
1419
1463
Ok ( ( ) )
1420
1464
}
@@ -2382,6 +2426,60 @@ mod tests {
2382
2426
logctx. cleanup_successful ( ) ;
2383
2427
}
2384
2428
2429
+ // Validates idempotency of instance deletion.
2430
+ //
2431
+ // Instance deletion is invoked from a saga, so it must be idempotent.
2432
+ // Additionally, to reduce database contention, we perform the steps of
2433
+ // instance deletion non-atomically. However, this means that it must be
2434
+ // possible to re-invoke instance deletion repeatedly to ensure that cleanup
2435
+ // proceeds.
2436
+ #[ tokio:: test]
2437
+ async fn test_instance_deletion_is_idempotent ( ) {
2438
+ // Setup
2439
+ let logctx =
2440
+ dev:: test_setup_log ( "test_instance_deletion_is_idempotent" ) ;
2441
+ let db = TestDatabase :: new_with_datastore ( & logctx. log ) . await ;
2442
+ let ( opctx, datastore) = ( db. opctx ( ) , db. datastore ( ) ) ;
2443
+ let ( authz_project, _) = create_test_project ( & datastore, & opctx) . await ;
2444
+ let authz_instance = create_test_instance (
2445
+ & datastore,
2446
+ & opctx,
2447
+ & authz_project,
2448
+ "my-great-instance" ,
2449
+ )
2450
+ . await ;
2451
+
2452
+ // Move the instance into an "okay-to-delete" state...
2453
+ datastore
2454
+ . instance_update_runtime (
2455
+ & InstanceUuid :: from_untyped_uuid ( authz_instance. id ( ) ) ,
2456
+ & InstanceRuntimeState {
2457
+ time_updated : Utc :: now ( ) ,
2458
+ r#gen : Generation ( external:: Generation :: from_u32 ( 2 ) ) ,
2459
+ propolis_id : None ,
2460
+ dst_propolis_id : None ,
2461
+ migration_id : None ,
2462
+ nexus_state : InstanceState :: NoVmm ,
2463
+ time_last_auto_restarted : None ,
2464
+ } ,
2465
+ )
2466
+ . await
2467
+ . expect ( "should update state successfully" ) ;
2468
+
2469
+ // This is the first "normal" deletion
2470
+ dbg ! ( datastore. project_delete_instance( & opctx, & authz_instance) . await )
2471
+ . expect ( "instance should be deleted" ) ;
2472
+
2473
+ // The next deletion should also succeed, even though the instance has already
2474
+ // been marked deleted.
2475
+ dbg ! ( datastore. project_delete_instance( & opctx, & authz_instance) . await )
2476
+ . expect ( "instance should remain deleted" ) ;
2477
+
2478
+ // Clean up.
2479
+ db. terminate ( ) . await ;
2480
+ logctx. cleanup_successful ( ) ;
2481
+ }
2482
+
2385
2483
#[ tokio:: test]
2386
2484
async fn test_unlocking_a_deleted_instance_is_okay ( ) {
2387
2485
// Setup
0 commit comments