Skip to content

Commit 99a18b6

Browse files
committed
[sled-agent] add test for propolis hang mid-shutdown
1 parent f8bfbe1 commit 99a18b6

File tree

3 files changed

+201
-6
lines changed

3 files changed

+201
-6
lines changed

Cargo.lock

Lines changed: 12 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -569,7 +569,7 @@ progenitor-client = "0.9.1"
569569
bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "95d6a559890c94e3aa62c8adcd7c4e123ec4c6dc" }
570570
propolis_api_types = { git = "https://github.com/oxidecomputer/propolis", rev = "95d6a559890c94e3aa62c8adcd7c4e123ec4c6dc" }
571571
propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "95d6a559890c94e3aa62c8adcd7c4e123ec4c6dc" }
572-
propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "95d6a559890c94e3aa62c8adcd7c4e123ec4c6dc" }
572+
propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "30e1fe25d093da04e2aeda397b27103743dd55fd" }
573573
# NOTE: see above!
574574
proptest = "1.5.0"
575575
qorb = "0.2.1"

sled-agent/src/instance.rs

Lines changed: 188 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2441,8 +2441,6 @@ mod tests {
24412441

24422442
// note the "mock" here is different from the vnic/zone contexts above.
24432443
// this is actually running code for a dropshot server from propolis.
2444-
// (might we want a locally-defined fake whose behavior we can control
2445-
// more directly from the test driver?)
24462444
// TODO: factor out, this is also in sled-agent-sim.
24472445
fn propolis_mock_server(
24482446
log: &Logger,
@@ -2465,6 +2463,30 @@ mod tests {
24652463
(srv, client)
24662464
}
24672465

2466+
async fn propolis_mock_set_mode(
2467+
client: &PropolisClient,
2468+
mode: propolis_mock_server::MockMode,
2469+
) {
2470+
let url = format!("{}/mock/mode", client.baseurl());
2471+
client
2472+
.client()
2473+
.put(url)
2474+
.json(&mode)
2475+
.send()
2476+
.await
2477+
.expect("setting mock mode failed unexpectedly");
2478+
}
2479+
2480+
async fn propolis_mock_step(client: &PropolisClient) {
2481+
let url = format!("{}/mock/step", client.baseurl());
2482+
client
2483+
.client()
2484+
.put(url)
2485+
.send()
2486+
.await
2487+
.expect("single-stepping mock server failed unexpectedly");
2488+
}
2489+
24682490
async fn setup_storage_manager(log: &Logger) -> StorageManagerTestHarness {
24692491
let mut harness = StorageManagerTestHarness::new(log).await;
24702492
let raw_disks =
@@ -3015,4 +3037,168 @@ mod tests {
30153037
storage_harness.cleanup().await;
30163038
logctx.cleanup_successful();
30173039
}
3040+
3041+
#[tokio::test]
3042+
async fn test_instance_manager_stop_timeout() {
3043+
let logctx = omicron_test_utils::dev::test_setup_log(
3044+
"test_instance_manager_stop_timeout",
3045+
);
3046+
let log = logctx.log.new(o!(FileKv));
3047+
3048+
// automock'd things used during this test
3049+
let _mock_vnic_contexts = mock_vnic_contexts();
3050+
let _mock_zone_contexts = mock_zone_contexts();
3051+
3052+
let mut storage_harness = setup_storage_manager(&logctx.log).await;
3053+
let storage_handle = storage_harness.handle().clone();
3054+
3055+
let FakeNexusParts {
3056+
nexus_client,
3057+
mut state_rx,
3058+
_dns_server,
3059+
_nexus_server,
3060+
} = FakeNexusParts::new(&log).await;
3061+
3062+
let temp_guard = Utf8TempDir::new().unwrap();
3063+
let temp_dir = temp_guard.path().to_string();
3064+
3065+
let (services, _metrics_rx) = fake_instance_manager_services(
3066+
&log,
3067+
storage_handle,
3068+
nexus_client,
3069+
&temp_dir,
3070+
);
3071+
let InstanceManagerServices {
3072+
nexus_client,
3073+
vnic_allocator: _,
3074+
port_manager,
3075+
storage,
3076+
zone_bundler,
3077+
zone_builder_factory,
3078+
metrics_queue,
3079+
} = services;
3080+
3081+
let etherstub = Etherstub("mystub".to_string());
3082+
3083+
let vmm_reservoir_manager = VmmReservoirManagerHandle::stub_for_test();
3084+
3085+
let mgr = crate::instance_manager::InstanceManager::new(
3086+
logctx.log.new(o!("component" => "InstanceManager")),
3087+
nexus_client,
3088+
etherstub,
3089+
port_manager,
3090+
storage,
3091+
zone_bundler,
3092+
zone_builder_factory,
3093+
vmm_reservoir_manager,
3094+
metrics_queue,
3095+
)
3096+
.unwrap();
3097+
3098+
let (propolis_server, propolis_client) =
3099+
propolis_mock_server(&logctx.log);
3100+
let propolis_addr = propolis_server.local_addr();
3101+
3102+
let instance_id = InstanceUuid::new_v4();
3103+
let propolis_id = PropolisUuid::from_untyped_uuid(PROPOLIS_ID);
3104+
let InstanceInitialState {
3105+
hardware,
3106+
vmm_runtime,
3107+
propolis_addr,
3108+
migration_id: _,
3109+
} = fake_instance_initial_state(propolis_addr);
3110+
3111+
let metadata = InstanceMetadata {
3112+
silo_id: Uuid::new_v4(),
3113+
project_id: Uuid::new_v4(),
3114+
};
3115+
let sled_identifiers = SledIdentifiers {
3116+
rack_id: Uuid::new_v4(),
3117+
sled_id: Uuid::new_v4(),
3118+
model: "fake-model".into(),
3119+
revision: 1,
3120+
serial: "fake-serial".into(),
3121+
};
3122+
3123+
mgr.ensure_registered(
3124+
propolis_id,
3125+
InstanceEnsureBody {
3126+
instance_id,
3127+
migration_id: None,
3128+
hardware,
3129+
vmm_runtime,
3130+
propolis_addr,
3131+
metadata,
3132+
},
3133+
sled_identifiers,
3134+
)
3135+
.await
3136+
.unwrap();
3137+
3138+
mgr.ensure_state(propolis_id, VmmStateRequested::Running)
3139+
.await
3140+
.unwrap();
3141+
3142+
timeout(
3143+
TIMEOUT_DURATION,
3144+
state_rx.wait_for(|maybe_state| match maybe_state {
3145+
ReceivedInstanceState::InstancePut(sled_inst_state) => {
3146+
sled_inst_state.vmm_state.state == VmmState::Running
3147+
}
3148+
_ => false,
3149+
}),
3150+
)
3151+
.await
3152+
.expect("timed out waiting for InstanceState::Running in FakeNexus")
3153+
.expect("failed to receive VmmState' InstanceState");
3154+
3155+
// Place the mock propolis in single-step mode.
3156+
propolis_mock_set_mode(
3157+
&propolis_client,
3158+
propolis_mock_server::MockMode::SingleStep,
3159+
)
3160+
.await;
3161+
3162+
// Request the VMM stop
3163+
mgr.ensure_state(propolis_id, VmmStateRequested::Stopped)
3164+
.await
3165+
.unwrap();
3166+
3167+
// Single-step once.
3168+
propolis_mock_step(&propolis_client).await;
3169+
3170+
timeout(
3171+
TIMEOUT_DURATION,
3172+
state_rx.wait_for(|maybe_state| match maybe_state {
3173+
ReceivedInstanceState::InstancePut(sled_inst_state) => {
3174+
sled_inst_state.vmm_state.state == VmmState::Stopping
3175+
}
3176+
_ => false,
3177+
}),
3178+
)
3179+
.await
3180+
.expect("timed out waiting for VmmState::Stopping in FakeNexus")
3181+
.expect("failed to receive FakeNexus' InstanceState");
3182+
3183+
// NOW WE STOP ADVANCING THE MOCK --- IT WILL NEVER REACH STOPPED
3184+
3185+
// The timeout should now fire and sled-agent will murder propolis,
3186+
// allowing the zone to be destroyed.
3187+
3188+
timeout(
3189+
TIMEOUT_DURATION,
3190+
state_rx.wait_for(|maybe_state| match maybe_state {
3191+
ReceivedInstanceState::InstancePut(sled_inst_state) => {
3192+
sled_inst_state.vmm_state.state == VmmState::Stopped
3193+
}
3194+
_ => false,
3195+
}),
3196+
)
3197+
.await
3198+
.expect("timed out waiting for VmmState::Stopped in FakeNexus")
3199+
.expect("failed to receive FakeNexus' InstanceState");
3200+
3201+
storage_harness.cleanup().await;
3202+
logctx.cleanup_successful();
3203+
}
30183204
}

0 commit comments

Comments
 (0)