Skip to content

Commit 031e055

Browse files
committed
roachtest: reset failures on vm preemption
This change resets failures in the case of a vm preemption, in case a timeout occurred which normally takes precedence over all other failures. While a timeout suggests that something should be fixed with the test (usually respecting the test context cancellation), we see that in practice, engineers tend to close the issue without investigating as soon as they see the preemption. This also removes the potential duplicate vm_preemption failure that may have been added by the preemption polling.
1 parent 2ff9ee5 commit 031e055

File tree

2 files changed

+33
-0
lines changed

2 files changed

+33
-0
lines changed

pkg/cmd/roachtest/test_runner.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1164,6 +1164,9 @@ func (r *testRunner) runTest(
11641164
// Note that this error message is referred for test selection in
11651165
// pkg/cmd/roachtest/testselector/snowflake_query.sql.
11661166
failureMsg = fmt.Sprintf("VMs preempted during the test run: %s\n\n**Other Failures:**\n%s", preemptedVMNames, failureMsg)
1167+
// Reset the failures as a timeout may have suppressed failures, but we
1168+
// want to propagate the preemption error and avoid creating an issue.
1169+
t.resetFailures()
11671170
t.Error(vmPreemptionError(preemptedVMNames))
11681171
}
11691172
hostErrorVMNames := getHostErrorVMNames(ctx, c, l)

pkg/cmd/roachtest/test_test.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -800,4 +800,34 @@ func TestVMPreemptionPolling(t *testing.T) {
800800

801801
require.NoError(t, err)
802802
})
803+
804+
// Test that if the test hangs until timeout, a VM preemption will still be caught.
805+
t.Run("test hangs and still catches preemption", func(t *testing.T) {
806+
// We don't want the polling to cancel the test early.
807+
setPollPreemptionInterval(10 * time.Minute)
808+
getPreemptedVMsHook = func(c cluster.Cluster, ctx context.Context, l *logger.Logger) ([]vm.PreemptedVM, error) {
809+
preemptedVMs := []vm.PreemptedVM{{
810+
Name: "test_node",
811+
PreemptedAt: time.Now(),
812+
}}
813+
return preemptedVMs, nil
814+
}
815+
816+
mockTest.Timeout = 10 * time.Millisecond
817+
// We expect the following to occur:
818+
// 1. The test blocks on the context, which is only cancelled when the test runner
819+
// returns after test completion. This effectively blocks the test forever.
820+
// 2. The test times out and the test runner marks it as failed.
821+
// 3. Normally, this would result in a failed test and runner.Run returning an error.
822+
// However, because we injected a preemption, the test runner marks it as a flake
823+
// instead and returns no errors.
824+
mockTest.Run = func(ctx context.Context, t test.Test, c cluster.Cluster) {
825+
<-ctx.Done()
826+
}
827+
828+
err := runner.Run(ctx, []registry.TestSpec{mockTest}, 1, /* count */
829+
defaultParallelism, copt, testOpts{}, lopt)
830+
831+
require.NoError(t, err)
832+
})
803833
}

0 commit comments

Comments
 (0)