Skip to content

Commit 15a3216

Browse files
craig[bot]DarrylWong
and committed
Merge #139075
139075: roachtest: mark polled VM preemptions as non reportable r=srosenberg a=DarrylWong Previously, polled VM preemptions would simply cancel the test, as post test processing would recheck for preemptions again. However, we've seen some cases in AWS where the post test check returns no preemptions despite the polling returning preemptions. This may just be the AWS check being eventually consistent, so we want to avoid posting if either check finds preemptions. ---- The second change resets failures in the case of a vm preemption, in case a timeout occurred which normally takes precedence over all other failures. While a timeout suggests that something should be fixed with the test (usually respecting the test context cancellation), we see that in practice, engineers tend to close the issue without investigating as soon as they see the preemption. This also removes the potential duplicate vm_preemption failure that may have been added by the preemption polling. Fixes: #139004 Fixes: #139931 Release note: none Epic: none Co-authored-by: DarrylWong <[email protected]>
2 parents cb32622 + 031e055 commit 15a3216

File tree

2 files changed

+74
-4
lines changed

2 files changed

+74
-4
lines changed

pkg/cmd/roachtest/test_runner.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1164,6 +1164,9 @@ func (r *testRunner) runTest(
11641164
// Note that this error message is referred for test selection in
11651165
// pkg/cmd/roachtest/testselector/snowflake_query.sql.
11661166
failureMsg = fmt.Sprintf("VMs preempted during the test run: %s\n\n**Other Failures:**\n%s", preemptedVMNames, failureMsg)
1167+
// Reset the failures as a timeout may have suppressed failures, but we
1168+
// want to propagate the preemption error and avoid creating an issue.
1169+
t.resetFailures()
11671170
t.Error(vmPreemptionError(preemptedVMNames))
11681171
}
11691172
hostErrorVMNames := getHostErrorVMNames(ctx, c, l)
@@ -2171,11 +2174,15 @@ func monitorForPreemptedVMs(ctx context.Context, t test.Test, c cluster.Cluster,
21712174
continue
21722175
}
21732176

2174-
// If we find any preemptions, fail the test. Note that we will recheck for
2175-
// preemptions in post failure processing, which will correctly assign this
2176-
// failure as an infra flake.
2177+
// If we find any preemptions, fail the test. Note that while we will recheck for
2178+
// preemptions in post failure processing, we need to mark the test as a preemption
2179+
// failure here in case the recheck says there were no preemptions.
21772180
if len(preemptedVMs) != 0 {
2178-
t.Errorf("monitorForPreemptedVMs: Preempted VMs detected: %s", preemptedVMs)
2181+
var vmNames []string
2182+
for _, preemptedVM := range preemptedVMs {
2183+
vmNames = append(vmNames, preemptedVM.Name)
2184+
}
2185+
t.Errorf("monitorForPreemptedVMs detected VM Preemptions: %s", vmPreemptionError(getVMNames(vmNames)))
21792186
}
21802187
}
21812188
}

pkg/cmd/roachtest/test_test.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -767,4 +767,67 @@ func TestVMPreemptionPolling(t *testing.T) {
767767
// be treated as a flake instead of a failed test.
768768
require.NoError(t, err)
769769
})
770+
771+
// Test that if VM preemption polling finds a preempted VM but the post test failure
772+
// check doesn't, the test is still marked as a flake.
773+
t.Run("post test check doesn't catch preemption", func(t *testing.T) {
774+
setPollPreemptionInterval(10 * time.Millisecond)
775+
testPreemptedCh := make(chan struct{})
776+
getPreemptedVMsHook = func(c cluster.Cluster, ctx context.Context, l *logger.Logger) ([]vm.PreemptedVM, error) {
777+
preemptedVMs := []vm.PreemptedVM{{
778+
Name: "test_node",
779+
PreemptedAt: time.Now(),
780+
}}
781+
close(testPreemptedCh)
782+
return preemptedVMs, nil
783+
}
784+
785+
mockTest.Run = func(ctx context.Context, t test.Test, c cluster.Cluster) {
786+
defer func() {
787+
getPreemptedVMsHook = func(c cluster.Cluster, ctx context.Context, l *logger.Logger) ([]vm.PreemptedVM, error) {
788+
return nil, nil
789+
}
790+
}()
791+
// Make sure the preemption polling is called and the test context is cancelled
792+
// before unblocking. Under stress, the test may time out before the preemption
793+
// check is called otherwise.
794+
<-testPreemptedCh
795+
<-ctx.Done()
796+
}
797+
798+
err := runner.Run(ctx, []registry.TestSpec{mockTest}, 1, /* count */
799+
defaultParallelism, copt, testOpts{}, lopt)
800+
801+
require.NoError(t, err)
802+
})
803+
804+
// Test that if the test hangs until timeout, a VM preemption will still be caught.
805+
t.Run("test hangs and still catches preemption", func(t *testing.T) {
806+
// We don't want the polling to cancel the test early.
807+
setPollPreemptionInterval(10 * time.Minute)
808+
getPreemptedVMsHook = func(c cluster.Cluster, ctx context.Context, l *logger.Logger) ([]vm.PreemptedVM, error) {
809+
preemptedVMs := []vm.PreemptedVM{{
810+
Name: "test_node",
811+
PreemptedAt: time.Now(),
812+
}}
813+
return preemptedVMs, nil
814+
}
815+
816+
mockTest.Timeout = 10 * time.Millisecond
817+
// We expect the following to occur:
818+
// 1. The test blocks on the context, which is only cancelled when the test runner
819+
// returns after test completion. This effectively blocks the test forever.
820+
// 2. The test times out and the test runner marks it as failed.
821+
// 3. Normally, this would result in a failed test and runner.Run returning an error.
822+
// However, because we injected a preemption, the test runner marks it as a flake
823+
// instead and returns no errors.
824+
mockTest.Run = func(ctx context.Context, t test.Test, c cluster.Cluster) {
825+
<-ctx.Done()
826+
}
827+
828+
err := runner.Run(ctx, []registry.TestSpec{mockTest}, 1, /* count */
829+
defaultParallelism, copt, testOpts{}, lopt)
830+
831+
require.NoError(t, err)
832+
})
770833
}

0 commit comments

Comments
 (0)