Skip to content

Commit

Permalink
Merge #139075
Browse files Browse the repository at this point in the history
139075: roachtest: mark polled VM preemptions as non reportable r=srosenberg a=DarrylWong

Previously, polled VM preemptions would simply cancel the test, as post test processing would recheck for preemptions again. However, we've seen some cases in AWS where the post test check returns no preemptions despite the polling returning preemptions.

This may just be the AWS check being eventually consistent, so we want to avoid posting an issue if either check finds preemptions.

----

The second change resets failures in the case of a VM preemption, in case a timeout occurred, which normally takes precedence over all other failures. While a timeout suggests that something should be fixed with the test (usually respecting the test context cancellation), we see that in practice, engineers tend to close the issue without investigating as soon as they see the preemption.

This also removes the potential duplicate vm_preemption failure that may have been added by the preemption polling.

Fixes: #139004
Fixes: #139931
Release note: none
Epic: none

Co-authored-by: DarrylWong <[email protected]>
  • Loading branch information
craig[bot] and DarrylWong committed Feb 6, 2025
2 parents cb32622 + 031e055 commit 15a3216
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 4 deletions.
15 changes: 11 additions & 4 deletions pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -1164,6 +1164,9 @@ func (r *testRunner) runTest(
// Note that this error message is referred for test selection in
// pkg/cmd/roachtest/testselector/snowflake_query.sql.
failureMsg = fmt.Sprintf("VMs preempted during the test run: %s\n\n**Other Failures:**\n%s", preemptedVMNames, failureMsg)
// Reset the failures as a timeout may have suppressed failures, but we
// want to propagate the preemption error and avoid creating an issue.
t.resetFailures()
t.Error(vmPreemptionError(preemptedVMNames))
}
hostErrorVMNames := getHostErrorVMNames(ctx, c, l)
Expand Down Expand Up @@ -2171,11 +2174,15 @@ func monitorForPreemptedVMs(ctx context.Context, t test.Test, c cluster.Cluster,
continue
}

// If we find any preemptions, fail the test. Note that we will recheck for
// preemptions in post failure processing, which will correctly assign this
// failure as an infra flake.
// If we find any preemptions, fail the test. Note that while we will recheck for
// preemptions in post failure processing, we need to mark the test as a preemption
// failure here in case the recheck says there were no preemptions.
if len(preemptedVMs) != 0 {
t.Errorf("monitorForPreemptedVMs: Preempted VMs detected: %s", preemptedVMs)
var vmNames []string
for _, preemptedVM := range preemptedVMs {
vmNames = append(vmNames, preemptedVM.Name)
}
t.Errorf("monitorForPreemptedVMs detected VM Preemptions: %s", vmPreemptionError(getVMNames(vmNames)))
}
}
}
Expand Down
63 changes: 63 additions & 0 deletions pkg/cmd/roachtest/test_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -767,4 +767,67 @@ func TestVMPreemptionPolling(t *testing.T) {
// be treated as a flake instead of a failed test.
require.NoError(t, err)
})

// Test that if VM preemption polling finds a preempted VM but the post test failure
// check doesn't, the test is still marked as a flake.
t.Run("post test check doesn't catch preemption", func(t *testing.T) {
// Poll aggressively so the preemption is observed while the test body is
// still running.
setPollPreemptionInterval(10 * time.Millisecond)
testPreemptedCh := make(chan struct{})
// Simulate the polling check reporting a preempted VM. Closing
// testPreemptedCh signals the test body that polling has fired.
getPreemptedVMsHook = func(c cluster.Cluster, ctx context.Context, l *logger.Logger) ([]vm.PreemptedVM, error) {
preemptedVMs := []vm.PreemptedVM{{
Name: "test_node",
PreemptedAt: time.Now(),
}}
close(testPreemptedCh)
return preemptedVMs, nil
}

mockTest.Run = func(ctx context.Context, t test.Test, c cluster.Cluster) {
// Restore the hook on exit so the post test failure check sees NO
// preemptions — the exact mismatch this subtest exercises.
defer func() {
getPreemptedVMsHook = func(c cluster.Cluster, ctx context.Context, l *logger.Logger) ([]vm.PreemptedVM, error) {
return nil, nil
}
}()
// Make sure the preemption polling is called and the test context is cancelled
// before unblocking. Under stress, the test may time out before the preemption
// check is called otherwise.
<-testPreemptedCh
<-ctx.Done()
}

err := runner.Run(ctx, []registry.TestSpec{mockTest}, 1, /* count */
defaultParallelism, copt, testOpts{}, lopt)

// No error: the polling-detected preemption alone must be enough to mark
// the run as an infra flake rather than a test failure.
require.NoError(t, err)
})

// Test that if the test hangs until timeout, a VM preemption will still be caught.
t.Run("test hangs and still catches preemption", func(t *testing.T) {
// We don't want the polling to cancel the test early.
setPollPreemptionInterval(10 * time.Minute)
// The post test failure check will report this preemption; here only the
// post-test path (not polling) should attribute the failure.
getPreemptedVMsHook = func(c cluster.Cluster, ctx context.Context, l *logger.Logger) ([]vm.PreemptedVM, error) {
preemptedVMs := []vm.PreemptedVM{{
Name: "test_node",
PreemptedAt: time.Now(),
}}
return preemptedVMs, nil
}

// Force a quick timeout so the hanging test body below trips it.
mockTest.Timeout = 10 * time.Millisecond
// We expect the following to occur:
// 1. The test blocks on the context, which is only cancelled when the test runner
// returns after test completion. This effectively blocks the test forever.
// 2. The test times out and the test runner marks it as failed.
// 3. Normally, this would result in a failed test and runner.Run returning an error.
// However, because we injected a preemption, the test runner marks it as a flake
// instead and returns no errors.
mockTest.Run = func(ctx context.Context, t test.Test, c cluster.Cluster) {
<-ctx.Done()
}

err := runner.Run(ctx, []registry.TestSpec{mockTest}, 1, /* count */
defaultParallelism, copt, testOpts{}, lopt)

// The preemption must override the timeout failure, so no error surfaces.
require.NoError(t, err)
})
}

0 comments on commit 15a3216

Please sign in to comment.