Skip to content

Commit

Permalink
Merge #139075
Browse files Browse the repository at this point in the history
139075: roachtest: mark polled VM preemptions as non reportable r=srosenberg a=DarrylWong

Previously, polled VM preemptions would simply cancel the test, as post test processing would recheck for preemptions again. However, we've seen some cases in AWS where the post test check returns no preemptions despite the polling returning preemptions.

This may just be the AWS check being eventually consistent, so we want to avoid posting an issue if either check finds preemptions.

----

The second change resets failures in the case of a VM preemption, in case a timeout occurred, which normally takes precedence over all other failures. While a timeout suggests that something should be fixed with the test (usually respecting the test context cancellation), we see that in practice, engineers tend to close the issue without investigating as soon as they see the preemption.

This also removes the potential duplicate vm_preemption failure that may have been added by the preemption polling.

Fixes: #139004
Fixes: #139931
Release note: none
Epic: none

Co-authored-by: DarrylWong <[email protected]>
  • Loading branch information
craig[bot] and DarrylWong committed Feb 6, 2025
2 parents cb32622 + 031e055 commit 15a3216
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 4 deletions.
15 changes: 11 additions & 4 deletions pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -1164,6 +1164,9 @@ func (r *testRunner) runTest(
// Note that this error message is referred for test selection in
// pkg/cmd/roachtest/testselector/snowflake_query.sql.
failureMsg = fmt.Sprintf("VMs preempted during the test run: %s\n\n**Other Failures:**\n%s", preemptedVMNames, failureMsg)
// Reset the failures as a timeout may have suppressed failures, but we
// want to propagate the preemption error and avoid creating an issue.
t.resetFailures()
t.Error(vmPreemptionError(preemptedVMNames))
}
hostErrorVMNames := getHostErrorVMNames(ctx, c, l)
Expand Down Expand Up @@ -2171,11 +2174,15 @@ func monitorForPreemptedVMs(ctx context.Context, t test.Test, c cluster.Cluster,
continue
}

// If we find any preemptions, fail the test. Note that we will recheck for
// preemptions in post failure processing, which will correctly assign this
// failure as an infra flake.
// If we find any preemptions, fail the test. Note that while we will recheck for
// preemptions in post failure processing, we need to mark the test as a preemption
// failure here in case the recheck says there were no preemptions.
if len(preemptedVMs) != 0 {
t.Errorf("monitorForPreemptedVMs: Preempted VMs detected: %s", preemptedVMs)
var vmNames []string
for _, preemptedVM := range preemptedVMs {
vmNames = append(vmNames, preemptedVM.Name)
}
t.Errorf("monitorForPreemptedVMs detected VM Preemptions: %s", vmPreemptionError(getVMNames(vmNames)))
}
}
}
Expand Down
63 changes: 63 additions & 0 deletions pkg/cmd/roachtest/test_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -767,4 +767,67 @@ func TestVMPreemptionPolling(t *testing.T) {
// be treated as a flake instead of a failed test.
require.NoError(t, err)
})

// Test that if VM preemption polling finds a preempted VM but the post test failure
// check doesn't, the test is still marked as a flake.
t.Run("post test check doesn't catch preemption", func(t *testing.T) {
// Poll aggressively so the preemption is observed while the test body is
// still running.
setPollPreemptionInterval(10 * time.Millisecond)
testPreemptedCh := make(chan struct{})
// Simulate the polling check reporting a preempted VM. Closing
// testPreemptedCh signals the test body that polling has fired.
getPreemptedVMsHook = func(c cluster.Cluster, ctx context.Context, l *logger.Logger) ([]vm.PreemptedVM, error) {
preemptedVMs := []vm.PreemptedVM{{
Name: "test_node",
PreemptedAt: time.Now(),
}}
close(testPreemptedCh)
return preemptedVMs, nil
}

mockTest.Run = func(ctx context.Context, t test.Test, c cluster.Cluster) {
// Restore the hook on exit so the post test failure check sees NO
// preemptions — the exact mismatch this subtest exercises.
defer func() {
getPreemptedVMsHook = func(c cluster.Cluster, ctx context.Context, l *logger.Logger) ([]vm.PreemptedVM, error) {
return nil, nil
}
}()
// Make sure the preemption polling is called and the test context is cancelled
// before unblocking. Under stress, the test may time out before the preemption
// check is called otherwise.
<-testPreemptedCh
<-ctx.Done()
}

err := runner.Run(ctx, []registry.TestSpec{mockTest}, 1, /* count */
defaultParallelism, copt, testOpts{}, lopt)

// No error: the polling-detected preemption alone must be enough to mark
// the run as an infra flake rather than a test failure.
require.NoError(t, err)
})

// Test that if the test hangs until timeout, a VM preemption will still be caught.
t.Run("test hangs and still catches preemption", func(t *testing.T) {
// We don't want the polling to cancel the test early.
setPollPreemptionInterval(10 * time.Minute)
// The post test failure check will report this preemption; here only the
// post-test path (not polling) should attribute the failure.
getPreemptedVMsHook = func(c cluster.Cluster, ctx context.Context, l *logger.Logger) ([]vm.PreemptedVM, error) {
preemptedVMs := []vm.PreemptedVM{{
Name: "test_node",
PreemptedAt: time.Now(),
}}
return preemptedVMs, nil
}

// Force a quick timeout so the hanging test body below trips it.
mockTest.Timeout = 10 * time.Millisecond
// We expect the following to occur:
// 1. The test blocks on the context, which is only cancelled when the test runner
// returns after test completion. This effectively blocks the test forever.
// 2. The test times out and the test runner marks it as failed.
// 3. Normally, this would result in a failed test and runner.Run returning an error.
// However, because we injected a preemption, the test runner marks it as a flake
// instead and returns no errors.
mockTest.Run = func(ctx context.Context, t test.Test, c cluster.Cluster) {
<-ctx.Done()
}

err := runner.Run(ctx, []registry.TestSpec{mockTest}, 1, /* count */
defaultParallelism, copt, testOpts{}, lopt)

// The preemption must override the timeout failure, so no error surfaces.
require.NoError(t, err)
})
}

0 comments on commit 15a3216

Please sign in to comment.