From b5be747e19f9640cf9bab22a56dd309f2c6fe0ca Mon Sep 17 00:00:00 2001 From: Rueian Date: Sat, 16 Dec 2023 00:08:52 +0800 Subject: [PATCH] [Refactor] Do not use RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV as timeout of status check in tests It is only an insurance to keep reconciliation continuously triggered to hopefully fix an unexpected state. Relying on it to fix state transitions is not a good pratice. Therefore, We should not reference it in tests at all. --- .../controllers/ray/raycluster_controller_test.go | 8 ++++---- ray-operator/controllers/ray/rayjob_controller_test.go | 2 +- ray-operator/controllers/ray/suite_test.go | 5 ++++- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/ray-operator/controllers/ray/raycluster_controller_test.go b/ray-operator/controllers/ray/raycluster_controller_test.go index bfd46bb1669..8cb7186bc16 100644 --- a/ray-operator/controllers/ray/raycluster_controller_test.go +++ b/ray-operator/controllers/ray/raycluster_controller_test.go @@ -205,7 +205,7 @@ var _ = Context("Inside the default namespace", func() { // We need to figure out the behavior. See https://github.com/ray-project/kuberay/issues/1736 for more details. Eventually( getClusterState(ctx, "default", myRayCluster.Name), - time.Second*(utils.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS+5), time.Millisecond*500).Should(Equal(rayv1.Ready)) + time.Second*15, time.Millisecond*500).Should(Equal(rayv1.Ready)) }) It("should re-create a deleted worker", func() { @@ -311,7 +311,7 @@ var _ = Context("Inside the default namespace", func() { It("cluster's .status.state should be updated to 'suspended' shortly after all Pods are terminated", func() { Eventually( getClusterState(ctx, "default", myRayCluster.Name), - time.Second*(utils.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS+5), time.Millisecond*500).Should(Equal(rayv1.Suspended)) + time.Second*15, time.Millisecond*500).Should(Equal(rayv1.Suspended)) }) It("set suspend to false and then revert it to true before all Pods are running", func() { @@ -363,7 +363,7 @@ var _ = Context("Inside the default namespace", func() { // RayCluster should be in Suspended state. Eventually( getClusterState(ctx, "default", myRayCluster.Name), - time.Second*(utils.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS+5), time.Millisecond*500).Should(Equal(rayv1.Suspended)) + time.Second*15, time.Millisecond*500).Should(Equal(rayv1.Suspended)) }) It("should run all head and worker pods if un-suspended", func() { @@ -402,7 +402,7 @@ var _ = Context("Inside the default namespace", func() { It("cluster's .status.state should be updated back to 'ready' after being un-suspended", func() { Eventually( getClusterState(ctx, "default", myRayCluster.Name), - time.Second*(utils.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS+5), time.Millisecond*500).Should(Equal(rayv1.Ready)) + time.Second*15, time.Millisecond*500).Should(Equal(rayv1.Ready)) }) }) }) diff --git a/ray-operator/controllers/ray/rayjob_controller_test.go b/ray-operator/controllers/ray/rayjob_controller_test.go index 38f16e9cef5..c37720b2139 100644 --- a/ray-operator/controllers/ray/rayjob_controller_test.go +++ b/ray-operator/controllers/ray/rayjob_controller_test.go @@ -270,7 +270,7 @@ var _ = Context("Inside the default namespace", func() { // The RayCluster.Status.State should be Ready. Eventually( getClusterState(ctx, "default", myRayCluster.Name), - time.Second*(utils.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS+5), time.Millisecond*500).Should(Equal(rayv1.Ready)) + time.Second*15, time.Millisecond*500).Should(Equal(rayv1.Ready)) }) It("Dashboard URL should be set", func() { diff --git a/ray-operator/controllers/ray/suite_test.go b/ray-operator/controllers/ray/suite_test.go index f15cd497f6a..884cbe768d4 100644 --- a/ray-operator/controllers/ray/suite_test.go +++ b/ray-operator/controllers/ray/suite_test.go @@ -79,7 +79,10 @@ var _ = BeforeSuite(func(ctx SpecContext) { Expect(err).ToNot(HaveOccurred()) Expect(k8sClient).ToNot(BeNil()) - // Suggested way to run tests + // The RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV is an insurance to keep reconciliation continuously triggered to hopefully fix an unexpected state. + // In a production environment, the requeue period is set to five minutes by default, which is relatively infrequent. + // TODO: We probably should not shorten RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV here just to make tests pass. + // Instead, we should fix the reconciliation if any unexpected happened. os.Setenv(utils.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV, "10") mgr, err := ctrl.NewManager(cfg, ctrl.Options{ Scheme: scheme.Scheme,