diff --git a/ray-operator/controllers/ray/raycluster_controller_test.go b/ray-operator/controllers/ray/raycluster_controller_test.go index bfd46bb1669..8cb7186bc16 100644 --- a/ray-operator/controllers/ray/raycluster_controller_test.go +++ b/ray-operator/controllers/ray/raycluster_controller_test.go @@ -205,7 +205,7 @@ var _ = Context("Inside the default namespace", func() { // We need to figure out the behavior. See https://github.com/ray-project/kuberay/issues/1736 for more details. Eventually( getClusterState(ctx, "default", myRayCluster.Name), - time.Second*(utils.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS+5), time.Millisecond*500).Should(Equal(rayv1.Ready)) + time.Second*15, time.Millisecond*500).Should(Equal(rayv1.Ready)) }) It("should re-create a deleted worker", func() { @@ -311,7 +311,7 @@ var _ = Context("Inside the default namespace", func() { It("cluster's .status.state should be updated to 'suspended' shortly after all Pods are terminated", func() { Eventually( getClusterState(ctx, "default", myRayCluster.Name), - time.Second*(utils.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS+5), time.Millisecond*500).Should(Equal(rayv1.Suspended)) + time.Second*15, time.Millisecond*500).Should(Equal(rayv1.Suspended)) }) It("set suspend to false and then revert it to true before all Pods are running", func() { @@ -363,7 +363,7 @@ var _ = Context("Inside the default namespace", func() { // RayCluster should be in Suspended state. Eventually( getClusterState(ctx, "default", myRayCluster.Name), - time.Second*(utils.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS+5), time.Millisecond*500).Should(Equal(rayv1.Suspended)) + time.Second*15, time.Millisecond*500).Should(Equal(rayv1.Suspended)) }) It("should run all head and worker pods if un-suspended", func() { @@ -402,7 +402,7 @@ var _ = Context("Inside the default namespace", func() { It("cluster's .status.state should be updated back to 'ready' after being un-suspended", func() { Eventually( getClusterState(ctx, "default", myRayCluster.Name), - time.Second*(utils.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS+5), time.Millisecond*500).Should(Equal(rayv1.Ready)) + time.Second*15, time.Millisecond*500).Should(Equal(rayv1.Ready)) }) }) }) diff --git a/ray-operator/controllers/ray/rayjob_controller_test.go b/ray-operator/controllers/ray/rayjob_controller_test.go index 38f16e9cef5..c37720b2139 100644 --- a/ray-operator/controllers/ray/rayjob_controller_test.go +++ b/ray-operator/controllers/ray/rayjob_controller_test.go @@ -270,7 +270,7 @@ var _ = Context("Inside the default namespace", func() { // The RayCluster.Status.State should be Ready. Eventually( getClusterState(ctx, "default", myRayCluster.Name), - time.Second*(utils.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS+5), time.Millisecond*500).Should(Equal(rayv1.Ready)) + time.Second*15, time.Millisecond*500).Should(Equal(rayv1.Ready)) }) It("Dashboard URL should be set", func() { diff --git a/ray-operator/controllers/ray/suite_test.go b/ray-operator/controllers/ray/suite_test.go index f15cd497f6a..0c6d9bb26a1 100644 --- a/ray-operator/controllers/ray/suite_test.go +++ b/ray-operator/controllers/ray/suite_test.go @@ -79,7 +79,9 @@ var _ = BeforeSuite(func(ctx SpecContext) { Expect(err).ToNot(HaveOccurred()) Expect(k8sClient).ToNot(BeNil()) - // Suggested way to run tests + // The RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV is an insurance to keep reconciliation continuously triggered to hopefully fix an unexpected state. + // In a production environment, the requeue period is set to five minutes by default, which is relatively infrequent. + // TODO: We probably should not shorten RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV here just to make tests pass. We should fix the reconciliation instead. os.Setenv(utils.RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV, "10") mgr, err := ctrl.NewManager(cfg, ctrl.Options{ Scheme: scheme.Scheme,