Skip to content

Commit aebdb4d

Browse files
authored
Disable component-level failure detection for Ray (#174) (#177)
In KubeRay 1.1, status.state == "failed" is not a stable terminal state; therefore, we cannot treat it as a signal to initiate a resetOrFail operation on the AppWrapper.
1 parent 5a3de24 commit aebdb4d

File tree

1 file changed

+9
-0
lines changed

1 file changed

+9
-0
lines changed

internal/controller/appwrapper/appwrapper_controller.go

+9
Original file line numberDiff line numberDiff line change
@@ -583,6 +583,10 @@ func (r *AppWrapperReconciler) getComponentStatus(ctx context.Context, aw *workl
583583
if obj.GetDeletionTimestamp().IsZero() {
584584
summary.deployed += 1
585585

586+
/* Disabled because failed is not a terminal state.
587+
* We've observed a RayCluster transiently entering "failed" before becoming "ready" because its ingress was not yet ready.
588+
* TODO: Explore fixing in upstream projects.
589+
586590
// RayCluster is failed if status.State is "failed"
587591
status, ok := obj.UnstructuredContent()["status"]
588592
if !ok {
@@ -595,6 +599,7 @@ func (r *AppWrapperReconciler) getComponentStatus(ctx context.Context, aw *workl
595599
if state.(string) == "failed" {
596600
summary.failed += 1
597601
}
602+
*/
598603
}
599604
} else if !apierrors.IsNotFound(err) {
600605
return nil, err
@@ -608,6 +613,9 @@ func (r *AppWrapperReconciler) getComponentStatus(ctx context.Context, aw *workl
608613
if obj.GetDeletionTimestamp().IsZero() {
609614
summary.deployed += 1
610615

616+
/* Disabled because we are not sure if failed is a terminal state.
617+
* TODO: Determine whether or not RayJob has the same issue as RayCluster
618+
611619
// RayJob is failed if status.jobsStatus is "FAILED"
612620
status, ok := obj.UnstructuredContent()["status"]
613621
if !ok {
@@ -620,6 +628,7 @@ func (r *AppWrapperReconciler) getComponentStatus(ctx context.Context, aw *workl
620628
if jobStatus.(string) == "FAILED" {
621629
summary.failed += 1
622630
}
631+
*/
623632
}
624633
} else if !apierrors.IsNotFound(err) {
625634
return nil, err

0 commit comments

Comments
 (0)