Skip to content

Commit 62bbc13

Browse files
authored
[RayJob][Status][4/n] Remove some JobDeploymentStatus and updateState function calls (#1743)
1 parent f56c66f commit 62bbc13

File tree

5 files changed

+26
-105
lines changed

5 files changed

+26
-105
lines changed

ray-operator/apis/ray/v1/rayjob_types.go

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,10 @@ func IsJobTerminal(status JobStatus) bool {
3333
type JobDeploymentStatus string
3434

3535
const (
36-
JobDeploymentStatusInitializing JobDeploymentStatus = "Initializing"
37-
JobDeploymentStatusFailedToGetOrCreateRayCluster JobDeploymentStatus = "FailedToGetOrCreateRayCluster"
38-
JobDeploymentStatusWaitForK8sJob JobDeploymentStatus = "WaitForK8sJob"
39-
JobDeploymentStatusFailedJobDeploy JobDeploymentStatus = "FailedJobDeploy"
40-
JobDeploymentStatusRunning JobDeploymentStatus = "Running"
41-
JobDeploymentStatusFailedToGetJobStatus JobDeploymentStatus = "FailedToGetJobStatus"
42-
JobDeploymentStatusComplete JobDeploymentStatus = "Complete"
43-
JobDeploymentStatusSuspended JobDeploymentStatus = "Suspended"
36+
JobDeploymentStatusInitializing JobDeploymentStatus = "Initializing"
37+
JobDeploymentStatusRunning JobDeploymentStatus = "Running"
38+
JobDeploymentStatusComplete JobDeploymentStatus = "Complete"
39+
JobDeploymentStatusSuspended JobDeploymentStatus = "Suspended"
4440
)
4541

4642
// RayJobSpec defines the desired state of RayJob

ray-operator/apis/ray/v1alpha1/rayjob_types.go

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,10 @@ func IsJobTerminal(status JobStatus) bool {
3333
type JobDeploymentStatus string
3434

3535
const (
36-
JobDeploymentStatusInitializing JobDeploymentStatus = "Initializing"
37-
JobDeploymentStatusFailedToGetOrCreateRayCluster JobDeploymentStatus = "FailedToGetOrCreateRayCluster"
38-
JobDeploymentStatusWaitForK8sJob JobDeploymentStatus = "WaitForK8sJob"
39-
JobDeploymentStatusFailedJobDeploy JobDeploymentStatus = "FailedJobDeploy"
40-
JobDeploymentStatusRunning JobDeploymentStatus = "Running"
41-
JobDeploymentStatusFailedToGetJobStatus JobDeploymentStatus = "FailedToGetJobStatus"
42-
JobDeploymentStatusComplete JobDeploymentStatus = "Complete"
43-
JobDeploymentStatusSuspended JobDeploymentStatus = "Suspended"
36+
JobDeploymentStatusInitializing JobDeploymentStatus = "Initializing"
37+
JobDeploymentStatusRunning JobDeploymentStatus = "Running"
38+
JobDeploymentStatusComplete JobDeploymentStatus = "Complete"
39+
JobDeploymentStatusSuspended JobDeploymentStatus = "Suspended"
4440
)
4541

4642
// RayJobSpec defines the desired state of RayJob

ray-operator/controllers/ray/rayjob_controller.go

Lines changed: 17 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ import (
66
"time"
77

88
"github.com/go-logr/logr"
9-
fmtErrors "github.com/pkg/errors"
109
batchv1 "k8s.io/api/batch/v1"
1110
corev1 "k8s.io/api/core/v1"
1211
"k8s.io/apimachinery/pkg/api/errors"
@@ -146,14 +145,14 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
146145
if !errors.IsNotFound(err) {
147146
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
148147
}
149-
if err = r.updateState(ctx, rayJobInstance, nil, rayJobInstance.Status.JobStatus, rayv1.JobDeploymentStatusComplete, nil); err != nil {
148+
if err = r.updateState(ctx, rayJobInstance, nil, rayJobInstance.Status.JobStatus, rayv1.JobDeploymentStatusComplete); err != nil {
150149
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
151150
}
152151
return ctrl.Result{}, nil
153152
}
154153

155154
if rayClusterInstance.DeletionTimestamp != nil {
156-
if err = r.updateState(ctx, rayJobInstance, nil, rayJobInstance.Status.JobStatus, rayv1.JobDeploymentStatusComplete, nil); err != nil {
155+
if err = r.updateState(ctx, rayJobInstance, nil, rayJobInstance.Status.JobStatus, rayv1.JobDeploymentStatusComplete); err != nil {
157156
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
158157
}
159158
return ctrl.Result{}, nil
@@ -168,7 +167,6 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
168167

169168
var rayClusterInstance *rayv1.RayCluster
170169
if rayClusterInstance, err = r.getOrCreateRayClusterInstance(ctx, rayJobInstance); err != nil {
171-
err = r.updateState(ctx, rayJobInstance, nil, rayJobInstance.Status.JobStatus, rayv1.JobDeploymentStatusFailedToGetOrCreateRayCluster, err)
172170
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
173171
}
174172
// If there is no cluster instance and no error suspend the job deployment
@@ -177,7 +175,7 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
177175
if rayJobInstance.Status.JobDeploymentStatus == rayv1.JobDeploymentStatusSuspended {
178176
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
179177
}
180-
err = r.updateState(ctx, rayJobInstance, nil, rayJobInstance.Status.JobStatus, rayv1.JobDeploymentStatusSuspended, err)
178+
err = r.updateState(ctx, rayJobInstance, nil, rayJobInstance.Status.JobStatus, rayv1.JobDeploymentStatusSuspended)
181179
if err != nil {
182180
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
183181
}
@@ -216,7 +214,6 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
216214
// Ensure k8s job has been created
217215
jobName, wasJobCreated, err := r.getOrCreateK8sJob(ctx, rayJobInstance, rayClusterInstance)
218216
if err != nil {
219-
err = r.updateState(ctx, rayJobInstance, nil, rayJobInstance.Status.JobStatus, rayv1.JobDeploymentStatusFailedJobDeploy, err)
220217
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
221218
}
222219

@@ -234,27 +231,29 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
234231
if err != nil {
235232
if errors.IsNotFound(err) {
236233
r.Log.Info("Job not found", "RayJob", rayJobInstance.Name, "jobId", jobName)
237-
err = r.updateState(ctx, rayJobInstance, nil, rayJobInstance.Status.JobStatus, rayv1.JobDeploymentStatusWaitForK8sJob, err)
238234
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
239235
}
240236
r.Log.Error(err, "failed to get k8s job")
241-
err = r.updateState(ctx, rayJobInstance, nil, rayJobInstance.Status.JobStatus, rayv1.JobDeploymentStatusFailedToGetJobStatus, err)
242237
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
243238
}
244239

245240
// Check the current status of ray jobs
246241
jobInfo, err := rayDashboardClient.GetJobInfo(ctx, rayJobInstance.Status.JobId)
247242
if err != nil {
248243
r.Log.Error(err, "failed to get job info", "jobId", rayJobInstance.Status.JobId)
249-
err = r.updateState(ctx, rayJobInstance, jobInfo, rayJobInstance.Status.JobStatus, rayv1.JobDeploymentStatusFailedToGetJobStatus, err)
250244
// Dashboard service in head pod takes time to start, it's possible we get connection refused error.
251245
// Requeue after few seconds to avoid continuous connection errors.
252246
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
253247
}
254248

255-
// Update RayJob.Status (Kubernetes CR) from Ray Job Status from Dashboard service
256-
if r.shouldUpdateJobStatus(rayJobInstance.Status.JobStatus, rayJobInstance.Status.JobDeploymentStatus, jobInfo) {
257-
err = r.updateState(ctx, rayJobInstance, jobInfo, jobInfo.JobStatus, rayv1.JobDeploymentStatusRunning, nil)
249+
if jobInfo != nil {
250+
// TODO (kevin85421): `GetJobInfo` should not return both JobInfo and error with nil values,
251+
// but it does when the job is not found. This check is a workaround to avoid dereferencing
252+
// a nil pointer.
253+
err = r.updateState(ctx, rayJobInstance, jobInfo, jobInfo.JobStatus, rayv1.JobDeploymentStatusRunning)
254+
}
255+
256+
if err != nil {
258257
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
259258
}
260259

@@ -288,7 +287,7 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
288287
rayJobInstance.Status.DashboardURL = ""
289288
rayJobInstance.Status.JobId = ""
290289
rayJobInstance.Status.Message = ""
291-
err = r.updateState(ctx, rayJobInstance, jobInfo, rayv1.JobStatusStopped, rayv1.JobDeploymentStatusSuspended, nil)
290+
err = r.updateState(ctx, rayJobInstance, jobInfo, rayv1.JobStatusStopped, rayv1.JobDeploymentStatusSuspended)
292291
if err != nil {
293292
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
294293
}
@@ -520,22 +519,11 @@ func (r *RayJobReconciler) initRayJobStatusIfNeed(ctx context.Context, rayJob *r
520519
rayJob.Status.JobStatus = rayv1.JobStatusPending
521520
}
522521

523-
return r.updateState(ctx, rayJob, nil, rayJob.Status.JobStatus, rayv1.JobDeploymentStatusInitializing, nil)
524-
}
525-
526-
func (r *RayJobReconciler) shouldUpdateJobStatus(oldJobStatus rayv1.JobStatus, oldJobDeploymentStatus rayv1.JobDeploymentStatus, jobInfo *utils.RayJobInfo) bool {
527-
if jobInfo != nil {
528-
jobStatusChanged := (oldJobStatus != jobInfo.JobStatus)
529-
// If the status changed, or if we didn't have the status before and now we have it, update the status and deployment status.
530-
if jobStatusChanged || oldJobDeploymentStatus == rayv1.JobDeploymentStatusFailedToGetJobStatus {
531-
return true
532-
}
533-
}
534-
return false
522+
return r.updateState(ctx, rayJob, nil, rayJob.Status.JobStatus, rayv1.JobDeploymentStatusInitializing)
535523
}
536524

537525
// make sure the priority is correct
538-
func (r *RayJobReconciler) updateState(ctx context.Context, rayJob *rayv1.RayJob, jobInfo *utils.RayJobInfo, jobStatus rayv1.JobStatus, jobDeploymentStatus rayv1.JobDeploymentStatus, err error) error {
526+
func (r *RayJobReconciler) updateState(ctx context.Context, rayJob *rayv1.RayJob, jobInfo *utils.RayJobInfo, jobStatus rayv1.JobStatus, jobDeploymentStatus rayv1.JobDeploymentStatus) error {
539527
r.Log.Info("UpdateState", "oldJobStatus", rayJob.Status.JobStatus, "newJobStatus", jobStatus, "oldJobDeploymentStatus", rayJob.Status.JobDeploymentStatus, "newJobDeploymentStatus", jobDeploymentStatus)
540528

541529
// Let's skip update the APIServer if it's synced.
@@ -558,10 +546,10 @@ func (r *RayJobReconciler) updateState(ctx context.Context, rayJob *rayv1.RayJob
558546
// TODO (kevin85421): ObservedGeneration should be used to determine whether update this CR or not.
559547
rayJob.Status.ObservedGeneration = rayJob.ObjectMeta.Generation
560548

561-
if errStatus := r.Status().Update(ctx, rayJob); errStatus != nil {
562-
return fmtErrors.Errorf("combined error: %v %v", err, errStatus)
549+
if err := r.Status().Update(ctx, rayJob); err != nil {
550+
return err
563551
}
564-
return err
552+
return nil
565553
}
566554

567555
// TODO: select existing rayclusters by ClusterSelector
@@ -645,11 +633,6 @@ func (r *RayJobReconciler) getOrCreateRayClusterInstance(ctx context.Context, ra
645633
return nil, err
646634
}
647635

648-
// special case: is the job is complete status and cluster has been recycled.
649-
if isJobSucceedOrFail(rayJobInstance.Status.JobStatus) && rayJobInstance.Status.JobDeploymentStatus == rayv1.JobDeploymentStatusComplete {
650-
r.Log.Info("The cluster has been recycled for the job, skip duplicate creation", "rayjob", rayJobInstance.Name)
651-
return nil, err
652-
}
653636
// special case: don't create a cluster instance and don't return an error if the suspend flag of the job is true
654637
if rayJobInstance.Spec.Suspend {
655638
return nil, nil

ray-operator/controllers/ray/rayjob_controller_unit_test.go

Lines changed: 0 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -176,57 +176,3 @@ func TestGetSubmitterTemplate(t *testing.T) {
176176
}
177177
assert.True(t, found)
178178
}
179-
180-
func TestShouldUpdateJobStatus(t *testing.T) {
181-
r := &RayJobReconciler{}
182-
183-
tests := []struct {
184-
name string
185-
oldJobStatus rayv1.JobStatus
186-
oldJobDeploymentStatus rayv1.JobDeploymentStatus
187-
jobInfo *utils.RayJobInfo
188-
expectedShouldUpdate bool
189-
}{
190-
{
191-
name: "jobInfo is nil",
192-
oldJobStatus: rayv1.JobStatusPending,
193-
oldJobDeploymentStatus: rayv1.JobDeploymentStatusRunning,
194-
jobInfo: nil,
195-
expectedShouldUpdate: false,
196-
},
197-
{
198-
name: "job status changed",
199-
oldJobStatus: rayv1.JobStatusRunning,
200-
oldJobDeploymentStatus: rayv1.JobDeploymentStatusRunning,
201-
jobInfo: &utils.RayJobInfo{
202-
JobStatus: rayv1.JobStatusStopped,
203-
},
204-
expectedShouldUpdate: true,
205-
},
206-
{
207-
name: "job status same but JobDeploymentStatus failed to get status",
208-
oldJobStatus: rayv1.JobStatusRunning,
209-
oldJobDeploymentStatus: rayv1.JobDeploymentStatusFailedToGetJobStatus,
210-
jobInfo: &utils.RayJobInfo{
211-
JobStatus: rayv1.JobStatusRunning,
212-
},
213-
expectedShouldUpdate: true,
214-
},
215-
{
216-
name: "job status same and JobDeploymentStatus not failed to get status",
217-
oldJobStatus: rayv1.JobStatusRunning,
218-
oldJobDeploymentStatus: rayv1.JobDeploymentStatusRunning,
219-
jobInfo: &utils.RayJobInfo{
220-
JobStatus: rayv1.JobStatusRunning,
221-
},
222-
expectedShouldUpdate: false,
223-
},
224-
}
225-
226-
for _, tt := range tests {
227-
t.Run(tt.name, func(t *testing.T) {
228-
result := r.shouldUpdateJobStatus(tt.oldJobStatus, tt.oldJobDeploymentStatus, tt.jobInfo)
229-
assert.Equal(t, tt.expectedShouldUpdate, result)
230-
})
231-
}
232-
}

ray-operator/controllers/ray/utils/dashboard_httpclient.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -481,7 +481,7 @@ func (r *RayDashboardClient) GetJobLog(ctx context.Context, jobName string, log
481481
var jobLog RayJobLogsResponse
482482
if err = json.Unmarshal(body, &jobLog); err != nil {
483483
// Maybe body is not valid json, raise an error with the body.
484-
return nil, fmt.Errorf("GetJobInfo fail: %s", string(body))
484+
return nil, fmt.Errorf("GetJobLog fail: %s", string(body))
485485
}
486486

487487
return &jobLog.Logs, nil

0 commit comments

Comments
 (0)