@@ -212,28 +212,7 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
212
212
rayDashboardClient .InitClient (rayJobInstance .Status .DashboardURL )
213
213
214
214
// Ensure k8s job has been created
215
- jobName , wasJobCreated , err := r .getOrCreateK8sJob (ctx , rayJobInstance , rayClusterInstance )
216
- if err != nil {
217
- return ctrl.Result {RequeueAfter : RayJobDefaultRequeueDuration }, err
218
- }
219
-
220
- if wasJobCreated {
221
- r .Log .Info ("K8s job successfully created" , "RayJob" , rayJobInstance .Name , "jobId" , jobName )
222
- r .Recorder .Eventf (rayJobInstance , corev1 .EventTypeNormal , "Created" , "Created k8s job %s" , jobName )
223
- } else {
224
- r .Log .Info ("K8s job successfully retrieved" , "RayJob" , rayJobInstance .Name , "jobId" , jobName )
225
- }
226
-
227
- // Check the status of the k8s job and update the RayJobInstance status accordingly.
228
- // Get the k8s job
229
- k8sJob := & batchv1.Job {}
230
- err = r .Client .Get (ctx , types.NamespacedName {Name : jobName , Namespace : rayJobInstance .Namespace }, k8sJob )
231
- if err != nil {
232
- if errors .IsNotFound (err ) {
233
- r .Log .Info ("Job not found" , "RayJob" , rayJobInstance .Name , "jobId" , jobName )
234
- return ctrl.Result {RequeueAfter : RayJobDefaultRequeueDuration }, err
235
- }
236
- r .Log .Error (err , "failed to get k8s job" )
215
+ if err := r .createK8sJobIfNeed (ctx , rayJobInstance , rayClusterInstance ); err != nil {
237
216
return ctrl.Result {RequeueAfter : RayJobDefaultRequeueDuration }, err
238
217
}
239
218
@@ -262,17 +241,13 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
262
241
// the RayJob is submitted against the RayCluster created by THIS job, then
263
242
// try to gracefully stop the Ray job and delete (suspend) the cluster
264
243
if rayJobInstance .Spec .Suspend && len (rayJobInstance .Spec .ClusterSelector ) == 0 {
265
- info , err := rayDashboardClient .GetJobInfo (ctx , rayJobInstance .Status .JobId )
266
- if err != nil {
267
- return ctrl.Result {RequeueAfter : RayJobDefaultRequeueDuration }, err
268
- }
269
- if ! rayv1 .IsJobTerminal (info .JobStatus ) {
244
+ if ! rayv1 .IsJobTerminal (jobInfo .JobStatus ) {
270
245
err := rayDashboardClient .StopJob (ctx , rayJobInstance .Status .JobId , & r .Log )
271
246
if err != nil {
272
247
return ctrl.Result {RequeueAfter : RayJobDefaultRequeueDuration }, err
273
248
}
274
249
}
275
- if info .JobStatus != rayv1 .JobStatusStopped {
250
+ if jobInfo .JobStatus != rayv1 .JobStatusStopped {
276
251
return ctrl.Result {RequeueAfter : RayJobDefaultRequeueDuration }, nil
277
252
}
278
253
@@ -325,6 +300,7 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
325
300
}
326
301
}
327
302
303
+ // TODO (kevin85421): Use the source of truth `jobInfo.JobStatus` instead of `rayJobInstance.Status.JobStatus`.
328
304
if isJobPendingOrRunning (rayJobInstance .Status .JobStatus ) {
329
305
// Requeue the RayJob to poll its status from the running Ray job
330
306
return ctrl.Result {RequeueAfter : RayJobDefaultRequeueDuration }, nil
@@ -334,8 +310,8 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
334
310
return ctrl.Result {}, nil
335
311
}
336
312
337
- // getOrCreateK8sJob creates a Kubernetes Job for the Ray Job if it doesn't exist, otherwise returns the existing one. It returns the Job name and a boolean indicating whether the Job was created .
338
- func (r * RayJobReconciler ) getOrCreateK8sJob (ctx context.Context , rayJobInstance * rayv1.RayJob , rayClusterInstance * rayv1.RayCluster ) ( string , bool , error ) {
313
+ // createK8sJobIfNeed creates a Kubernetes Job for the RayJob if it doesn't exist.
314
+ func (r * RayJobReconciler ) createK8sJobIfNeed (ctx context.Context , rayJobInstance * rayv1.RayJob , rayClusterInstance * rayv1.RayCluster ) error {
339
315
jobName := rayJobInstance .Name
340
316
jobNamespace := rayJobInstance .Namespace
341
317
@@ -346,18 +322,18 @@ func (r *RayJobReconciler) getOrCreateK8sJob(ctx context.Context, rayJobInstance
346
322
submitterTemplate , err := r .getSubmitterTemplate (rayJobInstance , rayClusterInstance )
347
323
if err != nil {
348
324
r .Log .Error (err , "failed to get submitter template" )
349
- return "" , false , err
325
+ return err
350
326
}
351
327
return r .createNewK8sJob (ctx , rayJobInstance , submitterTemplate )
352
328
}
353
329
354
330
// Some other error occurred while trying to get the Job
355
- r .Log .Error (err , "failed to get k8s Job" )
356
- return "" , false , err
331
+ r .Log .Error (err , "failed to get Kubernetes Job" )
332
+ return err
357
333
}
358
334
359
- // Job already exists, instead of returning an error we return a "success"
360
- return jobName , false , nil
335
+ r . Log . Info ( "Kubernetes Job already exists" , "RayJob" , rayJobInstance . Name , "Kubernetes Job" , job . Name )
336
+ return nil
361
337
}
362
338
363
339
// getSubmitterTemplate builds the submitter pod template for the Ray job.
@@ -399,8 +375,8 @@ func (r *RayJobReconciler) getSubmitterTemplate(rayJobInstance *rayv1.RayJob, ra
399
375
return submitterTemplate , nil
400
376
}
401
377
402
- // createNewK8sJob creates a new Kubernetes Job. It returns the Job's name and a boolean indicating whether a new Job was created .
403
- func (r * RayJobReconciler ) createNewK8sJob (ctx context.Context , rayJobInstance * rayv1.RayJob , submitterTemplate corev1.PodTemplateSpec ) ( string , bool , error ) {
378
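+ // createNewK8sJob creates a new Kubernetes Job for the RayJob. It returns a non-nil error on failure (e.g. when setting the controller reference or creating the Job fails).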
379
+ func (r * RayJobReconciler ) createNewK8sJob (ctx context.Context , rayJobInstance * rayv1.RayJob , submitterTemplate corev1.PodTemplateSpec ) error {
404
380
job := & batchv1.Job {
405
381
ObjectMeta : metav1.ObjectMeta {
406
382
Name : rayJobInstance .Name ,
@@ -422,17 +398,17 @@ func (r *RayJobReconciler) createNewK8sJob(ctx context.Context, rayJobInstance *
422
398
// Set the ownership in order to do the garbage collection by k8s.
423
399
if err := ctrl .SetControllerReference (rayJobInstance , job , r .Scheme ); err != nil {
424
400
r .Log .Error (err , "failed to set controller reference" )
425
- return "" , false , err
401
+ return err
426
402
}
427
403
428
404
// Create the Kubernetes Job
429
405
if err := r .Client .Create (ctx , job ); err != nil {
430
406
r .Log .Error (err , "failed to create k8s Job" )
431
- return "" , false , err
407
+ return err
432
408
}
433
-
434
- // Return the Job's name and true indicating a new job was created
435
- return job . Name , true , nil
409
+ r . Log . Info ( "Kubernetes Job created" , "RayJob" , rayJobInstance . Name , "Kubernetes Job" , job . Name )
410
+ r . Recorder . Eventf ( rayJobInstance , corev1 . EventTypeNormal , "Created" , "Created Kubernetes Job %s" , job . Name )
411
+ return nil
436
412
}
437
413
438
414
func (r * RayJobReconciler ) deleteCluster (ctx context.Context , rayJobInstance * rayv1.RayJob ) (reconcile.Result , error ) {
0 commit comments