@@ -14,7 +14,6 @@ import (
14
14
"time"
15
15
16
16
"k8s.io/apimachinery/pkg/api/meta"
17
- metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
18
17
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
19
18
"k8s.io/cli-runtime/pkg/genericclioptions"
20
19
"k8s.io/cli-runtime/pkg/genericiooptions"
@@ -34,9 +33,10 @@ import (
34
33
)
35
34
36
35
// Package-level constants; the numeric ones are untyped floating-point values.
//
//   - dashboardAddr: an HTTP URL on localhost, port 8265.
//     NOTE(review): presumably the locally forwarded Ray dashboard endpoint; confirm at its use sites.
//   - clusterTimeout: the polling loop removed in this diff multiplied it by
//     time.Second, so it was interpreted as a number of seconds there.
//   - portforwardtimeout: NOTE(review): presumably also a number of seconds; confirm at its use sites.
//   - rayjobDeletionTimeout: passed as the final argument to
//     WaitRayJobDeletionPolicyEnabled in Run when a deletion policy is set.
//     NOTE(review): unit is presumably seconds; confirm in that helper.
const (
37
- dashboardAddr = "http://localhost:8265"
38
- clusterTimeout = 120.0
39
- portforwardtimeout = 60.0
36
+ dashboardAddr = "http://localhost:8265"
37
+ clusterTimeout = 120.0
38
+ portforwardtimeout = 60.0
39
+ rayjobDeletionTimeout = 30.0
40
40
)
41
41
42
42
type SubmitJobOptions struct {
@@ -336,44 +336,18 @@ func (options *SubmitJobOptions) Run(ctx context.Context, factory cmdutil.Factor
336
336
}
337
337
fmt .Printf ("Submitted RayJob %s.\n " , options .RayJob .GetName ())
338
338
339
- // Continuously checks for Kubernetes events related to the RayJobDeletionPolicy.
340
- // If an event indicates that the RayJobDeletionPolicy feature gate must be enabled, throw an error and delete the RayJob.
341
- go func () {
342
- ticker := time .NewTicker (5 * time .Second )
343
- defer ticker .Stop ()
344
-
345
- for {
346
- select {
347
- case <- time .After (clusterTimeout * time .Second ):
348
- return
349
- case <- ticker .C :
350
- eventList , err := k8sClients .KubernetesClient ().CoreV1 ().Events (* options .configFlags .Namespace ).List (ctx , metav1.ListOptions {
351
- FieldSelector : fmt .Sprintf ("involvedObject.name=%s" , options .RayJob .GetName ()),
352
- })
353
- if err != nil {
354
- fmt .Printf ("Error listing events: %v\n " , err )
355
- return
356
- }
357
-
358
- // Check for error events related to RayJobDeletionPolicy feature gate
359
- for _ , event := range eventList .Items {
360
- if strings .Contains (event .Message , "RayJobDeletionPolicy feature gate must be enabled to use the DeletionPolicy feature" ) {
361
- if event .FirstTimestamp .Time .After (startTime ) || event .LastTimestamp .Time .After (startTime ) {
362
- fmt .Printf ("Deleting RayJob...\n " )
363
- err = k8sClients .RayClient ().RayV1 ().RayJobs (* options .configFlags .Namespace ).Delete (ctx , options .RayJob .GetName (), v1.DeleteOptions {})
364
- if err != nil {
365
- fmt .Printf ("Failed to clean up Ray job: %v\n " , err )
366
- } else {
367
- fmt .Printf ("Cleaned Up RayJob: %s\n " , options .RayJob .GetName ())
368
- }
369
- log .Fatalf ("%s" , event .Message )
370
- }
371
- }
372
- }
373
- return
339
+ if options .deletionPolicy != "" {
340
+ err = k8sClients .WaitRayJobDeletionPolicyEnabled (ctx , * options .configFlags .Namespace , options .RayJob .Name , startTime , rayjobDeletionTimeout )
341
+ if err != nil {
342
+ fmt .Printf ("Deleting RayJob...\n " )
343
+ deleteErr := k8sClients .RayClient ().RayV1 ().RayJobs (* options .configFlags .Namespace ).Delete (ctx , options .RayJob .GetName (), v1.DeleteOptions {})
344
+ if deleteErr != nil {
345
+ return fmt .Errorf ("Failed to clean up Ray job after time out.: %w" , deleteErr )
374
346
}
347
+ fmt .Printf ("Cleaned Up RayJob: %s\n " , options .RayJob .GetName ())
348
+ return fmt .Errorf ("%w" , err )
375
349
}
376
- }()
350
+ }
377
351
378
352
if len (options .RayJob .GetName ()) > 0 {
379
353
// Add timeout?
0 commit comments