@@ -326,15 +326,41 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
326
326
return r .updateStatus (ctx , aw , workloadv1beta2 .AppWrapperResuming )
327
327
328
328
case workloadv1beta2 .AppWrapperFailed :
329
+ // Support for debugging failed jobs.
330
+ // When an appwrapper is annotated with a non-zero debugging delay,
331
+ // we hold quota for the delay period and do not delete the resources of
332
+ // a failed appwrapper unless Kueue preempts it by setting Suspend to true.
333
+ deletionDelay := r .debuggingFailureDeletionDelay (ctx , aw )
334
+
335
+ if deletionDelay > 0 && ! aw .Spec .Suspend {
336
+ meta .SetStatusCondition (& aw .Status .Conditions , metav1.Condition {
337
+ Type : string (workloadv1beta2 .DeletingResources ),
338
+ Status : metav1 .ConditionFalse ,
339
+ Reason : "DeletionPaused" ,
340
+ Message : fmt .Sprintf ("%v has value %v" , workloadv1beta2 .DebuggingFailureDeletionDelayDurationAnnotation , deletionDelay ),
341
+ })
342
+ whenDelayed := meta .FindStatusCondition (aw .Status .Conditions , string (workloadv1beta2 .DeletingResources )).LastTransitionTime
343
+
344
+ now := time .Now ()
345
+ deadline := whenDelayed .Add (deletionDelay )
346
+ if now .Before (deadline ) {
347
+ return ctrl.Result {RequeueAfter : deadline .Sub (now )}, r .Status ().Update (ctx , aw )
348
+ }
349
+ }
350
+
329
351
if meta .IsStatusConditionTrue (aw .Status .Conditions , string (workloadv1beta2 .ResourcesDeployed )) {
330
352
if ! r .deleteComponents (ctx , aw ) {
331
353
return ctrl.Result {RequeueAfter : 5 * time .Second }, nil
332
354
}
355
+ msg := "Resources deleted for failed AppWrapper"
356
+ if deletionDelay > 0 && aw .Spec .Suspend {
357
+ msg = "Kueue forced resource deletion by suspending AppWrapper"
358
+ }
333
359
meta .SetStatusCondition (& aw .Status .Conditions , metav1.Condition {
334
360
Type : string (workloadv1beta2 .ResourcesDeployed ),
335
361
Status : metav1 .ConditionFalse ,
336
362
Reason : string (workloadv1beta2 .AppWrapperFailed ),
337
- Message : "Resources deleted for failed AppWrapper" ,
363
+ Message : msg ,
338
364
})
339
365
}
340
366
meta .SetStatusCondition (& aw .Status .Conditions , metav1.Condition {
@@ -393,26 +419,36 @@ func (r *AppWrapperReconciler) workloadStatus(ctx context.Context, aw *workloadv
393
419
return summary , nil
394
420
}
395
421
422
+ func (r * AppWrapperReconciler ) limitDuration (desired time.Duration ) time.Duration {
423
+ if desired < 0 {
424
+ return 0 * time .Second
425
+ } else if desired > r .Config .FaultTolerance .GracePeriodCeiling {
426
+ return r .Config .FaultTolerance .GracePeriodCeiling
427
+ } else {
428
+ return desired
429
+ }
430
+ }
431
+
396
432
func (r * AppWrapperReconciler ) warmupGraceDuration (ctx context.Context , aw * workloadv1beta2.AppWrapper ) time.Duration {
397
433
if userPeriod , ok := aw .Annotations [workloadv1beta2 .WarmupGracePeriodDurationAnnotation ]; ok {
398
434
if duration , err := time .ParseDuration (userPeriod ); err == nil {
399
- return duration
435
+ return r . limitDuration ( duration )
400
436
} else {
401
437
log .FromContext (ctx ).Info ("Malformed warmup period annotation" , "annotation" , userPeriod , "error" , err )
402
438
}
403
439
}
404
- return r .Config .FaultTolerance .WarmupGracePeriod
440
+ return r .limitDuration ( r . Config .FaultTolerance .WarmupGracePeriod )
405
441
}
406
442
407
443
func (r * AppWrapperReconciler ) failureGraceDuration (ctx context.Context , aw * workloadv1beta2.AppWrapper ) time.Duration {
408
444
if userPeriod , ok := aw .Annotations [workloadv1beta2 .FailureGracePeriodDurationAnnotation ]; ok {
409
445
if duration , err := time .ParseDuration (userPeriod ); err == nil {
410
- return duration
446
+ return r . limitDuration ( duration )
411
447
} else {
412
448
log .FromContext (ctx ).Info ("Malformed grace period annotation" , "annotation" , userPeriod , "error" , err )
413
449
}
414
450
}
415
- return r .Config .FaultTolerance .FailureGracePeriod
451
+ return r .limitDuration ( r . Config .FaultTolerance .FailureGracePeriod )
416
452
}
417
453
418
454
func (r * AppWrapperReconciler ) retryLimit (ctx context.Context , aw * workloadv1beta2.AppWrapper ) int32 {
@@ -429,12 +465,34 @@ func (r *AppWrapperReconciler) retryLimit(ctx context.Context, aw *workloadv1bet
429
465
func (r * AppWrapperReconciler ) resettingPauseDuration (ctx context.Context , aw * workloadv1beta2.AppWrapper ) time.Duration {
430
466
if userPeriod , ok := aw .Annotations [workloadv1beta2 .ResetPauseDurationAnnotation ]; ok {
431
467
if duration , err := time .ParseDuration (userPeriod ); err == nil {
432
- return duration
468
+ return r . limitDuration ( duration )
433
469
} else {
434
470
log .FromContext (ctx ).Info ("Malformed reset pause annotation" , "annotation" , userPeriod , "error" , err )
435
471
}
436
472
}
437
- return r .Config .FaultTolerance .ResetPause
473
+ return r .limitDuration (r .Config .FaultTolerance .ResetPause )
474
+ }
475
+
476
+ func (r * AppWrapperReconciler ) deletionGraceDuration (ctx context.Context , aw * workloadv1beta2.AppWrapper ) time.Duration {
477
+ if userPeriod , ok := aw .Annotations [workloadv1beta2 .DeletionGracePeriodAnnotation ]; ok {
478
+ if duration , err := time .ParseDuration (userPeriod ); err == nil {
479
+ return r .limitDuration (duration )
480
+ } else {
481
+ log .FromContext (ctx ).Info ("Malformed deletion period annotation" , "annotation" , userPeriod , "error" , err )
482
+ }
483
+ }
484
+ return r .limitDuration (r .Config .FaultTolerance .DeletionGracePeriod )
485
+ }
486
+
487
+ func (r * AppWrapperReconciler ) debuggingFailureDeletionDelay (ctx context.Context , aw * workloadv1beta2.AppWrapper ) time.Duration {
488
+ if userPeriod , ok := aw .Annotations [workloadv1beta2 .DebuggingFailureDeletionDelayDurationAnnotation ]; ok {
489
+ if duration , err := time .ParseDuration (userPeriod ); err == nil {
490
+ return r .limitDuration (duration )
491
+ } else {
492
+ log .FromContext (ctx ).Info ("Malformed delay deletion annotation" , "annotation" , userPeriod , "error" , err )
493
+ }
494
+ }
495
+ return 0 * time .Second
438
496
}
439
497
440
498
func clearCondition (aw * workloadv1beta2.AppWrapper , condition workloadv1beta2.AppWrapperCondition , reason string , message string ) {
0 commit comments