Skip to content

Commit 4bfe0b1

Browse files
authored
adjust names of grace period annotations (#128)
1 parent 335b734 commit 4bfe0b1

File tree

8 files changed

+100
-103
lines changed

8 files changed

+100
-103
lines changed

api/v1beta2/appwrapper_types.go

+8-8
Original file line numberDiff line numberDiff line change
@@ -133,14 +133,14 @@ const (
133133
)
134134

135135
const (
136-
AdmissionGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/admissionGracePeriodDuration"
137-
WarmupGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/warmupGracePeriodDuration"
138-
FailureGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/failureGracePeriodDuration"
139-
ResetPauseDurationAnnotation = "workload.codeflare.dev.appwrapper/resetPauseDuration"
140-
RetryLimitAnnotation = "workload.codeflare.dev.appwrapper/retryLimit"
141-
DeletionGracePeriodAnnotation = "workload.codeflare.dev.appwrapper/deletionGracePeriodDuration"
142-
DebuggingFailureDeletionDelayDurationAnnotation = "workload.codeflare.dev.appwrapper/debuggingFailureDeletionDelayDuration"
143-
SuccessTTLDurationAnnotation = "workload.codeflare.dev.appwrapper/successTTLDuration"
136+
AdmissionGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/admissionGracePeriodDuration"
137+
WarmupGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/warmupGracePeriodDuration"
138+
FailureGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/failureGracePeriodDuration"
139+
RetryPausePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/retryPausePeriodDuration"
140+
RetryLimitAnnotation = "workload.codeflare.dev.appwrapper/retryLimit"
141+
ForcefulDeletionGracePeriodAnnotation = "workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration"
142+
DeletionOnFailureGracePeriodAnnotation = "workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration"
143+
SuccessTTLAnnotation = "workload.codeflare.dev.appwrapper/successTTLDuration"
144144
)
145145

146146
//+kubebuilder:object:root=true

internal/controller/appwrapper/appwrapper_controller.go

+21-21
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,7 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
322322

323323
// Pause before transitioning to Resuming to heuristically allow transient system problems to subside
324324
whenReset := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.Unhealthy)).LastTransitionTime
325-
pauseDuration := r.resettingPauseDuration(ctx, aw)
325+
pauseDuration := r.retryPauseDuration(ctx, aw)
326326
now := time.Now()
327327
deadline := whenReset.Add(pauseDuration)
328328
if now.Before(deadline) {
@@ -342,14 +342,14 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
342342
// When an appwrapper is annotated with a non-zero debugging delay,
343343
// we hold quota for the delay period and do not delete the resources of
344344
// a failed appwrapper unless Kueue preempts it by setting Suspend to true.
345-
deletionDelay := r.debuggingFailureDeletionDelay(ctx, aw)
345+
deletionDelay := r.deletionOnFailureGraceDuration(ctx, aw)
346346

347347
if deletionDelay > 0 && !aw.Spec.Suspend {
348348
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
349349
Type: string(workloadv1beta2.DeletingResources),
350350
Status: metav1.ConditionFalse,
351351
Reason: "DeletionPaused",
352-
Message: fmt.Sprintf("%v has value %v", workloadv1beta2.DebuggingFailureDeletionDelayDurationAnnotation, deletionDelay),
352+
Message: fmt.Sprintf("%v has value %v", workloadv1beta2.DeletionOnFailureGracePeriodAnnotation, deletionDelay),
353353
})
354354
whenDelayed := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.DeletingResources)).LastTransitionTime
355355

@@ -457,8 +457,8 @@ func (r *AppWrapperReconciler) workloadStatus(ctx context.Context, aw *workloadv
457457
func (r *AppWrapperReconciler) limitDuration(desired time.Duration) time.Duration {
458458
if desired < 0 {
459459
return 0 * time.Second
460-
} else if desired > r.Config.FaultTolerance.GracePeriodCeiling {
461-
return r.Config.FaultTolerance.GracePeriodCeiling
460+
} else if desired > r.Config.FaultTolerance.GracePeriodMaximum {
461+
return r.Config.FaultTolerance.GracePeriodMaximum
462462
} else {
463463
return desired
464464
}
@@ -469,7 +469,7 @@ func (r *AppWrapperReconciler) admissionGraceDuration(ctx context.Context, aw *w
469469
if duration, err := time.ParseDuration(userPeriod); err == nil {
470470
return r.limitDuration(duration)
471471
} else {
472-
log.FromContext(ctx).Info("Malformed warmup period annotation", "annotation", userPeriod, "error", err)
472+
log.FromContext(ctx).Info("Malformed admission grace period annotation", "annotation", userPeriod, "error", err)
473473
}
474474
}
475475
return r.limitDuration(r.Config.FaultTolerance.AdmissionGracePeriod)
@@ -480,7 +480,7 @@ func (r *AppWrapperReconciler) warmupGraceDuration(ctx context.Context, aw *work
480480
if duration, err := time.ParseDuration(userPeriod); err == nil {
481481
return r.limitDuration(duration)
482482
} else {
483-
log.FromContext(ctx).Info("Malformed warmup period annotation", "annotation", userPeriod, "error", err)
483+
log.FromContext(ctx).Info("Malformed warmup grace period annotation", "annotation", userPeriod, "error", err)
484484
}
485485
}
486486
return r.limitDuration(r.Config.FaultTolerance.WarmupGracePeriod)
@@ -508,50 +508,50 @@ func (r *AppWrapperReconciler) retryLimit(ctx context.Context, aw *workloadv1bet
508508
return r.Config.FaultTolerance.RetryLimit
509509
}
510510

511-
func (r *AppWrapperReconciler) resettingPauseDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
512-
if userPeriod, ok := aw.Annotations[workloadv1beta2.ResetPauseDurationAnnotation]; ok {
511+
func (r *AppWrapperReconciler) retryPauseDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
512+
if userPeriod, ok := aw.Annotations[workloadv1beta2.RetryPausePeriodDurationAnnotation]; ok {
513513
if duration, err := time.ParseDuration(userPeriod); err == nil {
514514
return r.limitDuration(duration)
515515
} else {
516-
log.FromContext(ctx).Info("Malformed reset pause annotation", "annotation", userPeriod, "error", err)
516+
log.FromContext(ctx).Info("Malformed retry pause annotation", "annotation", userPeriod, "error", err)
517517
}
518518
}
519-
return r.limitDuration(r.Config.FaultTolerance.ResetPause)
519+
return r.limitDuration(r.Config.FaultTolerance.RetryPausePeriod)
520520
}
521521

522-
func (r *AppWrapperReconciler) deletionGraceDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
523-
if userPeriod, ok := aw.Annotations[workloadv1beta2.DeletionGracePeriodAnnotation]; ok {
522+
func (r *AppWrapperReconciler) forcefulDeletionGraceDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
523+
if userPeriod, ok := aw.Annotations[workloadv1beta2.ForcefulDeletionGracePeriodAnnotation]; ok {
524524
if duration, err := time.ParseDuration(userPeriod); err == nil {
525525
return r.limitDuration(duration)
526526
} else {
527-
log.FromContext(ctx).Info("Malformed deletion period annotation", "annotation", userPeriod, "error", err)
527+
log.FromContext(ctx).Info("Malformed forceful deletion period annotation", "annotation", userPeriod, "error", err)
528528
}
529529
}
530-
return r.limitDuration(r.Config.FaultTolerance.DeletionGracePeriod)
530+
return r.limitDuration(r.Config.FaultTolerance.ForcefulDeletionGracePeriod)
531531
}
532532

533-
func (r *AppWrapperReconciler) debuggingFailureDeletionDelay(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
534-
if userPeriod, ok := aw.Annotations[workloadv1beta2.DebuggingFailureDeletionDelayDurationAnnotation]; ok {
533+
func (r *AppWrapperReconciler) deletionOnFailureGraceDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
534+
if userPeriod, ok := aw.Annotations[workloadv1beta2.DeletionOnFailureGracePeriodAnnotation]; ok {
535535
if duration, err := time.ParseDuration(userPeriod); err == nil {
536536
return r.limitDuration(duration)
537537
} else {
538-
log.FromContext(ctx).Info("Malformed delay deletion annotation", "annotation", userPeriod, "error", err)
538+
log.FromContext(ctx).Info("Malformed deletion on failure delay annotation", "annotation", userPeriod, "error", err)
539539
}
540540
}
541541
return 0 * time.Second
542542
}
543543

544544
func (r *AppWrapperReconciler) timeToLiveAfterSucceededDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
545-
if userPeriod, ok := aw.Annotations[workloadv1beta2.SuccessTTLDurationAnnotation]; ok {
545+
if userPeriod, ok := aw.Annotations[workloadv1beta2.SuccessTTLAnnotation]; ok {
546546
if duration, err := time.ParseDuration(userPeriod); err == nil {
547-
if duration > 0 && duration < r.Config.FaultTolerance.SuccessTTLCeiling {
547+
if duration > 0 && duration < r.Config.FaultTolerance.SuccessTTL {
548548
return duration
549549
}
550550
} else {
551551
log.FromContext(ctx).Info("Malformed successTTL annotation", "annotation", userPeriod, "error", err)
552552
}
553553
}
554-
return r.Config.FaultTolerance.SuccessTTLCeiling
554+
return r.Config.FaultTolerance.SuccessTTL
555555
}
556556

557557
func clearCondition(aw *workloadv1beta2.AppWrapper, condition workloadv1beta2.AppWrapperCondition, reason string, message string) {

internal/controller/appwrapper/appwrapper_controller_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ var _ = Describe("AppWrapper Controller", func() {
5858
}
5959
awConfig := config.NewAppWrapperConfig()
6060
awConfig.FaultTolerance.FailureGracePeriod = 0 * time.Second
61-
awConfig.FaultTolerance.ResetPause = 0 * time.Second
61+
awConfig.FaultTolerance.RetryPausePeriod = 0 * time.Second
6262
awConfig.FaultTolerance.RetryLimit = 0
6363
awReconciler = &AppWrapperReconciler{
6464
Client: k8sClient,

internal/controller/appwrapper/resource_management.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ func (r *AppWrapperReconciler) deleteComponents(ctx context.Context, aw *workloa
202202
remaining++ // no error deleting resource, resource therefore still exists
203203
}
204204

205-
deletionGracePeriod := r.deletionGraceDuration(ctx, aw)
205+
deletionGracePeriod := r.forcefulDeletionGraceDuration(ctx, aw)
206206
whenInitiated := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.DeletingResources)).LastTransitionTime
207207
gracePeriodExpired := time.Now().After(whenInitiated.Time.Add(deletionGracePeriod))
208208

pkg/config/config.go

+30-30
Original file line numberDiff line numberDiff line change
@@ -37,14 +37,14 @@ type AppWrapperConfig struct {
3737
}
3838

3939
type FaultToleranceConfig struct {
40-
AdmissionGracePeriod time.Duration `json:"admissionGracePeriod,omitempty"`
41-
WarmupGracePeriod time.Duration `json:"warmupGracePeriod,omitempty"`
42-
FailureGracePeriod time.Duration `json:"failureGracePeriod,omitempty"`
43-
ResetPause time.Duration `json:"resetPause,omitempty"`
44-
RetryLimit int32 `json:"retryLimit,omitempty"`
45-
DeletionGracePeriod time.Duration `json:"deletionGracePeriod,omitempty"`
46-
GracePeriodCeiling time.Duration `json:"gracePeriodCeiling,omitempty"`
47-
SuccessTTLCeiling time.Duration `json:"successTTLCeiling,omitempty"`
40+
AdmissionGracePeriod time.Duration `json:"admissionGracePeriod,omitempty"`
41+
WarmupGracePeriod time.Duration `json:"warmupGracePeriod,omitempty"`
42+
FailureGracePeriod time.Duration `json:"failureGracePeriod,omitempty"`
43+
RetryPausePeriod time.Duration `json:"resetPause,omitempty"`
44+
RetryLimit int32 `json:"retryLimit,omitempty"`
45+
ForcefulDeletionGracePeriod time.Duration `json:"deletionGracePeriod,omitempty"`
46+
GracePeriodMaximum time.Duration `json:"gracePeriodCeiling,omitempty"`
47+
SuccessTTL time.Duration `json:"successTTLCeiling,omitempty"`
4848
}
4949

5050
type CertManagementConfig struct {
@@ -82,41 +82,41 @@ func NewAppWrapperConfig() *AppWrapperConfig {
8282
DisableChildAdmissionCtrl: false,
8383
UserRBACAdmissionCheck: true,
8484
FaultTolerance: &FaultToleranceConfig{
85-
AdmissionGracePeriod: 1 * time.Minute,
86-
WarmupGracePeriod: 5 * time.Minute,
87-
FailureGracePeriod: 1 * time.Minute,
88-
ResetPause: 90 * time.Second,
89-
RetryLimit: 3,
90-
DeletionGracePeriod: 10 * time.Minute,
91-
GracePeriodCeiling: 24 * time.Hour,
92-
SuccessTTLCeiling: 7 * 24 * time.Hour,
85+
AdmissionGracePeriod: 1 * time.Minute,
86+
WarmupGracePeriod: 5 * time.Minute,
87+
FailureGracePeriod: 1 * time.Minute,
88+
RetryPausePeriod: 90 * time.Second,
89+
RetryLimit: 3,
90+
ForcefulDeletionGracePeriod: 10 * time.Minute,
91+
GracePeriodMaximum: 24 * time.Hour,
92+
SuccessTTL: 7 * 24 * time.Hour,
9393
},
9494
}
9595
}
9696

9797
func ValidateAppWrapperConfig(config *AppWrapperConfig) error {
98-
if config.FaultTolerance.DeletionGracePeriod > config.FaultTolerance.GracePeriodCeiling {
99-
return fmt.Errorf("DelectionGracePeriod %v exceeds GracePeriodCeiling %v",
100-
config.FaultTolerance.DeletionGracePeriod, config.FaultTolerance.GracePeriodCeiling)
98+
if config.FaultTolerance.ForcefulDeletionGracePeriod > config.FaultTolerance.GracePeriodMaximum {
99+
return fmt.Errorf("ForcefulDeletionGracePeriod %v exceeds GracePeriodCeiling %v",
100+
config.FaultTolerance.ForcefulDeletionGracePeriod, config.FaultTolerance.GracePeriodMaximum)
101101
}
102-
if config.FaultTolerance.ResetPause > config.FaultTolerance.GracePeriodCeiling {
103-
return fmt.Errorf("ResetPause %v exceeds GracePeriodCeiling %v",
104-
config.FaultTolerance.ResetPause, config.FaultTolerance.GracePeriodCeiling)
102+
if config.FaultTolerance.RetryPausePeriod > config.FaultTolerance.GracePeriodMaximum {
103+
return fmt.Errorf("RetryPausePeriod %v exceeds GracePeriodCeiling %v",
104+
config.FaultTolerance.RetryPausePeriod, config.FaultTolerance.GracePeriodMaximum)
105105
}
106-
if config.FaultTolerance.FailureGracePeriod > config.FaultTolerance.GracePeriodCeiling {
106+
if config.FaultTolerance.FailureGracePeriod > config.FaultTolerance.GracePeriodMaximum {
107107
return fmt.Errorf("FailureGracePeriod %v exceeds GracePeriodCeiling %v",
108-
config.FaultTolerance.FailureGracePeriod, config.FaultTolerance.GracePeriodCeiling)
108+
config.FaultTolerance.FailureGracePeriod, config.FaultTolerance.GracePeriodMaximum)
109109
}
110-
if config.FaultTolerance.AdmissionGracePeriod > config.FaultTolerance.GracePeriodCeiling {
110+
if config.FaultTolerance.AdmissionGracePeriod > config.FaultTolerance.GracePeriodMaximum {
111111
return fmt.Errorf("AdmissionGracePeriod %v exceeds GracePeriodCeiling %v",
112-
config.FaultTolerance.AdmissionGracePeriod, config.FaultTolerance.GracePeriodCeiling)
112+
config.FaultTolerance.AdmissionGracePeriod, config.FaultTolerance.GracePeriodMaximum)
113113
}
114-
if config.FaultTolerance.WarmupGracePeriod > config.FaultTolerance.GracePeriodCeiling {
114+
if config.FaultTolerance.WarmupGracePeriod > config.FaultTolerance.GracePeriodMaximum {
115115
return fmt.Errorf("WarmupGracePeriod %v exceeds GracePeriodCeiling %v",
116-
config.FaultTolerance.WarmupGracePeriod, config.FaultTolerance.GracePeriodCeiling)
116+
config.FaultTolerance.WarmupGracePeriod, config.FaultTolerance.GracePeriodMaximum)
117117
}
118-
if config.FaultTolerance.SuccessTTLCeiling <= 0 {
119-
return fmt.Errorf("SuccessTTLCeiling %v is not a positive duration", config.FaultTolerance.SuccessTTLCeiling)
118+
if config.FaultTolerance.SuccessTTL <= 0 {
119+
return fmt.Errorf("SuccessTTL %v is not a positive duration", config.FaultTolerance.SuccessTTL)
120120
}
121121

122122
return nil

samples/wrapped-failing-job.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ metadata:
66
kueue.x-k8s.io/queue-name: user-queue
77
annotations:
88
workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 10s
9-
workload.codeflare.dev.appwrapper/resetPauseDuration: 10s
9+
workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 10s
1010
workload.codeflare.dev.appwrapper/retryLimit: "3"
1111
spec:
1212
components:

samples/wrapped-failing-pod.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@ metadata:
66
kueue.x-k8s.io/queue-name: user-queue
77
annotations:
88
workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 10s
9-
workload.codeflare.dev.appwrapper/resetPauseDuration: 10s
9+
workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 10s
1010
workload.codeflare.dev.appwrapper/retryLimit: "1"
11-
workload.codeflare.dev.appwrapper/debuggingFailureDeletionDelayDuration: "5m"
11+
workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: "5m"
1212
spec:
1313
components:
1414
- template:

0 commit comments

Comments
 (0)