Skip to content

Commit 335b734

Browse files
authored
implement shorter warmup grace period on pods being admitted (#127)
1 parent 6689c66 commit 335b734

File tree

4 files changed

+58
-31
lines changed

4 files changed

+58
-31
lines changed

api/v1beta2/appwrapper_types.go

+1
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ const (
133133
)
134134

135135
const (
136+
AdmissionGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/admissionGracePeriodDuration"
136137
WarmupGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/warmupGracePeriodDuration"
137138
FailureGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/failureGracePeriodDuration"
138139
ResetPauseDurationAnnotation = "workload.codeflare.dev.appwrapper/resetPauseDuration"

internal/controller/appwrapper/appwrapper_controller.go

+18-2
Original file line numberDiff line numberDiff line change
@@ -261,8 +261,13 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
261261
podDetailsMessage := fmt.Sprintf("%v pods pending; %v pods running; %v pods succeeded", podStatus.pending, podStatus.running, podStatus.succeeded)
262262
clearCondition(aw, workloadv1beta2.PodsReady, "InsufficientPodsReady", podDetailsMessage)
263263
whenDeployed := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.ResourcesDeployed)).LastTransitionTime
264-
warmupDuration := r.warmupGraceDuration(ctx, aw)
265-
if time.Now().Before(whenDeployed.Add(warmupDuration)) {
264+
var graceDuration time.Duration
265+
if podStatus.pending+podStatus.running+podStatus.succeeded >= podStatus.expected {
266+
graceDuration = r.warmupGraceDuration(ctx, aw)
267+
} else {
268+
graceDuration = r.admissionGraceDuration(ctx, aw)
269+
}
270+
if time.Now().Before(whenDeployed.Add(graceDuration)) {
266271
return ctrl.Result{RequeueAfter: 5 * time.Second}, r.Status().Update(ctx, aw)
267272
} else {
268273
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
@@ -459,6 +464,17 @@ func (r *AppWrapperReconciler) limitDuration(desired time.Duration) time.Duratio
459464
}
460465
}
461466

467+
func (r *AppWrapperReconciler) admissionGraceDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
468+
if userPeriod, ok := aw.Annotations[workloadv1beta2.AdmissionGracePeriodDurationAnnotation]; ok {
469+
if duration, err := time.ParseDuration(userPeriod); err == nil {
470+
return r.limitDuration(duration)
471+
} else {
472+
log.FromContext(ctx).Info("Malformed warmup period annotation", "annotation", userPeriod, "error", err)
473+
}
474+
}
475+
return r.limitDuration(r.Config.FaultTolerance.AdmissionGracePeriod)
476+
}
477+
462478
func (r *AppWrapperReconciler) warmupGraceDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
463479
if userPeriod, ok := aw.Annotations[workloadv1beta2.WarmupGracePeriodDurationAnnotation]; ok {
464480
if duration, err := time.ParseDuration(userPeriod); err == nil {

pkg/config/config.go

+20-14
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,14 @@ type AppWrapperConfig struct {
3737
}
3838

3939
type FaultToleranceConfig struct {
40-
WarmupGracePeriod time.Duration `json:"warmupGracePeriod,omitempty"`
41-
FailureGracePeriod time.Duration `json:"failureGracePeriod,omitempty"`
42-
ResetPause time.Duration `json:"resetPause,omitempty"`
43-
RetryLimit int32 `json:"retryLimit,omitempty"`
44-
DeletionGracePeriod time.Duration `json:"deletionGracePeriod,omitempty"`
45-
GracePeriodCeiling time.Duration `json:"gracePeriodCeiling,omitempty"`
46-
SuccessTTLCeiling time.Duration `json:"successTTLCeiling,omitempty"`
40+
AdmissionGracePeriod time.Duration `json:"admissionGracePeriod,omitempty"`
41+
WarmupGracePeriod time.Duration `json:"warmupGracePeriod,omitempty"`
42+
FailureGracePeriod time.Duration `json:"failureGracePeriod,omitempty"`
43+
ResetPause time.Duration `json:"resetPause,omitempty"`
44+
RetryLimit int32 `json:"retryLimit,omitempty"`
45+
DeletionGracePeriod time.Duration `json:"deletionGracePeriod,omitempty"`
46+
GracePeriodCeiling time.Duration `json:"gracePeriodCeiling,omitempty"`
47+
SuccessTTLCeiling time.Duration `json:"successTTLCeiling,omitempty"`
4748
}
4849

4950
type CertManagementConfig struct {
@@ -81,13 +82,14 @@ func NewAppWrapperConfig() *AppWrapperConfig {
8182
DisableChildAdmissionCtrl: false,
8283
UserRBACAdmissionCheck: true,
8384
FaultTolerance: &FaultToleranceConfig{
84-
WarmupGracePeriod: 5 * time.Minute,
85-
FailureGracePeriod: 1 * time.Minute,
86-
ResetPause: 90 * time.Second,
87-
RetryLimit: 3,
88-
DeletionGracePeriod: 10 * time.Minute,
89-
GracePeriodCeiling: 24 * time.Hour,
90-
SuccessTTLCeiling: 7 * 24 * time.Hour,
85+
AdmissionGracePeriod: 1 * time.Minute,
86+
WarmupGracePeriod: 5 * time.Minute,
87+
FailureGracePeriod: 1 * time.Minute,
88+
ResetPause: 90 * time.Second,
89+
RetryLimit: 3,
90+
DeletionGracePeriod: 10 * time.Minute,
91+
GracePeriodCeiling: 24 * time.Hour,
92+
SuccessTTLCeiling: 7 * 24 * time.Hour,
9193
},
9294
}
9395
}
@@ -105,6 +107,10 @@ func ValidateAppWrapperConfig(config *AppWrapperConfig) error {
105107
return fmt.Errorf("FailureGracePeriod %v exceeds GracePeriodCeiling %v",
106108
config.FaultTolerance.FailureGracePeriod, config.FaultTolerance.GracePeriodCeiling)
107109
}
110+
if config.FaultTolerance.AdmissionGracePeriod > config.FaultTolerance.GracePeriodCeiling {
111+
return fmt.Errorf("AdmissionGracePeriod %v exceeds GracePeriodCeiling %v",
112+
config.FaultTolerance.AdmissionGracePeriod, config.FaultTolerance.GracePeriodCeiling)
113+
}
108114
if config.FaultTolerance.WarmupGracePeriod > config.FaultTolerance.GracePeriodCeiling {
109115
return fmt.Errorf("WarmupGracePeriod %v exceeds GracePeriodCeiling %v",
110116
config.FaultTolerance.WarmupGracePeriod, config.FaultTolerance.GracePeriodCeiling)

site/_pages/arch-fault-tolerance.md

+19-15
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,17 @@ classes: wide
77
### Overall Design
88

99
The `podSets` contained in the AppWrapper specification enable the AppWrapper
10-
controller to inject labels into every `Pod` that is created by
10+
controller to inject labels into every Pod that is created by
1111
the workload during its execution. Throughout the execution of the
1212
workload, the AppWrapper controller monitors the number and health of
13-
all labeled `Pods` and uses this information to determine if a
14-
workload is unhealthy. A workload can be deemed *unhealthy* either
15-
because it contains a non-zero number of `Failed` pods or because
16-
after the `WarmupGracePeriod` has passed and it has fewer
17-
`Running` and `Completed` pods than expected.
13+
all labeled Pods and uses this information to determine if a
14+
workload is unhealthy. A workload can be deemed *unhealthy* if any of
15+
the following conditions are true:
16+
+ There is a non-zero number of `Failed` Pods.
17+
+ It takes longer than the `AdmissionGracePeriod` for the expected
18+
number of Pods to at least reach the `Pending` state.
19+
+ It takes longer than the `WarmupGracePeriod` for the expected
20+
number of Pods to at least reach the `Running` state.
1821

1922
If a workload is determined to be unhealthy, the AppWrapper controller
2023
first waits for a `FailureGracePeriod` to allow the primary resource
@@ -54,15 +57,16 @@ and can be customized on a per-AppWrapper basis by adding annotations.
5457
The table below lists the parameters, gives their default, and the annotation that
5558
can be used to customize them.
5659

57-
| Parameter | Default Value | Annotation |
58-
|---------------------|---------------|---------------------------------------------------------------|
59-
| WarmupGracePeriod | 5 Minutes | workload.codeflare.dev.appwrapper/warmupGracePeriodDuration |
60-
| FailureGracePeriod | 1 Minute | workload.codeflare.dev.appwrapper/failureGracePeriodDuration |
61-
| ResetPause | 90 Seconds | workload.codeflare.dev.appwrapper/resetPauseDuration |
62-
| RetryLimit | 3 | workload.codeflare.dev.appwrapper/retryLimit |
63-
| DeletionGracePeriod | 10 Minutes | workload.codeflare.dev.appwrapper/deletionGracePeriodDuration |
64-
| GracePeriodCeiling | 24 Hours | Not Applicable |
65-
| SuccessTTLCeiling | 7 Days | workload.codeflare.dev.appwrapper/successTTLDuration |
60+
| Parameter | Default Value | Annotation |
61+
|------------------------|---------------|------------------------------------------------------------------|
62+
| AdmissionGracePeriod | 1 Minute | workload.codeflare.dev.appwrapper/admissionGracePeriodDuration |
63+
| WarmupGracePeriod | 5 Minutes | workload.codeflare.dev.appwrapper/warmupGracePeriodDuration |
64+
| FailureGracePeriod | 1 Minute | workload.codeflare.dev.appwrapper/failureGracePeriodDuration |
65+
| ResetPause | 90 Seconds | workload.codeflare.dev.appwrapper/resetPauseDuration |
66+
| RetryLimit | 3 | workload.codeflare.dev.appwrapper/retryLimit |
67+
| DeletionGracePeriod | 10 Minutes | workload.codeflare.dev.appwrapper/deletionGracePeriodDuration |
68+
| GracePeriodCeiling | 24 Hours | Not Applicable |
69+
| SuccessTTLCeiling | 7 Days | workload.codeflare.dev.appwrapper/successTTLDuration |
6670

6771

6872
The `GracePeriodCeiling` imposes an upper limit on the other grace periods to

0 commit comments

Comments
 (0)