
Commit 981eafc

implement forceful deletion (#71)
1 parent 0db706e commit 981eafc

File tree

10 files changed: +260 −24 lines changed

README.md (+11)

```diff
@@ -15,6 +15,17 @@ AppWrapper operator ensures that when Kueue admits an AppWrapper for
 execution, all of the necessary information will be propagated
 to cause the child's Kueue-enabled operator to admit it as well.
 
+AppWrappers are also designed to harden workloads by providing an
+additional level of automatic fault detection and recovery. The AppWrapper
+controller monitors the health of the workload and if corrective actions
+are not taken by the primary resource controllers within specified deadlines,
+the AppWrapper controller will orchestrate workload-level retries and
+resource deletion to ensure that either the workload returns to a
+healthy state or is cleanly removed from the cluster and its quota
+freed for use by other workloads. For details on customizing and
+configuring these fault tolerance capabilities, please see
+[fault_tolerance.md](docs/fault_tolerance.md).
+
 ## Description
 
 Kueue has a well-developed pattern for Kueue-enabling a Custom
```

api/v1beta2/appwrapper_types.go (+7 −4)

```diff
@@ -95,13 +95,16 @@ const (
 	ResourcesDeployed AppWrapperCondition = "ResourcesDeployed"
 	PodsReady         AppWrapperCondition = "PodsReady"
 	Unhealthy         AppWrapperCondition = "Unhealthy"
+	DeletingResources AppWrapperCondition = "DeletingResources"
 )
 
 const (
-	WarmupGracePeriodDurationAnnotation  = "workload.codeflare.dev.appwrapper/warmupGracePeriodDuration"
-	FailureGracePeriodDurationAnnotation = "workload.codeflare.dev.appwrapper/failureGracePeriodDuration"
-	ResetPauseDurationAnnotation         = "workload.codeflare.dev.appwrapper/resetPauseDuration"
-	RetryLimitAnnotation                 = "workload.codeflare.dev.appwrapper/retryLimit"
+	WarmupGracePeriodDurationAnnotation             = "workload.codeflare.dev.appwrapper/warmupGracePeriodDuration"
+	FailureGracePeriodDurationAnnotation            = "workload.codeflare.dev.appwrapper/failureGracePeriodDuration"
+	ResetPauseDurationAnnotation                    = "workload.codeflare.dev.appwrapper/resetPauseDuration"
+	RetryLimitAnnotation                            = "workload.codeflare.dev.appwrapper/retryLimit"
+	DeletionGracePeriodAnnotation                   = "workload.codeflare.dev.appwrapper/deletionGracePeriodDuration"
+	DebuggingFailureDeletionDelayDurationAnnotation = "workload.codeflare.dev.appwrapper/debuggingFailureDeletionDelayDuration"
 )
 
 //+kubebuilder:object:root=true
```

cmd/standalone/main.go (+5)

```diff
@@ -83,6 +83,11 @@ func main() {
 	setupLog.Info("Build info", "version", BuildVersion, "date", BuildDate)
 	setupLog.Info("Configuration", "config", awConfig)
 
+	if err := config.ValidateConfig(awConfig); err != nil {
+		setupLog.Error(err, "invalid appwrapper config")
+		os.Exit(1)
+	}
+
 	// if the enable-http2 flag is false (the default), http/2 should be disabled
 	// due to its vulnerabilities. More specifically, disabling http/2 will
 	// prevent from being vulnerable to the HTTP/2 Stream Cancelation and
```

cmd/unified/main.go (+5)

```diff
@@ -84,6 +84,11 @@ func main() {
 	setupLog.Info("Build info", "version", BuildVersion, "date", BuildDate)
 	setupLog.Info("Configuration", "config", awConfig)
 
+	if err := config.ValidateConfig(awConfig); err != nil {
+		setupLog.Error(err, "invalid appwrapper config")
+		os.Exit(1)
+	}
+
 	// if the enable-http2 flag is false (the default), http/2 should be disabled
 	// due to its vulnerabilities. More specifically, disabling http/2 will
 	// prevent from being vulnerable to the HTTP/2 Stream Cancelation and
```
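Both binaries now validate the fault-tolerance configuration with `config.ValidateConfig` before starting the manager. The commit does not include `pkg/config` itself, so the sketch below only illustrates the kind of check such a validator might perform; the duration field names mirror the `r.Config.FaultTolerance` fields used by the reconciler later in this commit, while the struct name, function name, and error wording are assumptions.

```go
package configcheck // illustrative only; not the real pkg/config

import (
	"fmt"
	"time"
)

// FaultToleranceConfig mirrors the fields the reconciler reads from
// r.Config.FaultTolerance; the struct and field grouping here are assumed.
type FaultToleranceConfig struct {
	WarmupGracePeriod   time.Duration
	FailureGracePeriod  time.Duration
	ResetPause          time.Duration
	DeletionGracePeriod time.Duration
	GracePeriodCeiling  time.Duration
	RetryLimit          int32
}

// validateFaultTolerance rejects configurations whose grace periods are
// negative or exceed the GracePeriodCeiling.
func validateFaultTolerance(ft FaultToleranceConfig) error {
	periods := map[string]time.Duration{
		"warmupGracePeriod":   ft.WarmupGracePeriod,
		"failureGracePeriod":  ft.FailureGracePeriod,
		"resetPause":          ft.ResetPause,
		"deletionGracePeriod": ft.DeletionGracePeriod,
	}
	for name, d := range periods {
		if d < 0 {
			return fmt.Errorf("%s must not be negative", name)
		}
		if d > ft.GracePeriodCeiling {
			return fmt.Errorf("%s (%v) exceeds gracePeriodCeiling (%v)", name, d, ft.GracePeriodCeiling)
		}
	}
	return nil
}
```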

docs/fault_tolerance.md (new file, +71)

## Fault Tolerance

### Overall Design

The `podSets` contained in the AppWrapper specification enable the AppWrapper
controller to inject labels into every `Pod` that is created by
the workload during its execution. Throughout the execution of the
workload, the AppWrapper controller monitors the number and health of
all labeled `Pods` and uses this information to determine if a
workload is unhealthy. A workload can be deemed *unhealthy* either
because it contains a non-zero number of `Failed` pods or because,
after the `WarmupGracePeriod` has passed, it has fewer
`Running` and `Completed` pods than expected.
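A minimal sketch of this health test is shown below; the function name, its arguments, and the exact counting rules are illustrative assumptions rather than the controller's actual `workloadStatus` logic (which this commit shows only in part).

```go
package health // illustrative package name

import corev1 "k8s.io/api/core/v1"

// isWorkloadUnhealthy approximates the rule described above: a workload is
// unhealthy if any labeled pod has Failed, or if (once the WarmupGracePeriod
// has expired) fewer pods are Running or Succeeded (shown as "Completed" by
// kubectl) than the podSets expect.
func isWorkloadUnhealthy(pods []corev1.Pod, expectedPods int, warmupExpired bool) bool {
	failed, healthy := 0, 0
	for _, pod := range pods {
		switch pod.Status.Phase {
		case corev1.PodFailed:
			failed++
		case corev1.PodRunning, corev1.PodSucceeded:
			healthy++
		}
	}
	return failed > 0 || (warmupExpired && healthy < expectedPods)
}
```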
If a workload is determined to be unhealthy, the AppWrapper controller
first waits for a `FailureGracePeriod` to allow the primary resource
controller an opportunity to react and return the workload to a
healthy state. If the `FailureGracePeriod` expires, the AppWrapper
controller will *reset* the workload by deleting its resources, waiting
for a `ResetPause`, and then creating new instances of the resources.
During this reset period, the AppWrapper **does not** release the workload's
quota; this ensures that when the resources are recreated they will still
have sufficient quota to execute. The number of times an AppWrapper is reset
is tracked as part of its status; if the number of resets exceeds the `RetryLimit`,
then the AppWrapper moves into a `Failed` state and its resources are deleted
(thus finally releasing its quota). If at any time during this retry loop
an AppWrapper is suspended (i.e., Kueue decides to preempt the AppWrapper),
the AppWrapper controller will respect this request by proceeding to delete
the resources.

When the AppWrapper controller decides to delete the resources for a workload,
it proceeds through several phases. First it does a normal delete of the
resources, allowing the primary resource controllers time to cascade the deletion
through all child resources. During a `DeletionGracePeriod`, the AppWrapper controller
monitors to see if the primary controllers have managed to successfully delete
all of the workload's Pods and resources. If they fail to accomplish this within
the `DeletionGracePeriod`, the AppWrapper controller then initiates a *forceful*
deletion of all remaining Pods and resources by deleting them with a `GracePeriod` of `0`.
An AppWrapper will continue to have its `ResourcesDeployed` condition be
`True` until all resources and Pods are successfully deleted.

This process ensures that when `ResourcesDeployed` becomes `False`, which
indicates to Kueue that the quota has been released, all resources created by
a failed workload will have been totally removed from the cluster.

### Configuration Details

The parameters of the retry loop described above are configured at the operator level
and can be customized on a per-AppWrapper basis by adding annotations.
The table below lists the parameters, their default values, and the annotation that
can be used to customize them.

| Parameter           | Default Value | Annotation                                                     |
|---------------------|---------------|----------------------------------------------------------------|
| WarmupGracePeriod   | 5 Minutes     | workload.codeflare.dev.appwrapper/warmupGracePeriodDuration    |
| FailureGracePeriod  | 1 Minute      | workload.codeflare.dev.appwrapper/failureGracePeriodDuration   |
| ResetPause          | 90 Seconds    | workload.codeflare.dev.appwrapper/resetPauseDuration           |
| RetryLimit          | 3             | workload.codeflare.dev.appwrapper/retryLimit                   |
| DeletionGracePeriod | 10 Minutes    | workload.codeflare.dev.appwrapper/deletionGracePeriodDuration  |
| GracePeriodCeiling  | 24 Hours      | Not Applicable                                                 |

The `GracePeriodCeiling` imposes an upper limit on the other grace periods to
reduce the impact of user-added annotations on overall system utilization.

To support debugging `Failed` workloads, an additional annotation
`workload.codeflare.dev.appwrapper/debuggingFailureDeletionDelayDuration` can
be added to an AppWrapper when it is created to add a delay between the time the
AppWrapper enters the `Failed` state and when the process of deleting its resources
begins. Since the AppWrapper continues to consume quota during this delayed deletion period,
this annotation should be used sparingly and only when interactive debugging of
the failed workload is being actively pursued.
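For illustration, the snippet below sets two of these annotations on an AppWrapper object using the constants added in `api/v1beta2/appwrapper_types.go`; the object name and the chosen values are hypothetical, and durations use Go `time.ParseDuration` syntax (for example `30s`, `2m`, `1h`).

```go
package example // illustrative only

import (
	workloadv1beta2 "github.com/project-codeflare/appwrapper/api/v1beta2"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// sampleAppWrapper overrides the failure grace period and retry limit for one
// workload; every other fault-tolerance parameter falls back to the
// operator-level defaults (subject to the GracePeriodCeiling).
var sampleAppWrapper = &workloadv1beta2.AppWrapper{
	ObjectMeta: metav1.ObjectMeta{
		Name:      "sample-workload", // hypothetical name
		Namespace: "default",
		Annotations: map[string]string{
			workloadv1beta2.FailureGracePeriodDurationAnnotation: "2m",
			workloadv1beta2.RetryLimitAnnotation:                 "5",
		},
	},
	// Spec (components, podSets, suspend flag, ...) elided.
}
```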

docs/state-diagram.md (+1 −1)

```diff
@@ -43,7 +43,7 @@ stateDiagram-v2
     %% Failures
     rs --> f
     rn --> f
-    rn --> rt : Pod Failures
+    rn --> rt : Workload Unhealthy
     rt --> rs
 
     classDef quota fill:lightblue
```

internal/controller/appwrapper/appwrapper_controller.go (+65 −7)

```diff
@@ -326,15 +326,41 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 		return r.updateStatus(ctx, aw, workloadv1beta2.AppWrapperResuming)
 
 	case workloadv1beta2.AppWrapperFailed:
+		// Support for debugging failed jobs.
+		// When an appwrapper is annotated with a non-zero debugging delay,
+		// we hold quota for the delay period and do not delete the resources of
+		// a failed appwrapper unless Kueue preempts it by setting Suspend to true.
+		deletionDelay := r.debuggingFailureDeletionDelay(ctx, aw)
+
+		if deletionDelay > 0 && !aw.Spec.Suspend {
+			meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
+				Type:    string(workloadv1beta2.DeletingResources),
+				Status:  metav1.ConditionFalse,
+				Reason:  "DeletionPaused",
+				Message: fmt.Sprintf("%v has value %v", workloadv1beta2.DebuggingFailureDeletionDelayDurationAnnotation, deletionDelay),
+			})
+			whenDelayed := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.DeletingResources)).LastTransitionTime
+
+			now := time.Now()
+			deadline := whenDelayed.Add(deletionDelay)
+			if now.Before(deadline) {
+				return ctrl.Result{RequeueAfter: deadline.Sub(now)}, r.Status().Update(ctx, aw)
+			}
+		}
+
 		if meta.IsStatusConditionTrue(aw.Status.Conditions, string(workloadv1beta2.ResourcesDeployed)) {
 			if !r.deleteComponents(ctx, aw) {
 				return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
 			}
+			msg := "Resources deleted for failed AppWrapper"
+			if deletionDelay > 0 && aw.Spec.Suspend {
+				msg = "Kueue forced resource deletion by suspending AppWrapper"
+			}
 			meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
 				Type:    string(workloadv1beta2.ResourcesDeployed),
 				Status:  metav1.ConditionFalse,
 				Reason:  string(workloadv1beta2.AppWrapperFailed),
-				Message: "Resources deleted for failed AppWrapper",
+				Message: msg,
 			})
 		}
 		meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
@@ -393,26 +419,36 @@ func (r *AppWrapperReconciler) workloadStatus(ctx context.Context, aw *workloadv
 	return summary, nil
 }
 
+func (r *AppWrapperReconciler) limitDuration(desired time.Duration) time.Duration {
+	if desired < 0 {
+		return 0 * time.Second
+	} else if desired > r.Config.FaultTolerance.GracePeriodCeiling {
+		return r.Config.FaultTolerance.GracePeriodCeiling
+	} else {
+		return desired
+	}
+}
+
 func (r *AppWrapperReconciler) warmupGraceDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
 	if userPeriod, ok := aw.Annotations[workloadv1beta2.WarmupGracePeriodDurationAnnotation]; ok {
 		if duration, err := time.ParseDuration(userPeriod); err == nil {
-			return duration
+			return r.limitDuration(duration)
 		} else {
 			log.FromContext(ctx).Info("Malformed warmup period annotation", "annotation", userPeriod, "error", err)
 		}
 	}
-	return r.Config.FaultTolerance.WarmupGracePeriod
+	return r.limitDuration(r.Config.FaultTolerance.WarmupGracePeriod)
 }
 
 func (r *AppWrapperReconciler) failureGraceDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
 	if userPeriod, ok := aw.Annotations[workloadv1beta2.FailureGracePeriodDurationAnnotation]; ok {
 		if duration, err := time.ParseDuration(userPeriod); err == nil {
-			return duration
+			return r.limitDuration(duration)
 		} else {
 			log.FromContext(ctx).Info("Malformed grace period annotation", "annotation", userPeriod, "error", err)
 		}
 	}
-	return r.Config.FaultTolerance.FailureGracePeriod
+	return r.limitDuration(r.Config.FaultTolerance.FailureGracePeriod)
 }
 
 func (r *AppWrapperReconciler) retryLimit(ctx context.Context, aw *workloadv1beta2.AppWrapper) int32 {
@@ -429,12 +465,34 @@ func (r *AppWrapperReconciler) retryLimit(ctx context.Context, aw *workloadv1bet
 func (r *AppWrapperReconciler) resettingPauseDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
 	if userPeriod, ok := aw.Annotations[workloadv1beta2.ResetPauseDurationAnnotation]; ok {
 		if duration, err := time.ParseDuration(userPeriod); err == nil {
-			return duration
+			return r.limitDuration(duration)
 		} else {
 			log.FromContext(ctx).Info("Malformed reset pause annotation", "annotation", userPeriod, "error", err)
 		}
 	}
-	return r.Config.FaultTolerance.ResetPause
+	return r.limitDuration(r.Config.FaultTolerance.ResetPause)
+}
+
+func (r *AppWrapperReconciler) deletionGraceDuration(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
+	if userPeriod, ok := aw.Annotations[workloadv1beta2.DeletionGracePeriodAnnotation]; ok {
+		if duration, err := time.ParseDuration(userPeriod); err == nil {
+			return r.limitDuration(duration)
+		} else {
+			log.FromContext(ctx).Info("Malformed deletion period annotation", "annotation", userPeriod, "error", err)
+		}
+	}
+	return r.limitDuration(r.Config.FaultTolerance.DeletionGracePeriod)
+}
+
+func (r *AppWrapperReconciler) debuggingFailureDeletionDelay(ctx context.Context, aw *workloadv1beta2.AppWrapper) time.Duration {
+	if userPeriod, ok := aw.Annotations[workloadv1beta2.DebuggingFailureDeletionDelayDurationAnnotation]; ok {
+		if duration, err := time.ParseDuration(userPeriod); err == nil {
+			return r.limitDuration(duration)
+		} else {
+			log.FromContext(ctx).Info("Malformed delay deletion annotation", "annotation", userPeriod, "error", err)
+		}
+	}
+	return 0 * time.Second
 }
 
 func clearCondition(aw *workloadv1beta2.AppWrapper, condition workloadv1beta2.AppWrapperCondition, reason string, message string) {
```

internal/controller/appwrapper/resource_management.go (+56 −2)

```diff
@@ -19,9 +19,11 @@ package appwrapper
 import (
 	"context"
 	"fmt"
+	"time"
 
 	workloadv1beta2 "github.com/project-codeflare/appwrapper/api/v1beta2"
 	"github.com/project-codeflare/appwrapper/pkg/utils"
+	v1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -164,7 +166,11 @@ func (r *AppWrapperReconciler) createComponents(ctx context.Context, aw *workloa
 }
 
 func (r *AppWrapperReconciler) deleteComponents(ctx context.Context, aw *workloadv1beta2.AppWrapper) bool {
-	// TODO forceful deletion: See https://github.com/project-codeflare/appwrapper/issues/36
+	meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
+		Type:   string(workloadv1beta2.DeletingResources),
+		Status: metav1.ConditionTrue,
+		Reason: "DeletionInitiated",
+	})
 	log := log.FromContext(ctx)
 	remaining := 0
 	for _, component := range aw.Spec.Components {
@@ -181,5 +187,53 @@ func (r *AppWrapperReconciler) deleteComponents(ctx context.Context, aw *workloa
 		}
 		remaining++ // no error deleting resource, resource therefore still exists
 	}
-	return remaining == 0
+
+	deletionGracePeriod := r.deletionGraceDuration(ctx, aw)
+	whenInitiated := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.DeletingResources)).LastTransitionTime
+	gracePeriodExpired := time.Now().After(whenInitiated.Time.Add(deletionGracePeriod))
+
+	if remaining > 0 && !gracePeriodExpired {
+		// Resources left and deadline hasn't expired, just requeue the deletion
+		return false
+	}
+
+	pods := &v1.PodList{Items: []v1.Pod{}}
+	if err := r.List(ctx, pods,
+		client.UnsafeDisableDeepCopy,
+		client.InNamespace(aw.Namespace),
+		client.MatchingLabels{AppWrapperLabel: aw.Name}); err != nil {
+		log.Error(err, "Pod list error")
+	}
+
+	if remaining == 0 && len(pods.Items) == 0 {
+		// no resources or pods left; deletion is complete
+		clearCondition(aw, workloadv1beta2.DeletingResources, "DeletionComplete", "")
+		return true
+	}
+
+	if gracePeriodExpired {
+		if len(pods.Items) > 0 {
+			// force deletion of pods first
+			for _, pod := range pods.Items {
+				if err := r.Delete(ctx, &pod, client.GracePeriodSeconds(0)); err != nil {
+					log.Error(err, "Forceful pod deletion error")
+				}
+			}
+		} else {
+			// force deletion of wrapped resources once pods are gone
+			for _, component := range aw.Spec.Components {
+				obj, err := parseComponent(aw, component.Template.Raw)
+				if err != nil {
+					log.Error(err, "Parsing error")
+					continue
+				}
+				if err := r.Delete(ctx, obj, client.GracePeriodSeconds(0)); err != nil && !apierrors.IsNotFound(err) {
+					log.Error(err, "Forceful deletion error")
+				}
+			}
+		}
+	}
+
+	// requeue deletion
+	return false
 }
```
