Skip to content

Commit bdcd8ee

Browse files
authored
Increase fault tolerance during resuming phase (#157)
* break resource creation into microsteps * tolerate non-fatal errors during resuming phase during the grace period
1 parent 296ae7e commit bdcd8ee

File tree

2 files changed

+47
-25
lines changed

2 files changed

+47
-25
lines changed

internal/controller/appwrapper/appwrapper_controller.go

+9-1
Original file line numberDiff line numberDiff line change
@@ -196,14 +196,22 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
196196
}
197197
err, fatal := r.createComponents(ctx, aw)
198198
if err != nil {
199+
if !fatal {
200+
startTime := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.ResourcesDeployed)).LastTransitionTime
201+
graceDuration := r.admissionGraceDuration(ctx, aw)
202+
if time.Now().Before(startTime.Add(graceDuration)) {
203+
// be patient; non-fatal error; requeue and keep trying
204+
return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
205+
}
206+
}
199207
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
200208
Type: string(workloadv1beta2.Unhealthy),
201209
Status: metav1.ConditionTrue,
202210
Reason: "CreateFailed",
203211
Message: fmt.Sprintf("error creating components: %v", err),
204212
})
205213
if fatal {
206-
return r.updateStatus(ctx, aw, workloadv1beta2.AppWrapperFailed) // abort on fatal error
214+
return r.updateStatus(ctx, aw, workloadv1beta2.AppWrapperFailed) // always move to failed on fatal error
207215
} else {
208216
return r.resetOrFail(ctx, aw)
209217
}

internal/controller/appwrapper/resource_management.go

+38-24
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ func parseComponent(raw []byte, expectedNamespace string) (*unstructured.Unstruc
5252
}
5353

5454
//gocyclo:ignore
55-
func (r *AppWrapperReconciler) createComponent(ctx context.Context, aw *workloadv1beta2.AppWrapper, componentIdx int) (*unstructured.Unstructured, error, bool) {
55+
func (r *AppWrapperReconciler) createComponent(ctx context.Context, aw *workloadv1beta2.AppWrapper, componentIdx int) (error, bool) {
5656
component := aw.Spec.Components[componentIdx]
5757
componentStatus := aw.Status.ComponentStatus[componentIdx]
5858
toMap := func(x interface{}) map[string]string {
@@ -80,7 +80,7 @@ func (r *AppWrapperReconciler) createComponent(ctx context.Context, aw *workload
8080

8181
obj, err := parseComponent(component.Template.Raw, aw.Namespace)
8282
if err != nil {
83-
return nil, err, true
83+
return err, true
8484
}
8585
if r.Config.EnableKueueIntegrations && !r.Config.DisableChildAdmissionCtrl {
8686
obj.SetLabels(utilmaps.MergeKeepFirst(obj.GetLabels(), map[string]string{AppWrapperLabel: aw.Name, constants.QueueLabel: childJobQueueName}))
@@ -95,13 +95,13 @@ func (r *AppWrapperReconciler) createComponent(ctx context.Context, aw *workload
9595
if podSetsIdx < len(component.PodSetInfos) {
9696
toInject = &component.PodSetInfos[podSetsIdx]
9797
} else {
98-
return nil, fmt.Errorf("missing podSetInfo %v for component %v", podSetsIdx, componentIdx), true
98+
return fmt.Errorf("missing podSetInfo %v for component %v", podSetsIdx, componentIdx), true
9999
}
100100
}
101101

102102
p, err := utils.GetRawTemplate(obj.UnstructuredContent(), podSet.Path)
103103
if err != nil {
104-
return nil, err, true // Should not happen, path validity is enforced by validateAppWrapperInvariants
104+
return err, true // Should not happen, path validity is enforced by validateAppWrapperInvariants
105105
}
106106
if md, ok := p["metadata"]; !ok || md == nil {
107107
p["metadata"] = make(map[string]interface{})
@@ -113,7 +113,7 @@ func (r *AppWrapperReconciler) createComponent(ctx context.Context, aw *workload
113113
if len(toInject.Annotations) > 0 {
114114
existing := toMap(metadata["annotations"])
115115
if err := utilmaps.HaveConflict(existing, toInject.Annotations); err != nil {
116-
return nil, podset.BadPodSetsUpdateError("annotations", err), true
116+
return podset.BadPodSetsUpdateError("annotations", err), true
117117
}
118118
metadata["annotations"] = utilmaps.MergeKeepFirst(existing, toInject.Annotations)
119119
}
@@ -122,15 +122,15 @@ func (r *AppWrapperReconciler) createComponent(ctx context.Context, aw *workload
122122
mergedLabels := utilmaps.MergeKeepFirst(toInject.Labels, awLabels)
123123
existing := toMap(metadata["labels"])
124124
if err := utilmaps.HaveConflict(existing, mergedLabels); err != nil {
125-
return nil, podset.BadPodSetsUpdateError("labels", err), true
125+
return podset.BadPodSetsUpdateError("labels", err), true
126126
}
127127
metadata["labels"] = utilmaps.MergeKeepFirst(existing, mergedLabels)
128128

129129
// NodeSelectors
130130
if len(toInject.NodeSelector) > 0 {
131131
existing := toMap(spec["nodeSelector"])
132132
if err := utilmaps.HaveConflict(existing, toInject.NodeSelector); err != nil {
133-
return nil, podset.BadPodSetsUpdateError("nodeSelector", err), true
133+
return podset.BadPodSetsUpdateError("nodeSelector", err), true
134134
}
135135
spec["nodeSelector"] = utilmaps.MergeKeepFirst(existing, toInject.NodeSelector)
136136
}
@@ -156,43 +156,57 @@ func (r *AppWrapperReconciler) createComponent(ctx context.Context, aw *workload
156156
}
157157

158158
if err := controllerutil.SetControllerReference(aw, obj, r.Scheme); err != nil {
159-
return nil, err, true
159+
return err, true
160+
}
161+
162+
if meta.FindStatusCondition(aw.Status.ComponentStatus[componentIdx].Conditions, string(workloadv1beta2.ResourcesDeployed)) == nil {
163+
aw.Status.ComponentStatus[componentIdx].Name = obj.GetName()
164+
aw.Status.ComponentStatus[componentIdx].Kind = obj.GetKind()
165+
aw.Status.ComponentStatus[componentIdx].APIVersion = obj.GetAPIVersion()
166+
meta.SetStatusCondition(&aw.Status.ComponentStatus[componentIdx].Conditions, metav1.Condition{
167+
Type: string(workloadv1beta2.ResourcesDeployed),
168+
Status: metav1.ConditionUnknown,
169+
Reason: "ComponentCreationInitiated",
170+
})
171+
if err := r.Status().Update(ctx, aw); err != nil {
172+
return err, false
173+
}
160174
}
161175

162176
if err := r.Create(ctx, obj); err != nil {
163177
if apierrors.IsAlreadyExists(err) {
164178
// obj is not updated if Create returns an error; Get required for accurate information
165179
if err := r.Get(ctx, client.ObjectKeyFromObject(obj), obj); err != nil {
166-
return nil, err, false
180+
return err, false
167181
}
168182
ctrlRef := metav1.GetControllerOf(obj)
169183
if ctrlRef == nil || ctrlRef.Name != aw.Name {
170-
return nil, fmt.Errorf("resource %v exists, but is not controlled by appwrapper", obj.GetName()), true
184+
return fmt.Errorf("resource %v exists, but is not controlled by appwrapper", obj.GetName()), true
171185
}
172-
return obj, nil, false // ok -- already exists and the correct appwrapper owns it.
186+
// fall through. This is not actually an error. The object already exists and the correct appwrapper owns it.
173187
} else {
174-
return nil, err, meta.IsNoMatchError(err) || apierrors.IsInvalid(err) // fatal
188+
return err, meta.IsNoMatchError(err) || apierrors.IsInvalid(err) // fatal
175189
}
176190
}
177191

178-
return obj, nil, false
192+
meta.SetStatusCondition(&aw.Status.ComponentStatus[componentIdx].Conditions, metav1.Condition{
193+
Type: string(workloadv1beta2.ResourcesDeployed),
194+
Status: metav1.ConditionTrue,
195+
Reason: "ComponentCreatedSuccessfully",
196+
})
197+
if err := r.Status().Update(ctx, aw); err != nil {
198+
return err, false
199+
}
200+
201+
return nil, false
179202
}
180203

181204
func (r *AppWrapperReconciler) createComponents(ctx context.Context, aw *workloadv1beta2.AppWrapper) (error, bool) {
182205
for componentIdx := range aw.Spec.Components {
183206
if !meta.IsStatusConditionTrue(aw.Status.ComponentStatus[componentIdx].Conditions, string(workloadv1beta2.ResourcesDeployed)) {
184-
obj, err, fatal := r.createComponent(ctx, aw, componentIdx)
185-
if err != nil {
207+
if err, fatal := r.createComponent(ctx, aw, componentIdx); err != nil {
186208
return err, fatal
187209
}
188-
aw.Status.ComponentStatus[componentIdx].Name = obj.GetName()
189-
aw.Status.ComponentStatus[componentIdx].Kind = obj.GetKind()
190-
aw.Status.ComponentStatus[componentIdx].APIVersion = obj.GetAPIVersion()
191-
meta.SetStatusCondition(&aw.Status.ComponentStatus[componentIdx].Conditions, metav1.Condition{
192-
Type: string(workloadv1beta2.ResourcesDeployed),
193-
Status: metav1.ConditionTrue,
194-
Reason: "CompononetCreated",
195-
})
196210
}
197211
}
198212
return nil, false
@@ -201,7 +215,7 @@ func (r *AppWrapperReconciler) createComponents(ctx context.Context, aw *workloa
201215
func (r *AppWrapperReconciler) deleteComponents(ctx context.Context, aw *workloadv1beta2.AppWrapper) bool {
202216
deleteIfPresent := func(idx int, opts ...client.DeleteOption) bool {
203217
cs := &aw.Status.ComponentStatus[idx]
204-
if !meta.IsStatusConditionTrue(cs.Conditions, string(workloadv1beta2.ResourcesDeployed)) {
218+
if rd := meta.FindStatusCondition(cs.Conditions, string(workloadv1beta2.ResourcesDeployed)); rd == nil || rd.Status == metav1.ConditionFalse {
205219
return false // not present
206220
}
207221
obj := &metav1.PartialObjectMetadata{

0 commit comments

Comments
 (0)