@@ -19,6 +19,7 @@ package appwrapper
19
19
import (
20
20
"context"
21
21
"fmt"
22
+ "strconv"
22
23
"time"
23
24
24
25
v1 "k8s.io/api/core/v1"
@@ -168,6 +169,18 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
168
169
Reason : string (workloadv1beta2 .AppWrapperResuming ),
169
170
Message : "Suspend is false" ,
170
171
})
172
+ meta .SetStatusCondition (& aw .Status .Conditions , metav1.Condition {
173
+ Type : string (workloadv1beta2 .PodsReady ),
174
+ Status : metav1 .ConditionFalse ,
175
+ Reason : string (workloadv1beta2 .AppWrapperResuming ),
176
+ Message : "Suspend is false" ,
177
+ })
178
+ meta .SetStatusCondition (& aw .Status .Conditions , metav1.Condition {
179
+ Type : string (workloadv1beta2 .Unhealthy ),
180
+ Status : metav1 .ConditionFalse ,
181
+ Reason : string (workloadv1beta2 .AppWrapperResuming ),
182
+ Message : "Suspend is false" ,
183
+ })
171
184
return r .updateStatus (ctx , aw , workloadv1beta2 .AppWrapperResuming )
172
185
173
186
case workloadv1beta2 .AppWrapperResuming : // deploying components
@@ -176,16 +189,17 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
176
189
}
177
190
err , fatal := r .createComponents (ctx , aw )
178
191
if err != nil {
192
+ meta .SetStatusCondition (& aw .Status .Conditions , metav1.Condition {
193
+ Type : string (workloadv1beta2 .Unhealthy ),
194
+ Status : metav1 .ConditionTrue ,
195
+ Reason : "CreateFailed" ,
196
+ Message : fmt .Sprintf ("error creating components: %v" , err ),
197
+ })
179
198
if fatal {
180
- meta .SetStatusCondition (& aw .Status .Conditions , metav1.Condition {
181
- Type : string (workloadv1beta2 .PodsReady ),
182
- Status : metav1 .ConditionFalse ,
183
- Reason : "CreateFailed" ,
184
- Message : fmt .Sprintf ("fatal error creating components: %v" , err ),
185
- })
186
199
return r .updateStatus (ctx , aw , workloadv1beta2 .AppWrapperFailed ) // abort on fatal error
200
+ } else {
201
+ return r .resetOrFail (ctx , aw )
187
202
}
188
- return ctrl.Result {}, err // retry creation on transient error
189
203
}
190
204
return r .updateStatus (ctx , aw , workloadv1beta2 .AppWrapperRunning )
191
205
@@ -197,6 +211,8 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
197
211
if err != nil {
198
212
return ctrl.Result {}, err
199
213
}
214
+
215
+ // Handle Success
200
216
if podStatus .succeeded >= podStatus .expected && (podStatus .pending + podStatus .running + podStatus .failed == 0 ) {
201
217
meta .SetStatusCondition (& aw .Status .Conditions , metav1.Condition {
202
218
Type : string (workloadv1beta2 .QuotaReserved ),
@@ -206,16 +222,30 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
206
222
})
207
223
return r .updateStatus (ctx , aw , workloadv1beta2 .AppWrapperSucceeded )
208
224
}
225
+
226
+ // Handle Failed Pods
209
227
if podStatus .failed > 0 {
210
228
meta .SetStatusCondition (& aw .Status .Conditions , metav1.Condition {
211
- Type : string (workloadv1beta2 .PodsReady ),
212
- Status : metav1 .ConditionFalse ,
213
- Reason : "PodsFailed" ,
214
- Message : fmt .Sprintf ("%v pods failed (%v pods pending; %v pods running; %v pods succeeded)" ,
215
- podStatus .failed , podStatus .pending , podStatus .running , podStatus .succeeded ),
229
+ Type : string (workloadv1beta2 .Unhealthy ),
230
+ Status : metav1 .ConditionTrue ,
231
+ Reason : "FoundFailedPods" ,
232
+ // Intentionally no detailed message with failed pod count, since changing the message resets the transition time
216
233
})
217
- return r .updateStatus (ctx , aw , workloadv1beta2 .AppWrapperFailed )
234
+
235
+ // Grace period to give the resource controller a chance to correct the failure
236
+ whenDetected := meta .FindStatusCondition (aw .Status .Conditions , string (workloadv1beta2 .Unhealthy )).LastTransitionTime
237
+ gracePeriod := r .failureGraceDuration (ctx , aw )
238
+ now := time .Now ()
239
+ deadline := whenDetected .Add (gracePeriod )
240
+ if now .Before (deadline ) {
241
+ return ctrl.Result {RequeueAfter : deadline .Sub (now )}, r .Status ().Update (ctx , aw )
242
+ } else {
243
+ return r .resetOrFail (ctx , aw )
244
+ }
218
245
}
246
+
247
+ clearCondition (aw , workloadv1beta2 .Unhealthy , "FoundNoFailedPods" , "" )
248
+
219
249
if podStatus .running + podStatus .succeeded >= podStatus .expected {
220
250
meta .SetStatusCondition (& aw .Status .Conditions , metav1.Condition {
221
251
Type : string (workloadv1beta2 .PodsReady ),
@@ -225,14 +255,23 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
225
255
})
226
256
return ctrl.Result {RequeueAfter : time .Minute }, r .Status ().Update (ctx , aw )
227
257
}
228
- meta .SetStatusCondition (& aw .Status .Conditions , metav1.Condition {
229
- Type : string (workloadv1beta2 .PodsReady ),
230
- Status : metav1 .ConditionFalse ,
231
- Reason : "InsufficientPodsReady" ,
232
- Message : fmt .Sprintf ("%v pods pending; %v pods running; %v pods succeeded" ,
233
- podStatus .pending , podStatus .running , podStatus .succeeded ),
234
- })
235
- return ctrl.Result {RequeueAfter : 5 * time .Second }, r .Status ().Update (ctx , aw )
258
+
259
+ // Not ready yet; either continue to wait or give up if the warmup period has expired
260
+ podDetailsMessage := fmt .Sprintf ("%v pods pending; %v pods running; %v pods succeeded" , podStatus .pending , podStatus .running , podStatus .succeeded )
261
+ clearCondition (aw , workloadv1beta2 .PodsReady , "InsufficientPodsReady" , podDetailsMessage )
262
+ whenDeployed := meta .FindStatusCondition (aw .Status .Conditions , string (workloadv1beta2 .ResourcesDeployed )).LastTransitionTime
263
+ warmupDuration := r .warmupGraceDuration (ctx , aw )
264
+ if time .Now ().Before (whenDeployed .Add (warmupDuration )) {
265
+ return ctrl.Result {RequeueAfter : 5 * time .Second }, r .Status ().Update (ctx , aw )
266
+ } else {
267
+ meta .SetStatusCondition (& aw .Status .Conditions , metav1.Condition {
268
+ Type : string (workloadv1beta2 .Unhealthy ),
269
+ Status : metav1 .ConditionTrue ,
270
+ Reason : "InsufficientPodsReady" ,
271
+ Message : podDetailsMessage ,
272
+ })
273
+ return r .resetOrFail (ctx , aw )
274
+ }
236
275
237
276
case workloadv1beta2 .AppWrapperSuspending : // undeploying components
238
277
// finish undeploying components irrespective of desired state (suspend bit)
@@ -253,8 +292,45 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
253
292
Reason : string (workloadv1beta2 .AppWrapperSuspended ),
254
293
Message : "Suspend is true" ,
255
294
})
295
+ clearCondition (aw , workloadv1beta2 .PodsReady , string (workloadv1beta2 .AppWrapperSuspended ), "" )
296
+ clearCondition (aw , workloadv1beta2 .Unhealthy , string (workloadv1beta2 .AppWrapperSuspended ), "" )
256
297
return r .updateStatus (ctx , aw , workloadv1beta2 .AppWrapperSuspended )
257
298
299
+ case workloadv1beta2 .AppWrapperResetting :
300
+ if aw .Spec .Suspend {
301
+ return r .updateStatus (ctx , aw , workloadv1beta2 .AppWrapperSuspending ) // Suspending trumps Resetting
302
+ }
303
+
304
+ clearCondition (aw , workloadv1beta2 .PodsReady , string (workloadv1beta2 .AppWrapperResetting ), "" )
305
+ if meta .IsStatusConditionTrue (aw .Status .Conditions , string (workloadv1beta2 .ResourcesDeployed )) {
306
+ if ! r .deleteComponents (ctx , aw ) {
307
+ return ctrl.Result {RequeueAfter : 5 * time .Second }, nil
308
+ }
309
+ meta .SetStatusCondition (& aw .Status .Conditions , metav1.Condition {
310
+ Type : string (workloadv1beta2 .ResourcesDeployed ),
311
+ Status : metav1 .ConditionFalse ,
312
+ Reason : string (workloadv1beta2 .AppWrapperResetting ),
313
+ Message : "Resources deleted for resetting AppWrapper" ,
314
+ })
315
+ }
316
+
317
+ // Pause before transitioning to Resuming to heuristically allow transient system problems to subside
318
+ whenReset := meta .FindStatusCondition (aw .Status .Conditions , string (workloadv1beta2 .Unhealthy )).LastTransitionTime
319
+ pauseDuration := r .resettingPauseDuration (ctx , aw )
320
+ now := time .Now ()
321
+ deadline := whenReset .Add (pauseDuration )
322
+ if now .Before (deadline ) {
323
+ return ctrl.Result {RequeueAfter : deadline .Sub (now )}, r .Status ().Update (ctx , aw )
324
+ }
325
+
326
+ meta .SetStatusCondition (& aw .Status .Conditions , metav1.Condition {
327
+ Type : string (workloadv1beta2 .ResourcesDeployed ),
328
+ Status : metav1 .ConditionTrue ,
329
+ Reason : string (workloadv1beta2 .AppWrapperResuming ),
330
+ Message : "Reset complete; resuming" ,
331
+ })
332
+ return r .updateStatus (ctx , aw , workloadv1beta2 .AppWrapperResuming )
333
+
258
334
case workloadv1beta2 .AppWrapperFailed :
259
335
if meta .IsStatusConditionTrue (aw .Status .Conditions , string (workloadv1beta2 .ResourcesDeployed )) {
260
336
if ! r .deleteComponents (ctx , aw ) {
@@ -449,6 +525,16 @@ func (r *AppWrapperReconciler) updateStatus(ctx context.Context, aw *workloadv1b
449
525
return ctrl.Result {}, nil
450
526
}
451
527
528
+ func (r * AppWrapperReconciler ) resetOrFail (ctx context.Context , aw * workloadv1beta2.AppWrapper ) (ctrl.Result , error ) {
529
+ maxRetries := r .retryLimit (ctx , aw )
530
+ if aw .Status .Retries < maxRetries {
531
+ aw .Status .Retries += 1
532
+ return r .updateStatus (ctx , aw , workloadv1beta2 .AppWrapperResetting )
533
+ } else {
534
+ return r .updateStatus (ctx , aw , workloadv1beta2 .AppWrapperFailed )
535
+ }
536
+ }
537
+
452
538
func (r * AppWrapperReconciler ) workloadStatus (ctx context.Context , aw * workloadv1beta2.AppWrapper ) (* podStatusSummary , error ) {
453
539
pods := & v1.PodList {}
454
540
if err := r .List (ctx , pods ,
@@ -474,6 +560,61 @@ func (r *AppWrapperReconciler) workloadStatus(ctx context.Context, aw *workloadv
474
560
return summary , nil
475
561
}
476
562
563
+ func (r * AppWrapperReconciler ) warmupGraceDuration (ctx context.Context , aw * workloadv1beta2.AppWrapper ) time.Duration {
564
+ if userPeriod , ok := aw .Annotations [workloadv1beta2 .WarmupGracePeriodDurationAnnotation ]; ok {
565
+ if duration , err := time .ParseDuration (userPeriod ); err == nil {
566
+ return duration
567
+ } else {
568
+ log .FromContext (ctx ).Info ("Malformed warmup period annotation" , "annotation" , userPeriod , "error" , err )
569
+ }
570
+ }
571
+ return r .Config .WarmupGracePeriod
572
+ }
573
+
574
+ func (r * AppWrapperReconciler ) failureGraceDuration (ctx context.Context , aw * workloadv1beta2.AppWrapper ) time.Duration {
575
+ if userPeriod , ok := aw .Annotations [workloadv1beta2 .FailureGracePeriodDurationAnnotation ]; ok {
576
+ if duration , err := time .ParseDuration (userPeriod ); err == nil {
577
+ return duration
578
+ } else {
579
+ log .FromContext (ctx ).Info ("Malformed grace period annotation" , "annotation" , userPeriod , "error" , err )
580
+ }
581
+ }
582
+ return r .Config .FailureGracePeriod
583
+ }
584
+
585
+ func (r * AppWrapperReconciler ) retryLimit (ctx context.Context , aw * workloadv1beta2.AppWrapper ) int32 {
586
+ if userLimit , ok := aw .Annotations [workloadv1beta2 .RetryLimitAnnotation ]; ok {
587
+ if limit , err := strconv .Atoi (userLimit ); err == nil {
588
+ return int32 (limit )
589
+ } else {
590
+ log .FromContext (ctx ).Info ("Malformed retry limit annotation" , "annotation" , userLimit , "error" , err )
591
+ }
592
+ }
593
+ return r .Config .RetryLimit
594
+ }
595
+
596
+ func (r * AppWrapperReconciler ) resettingPauseDuration (ctx context.Context , aw * workloadv1beta2.AppWrapper ) time.Duration {
597
+ if userPeriod , ok := aw .Annotations [workloadv1beta2 .ResetPauseDurationAnnotation ]; ok {
598
+ if duration , err := time .ParseDuration (userPeriod ); err == nil {
599
+ return duration
600
+ } else {
601
+ log .FromContext (ctx ).Info ("Malformed reset pause annotation" , "annotation" , userPeriod , "error" , err )
602
+ }
603
+ }
604
+ return r .Config .ResetPause
605
+ }
606
+
607
+ func clearCondition (aw * workloadv1beta2.AppWrapper , condition workloadv1beta2.AppWrapperCondition , reason string , message string ) {
608
+ if meta .IsStatusConditionTrue (aw .Status .Conditions , string (condition )) {
609
+ meta .SetStatusCondition (& aw .Status .Conditions , metav1.Condition {
610
+ Type : string (condition ),
611
+ Status : metav1 .ConditionFalse ,
612
+ Reason : reason ,
613
+ Message : message ,
614
+ })
615
+ }
616
+ }
617
+
477
618
// SetupWithManager sets up the controller with the Manager.
478
619
func (r * AppWrapperReconciler ) SetupWithManager (mgr ctrl.Manager ) error {
479
620
return ctrl .NewControllerManagedBy (mgr ).
0 commit comments