Skip to content

Commit 27c372a

Browse files
Updating error status on NIMcache object for reconcile failure (#104)
* Updating error status on NIMcache object for reconcile failure --------- Signed-off-by: Vishesh Tanksale <[email protected]>
1 parent 0af0135 commit 27c372a

File tree

4 files changed

+136
-55
lines changed

4 files changed

+136
-55
lines changed

api/apps/v1alpha1/nimcache_types.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,8 @@ const (
172172
NimCacheConditionJobPending = "NIM_CACHE_JOB_PENDING"
173173
// NimCacheConditionPVCCreated indicates that the caching pvc is created.
174174
NimCacheConditionPVCCreated = "NIM_CACHE_PVC_CREATED"
175+
// NimCacheConditionReconcileFailed indicates that an error occurred while reconciling the NIMCache object.
176+
NimCacheConditionReconcileFailed = "NIM_CACHE_RECONCILE_FAILED"
175177

176178
// NimCacheStatusNotReady indicates that cache is not ready
177179
NimCacheStatusNotReady = "NotReady"

internal/conditions/conditions.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,3 +148,17 @@ func UpdateCondition(conditions *[]metav1.Condition, conditionType string, statu
148148
})
149149
// condition updated
150150
}
151+
152+
func IfPresentUpdateCondition(conditions *[]metav1.Condition, conditionType string, status metav1.ConditionStatus, reason, message string) {
153+
for i := range *conditions {
154+
if (*conditions)[i].Type == conditionType {
155+
// existing condition
156+
(*conditions)[i].Status = status
157+
(*conditions)[i].LastTransitionTime = metav1.Now()
158+
(*conditions)[i].Reason = reason
159+
(*conditions)[i].Message = message
160+
// condition updated
161+
return
162+
}
163+
}
164+
}

internal/controller/nimcache_controller.go

Lines changed: 31 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,6 @@ func (r *NIMCacheReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
125125
}
126126
return ctrl.Result{}, client.IgnoreNotFound(err)
127127
}
128-
129128
logger.Info("Reconciling", "NIMCache", nimCache.Name)
130129

131130
// Check if the instance is marked for deletion
@@ -153,14 +152,19 @@ func (r *NIMCacheReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
153152
return ctrl.Result{}, nil
154153
}
155154
}
156-
157155
// Handle nim-cache reconciliation
158156
result, err := r.reconcileNIMCache(ctx, nimCache)
159157
if err != nil {
160158
logger.Error(err, "error reconciling NIMCache", "name", nimCache.Name)
159+
conditions.UpdateCondition(&nimCache.Status.Conditions, appsv1alpha1.NimCacheConditionReconcileFailed, metav1.ConditionTrue, "ReconcileFailed", err.Error())
160+
nimCache.Status.State = appsv1alpha1.NimCacheStatusNotReady
161+
errUpdate := r.updateNIMCacheStatus(ctx, nimCache)
162+
if errUpdate != nil {
163+
logger.Error(err, "Failed to update NIMCache status", "NIMCache", nimCache.Name)
164+
return result, errUpdate
165+
}
161166
return result, err
162167
}
163-
164168
return result, nil
165169
}
166170

@@ -444,10 +448,6 @@ func (r *NIMCacheReconciler) reconcilePVC(ctx context.Context, nimCache *appsv1a
444448

445449
conditions.UpdateCondition(&nimCache.Status.Conditions, appsv1alpha1.NimCacheConditionPVCCreated, metav1.ConditionTrue, "PVCCreated", "The PVC has been created for caching NIM model")
446450
nimCache.Status.State = appsv1alpha1.NimCacheStatusPVCCreated
447-
if err := r.Status().Update(ctx, nimCache); err != nil {
448-
logger.Error(err, "Failed to update status", "NIMCache", nimCache.Name)
449-
return err
450-
}
451451
} else {
452452
logger.Error(err, "PVC doesn't exist and auto-creation is not enabled", "name", pvcNamespacedName)
453453
return err
@@ -621,10 +621,6 @@ func (r *NIMCacheReconciler) reconcileModelSelection(ctx context.Context, nimCac
621621
}
622622

623623
nimCache.Annotations[SelectedNIMProfilesAnnotationKey] = string(profilesJSON)
624-
if err := r.Update(ctx, nimCache); err != nil {
625-
logger.Error(err, "unable to update NIMCache with selected profiles annotation")
626-
return err
627-
}
628624
}
629625
return nil
630626
}
@@ -659,10 +655,6 @@ func (r *NIMCacheReconciler) reconcileJob(ctx context.Context, nimCache *appsv1a
659655
conditions.UpdateCondition(&nimCache.Status.Conditions, appsv1alpha1.NimCacheConditionJobCreated, metav1.ConditionTrue, "JobCreated", "The Job to cache NIM has been created")
660656
nimCache.Status.State = appsv1alpha1.NimCacheStatusStarted
661657
nimCache.Status.Profiles = []v1alpha1.NIMProfile{}
662-
if err := r.Status().Update(ctx, nimCache); err != nil {
663-
return err
664-
}
665-
// return to reconcile later on job status update
666658
return nil
667659
}
668660

@@ -713,39 +705,24 @@ func (r *NIMCacheReconciler) reconcileJobStatus(ctx context.Context, nimCache *a
713705
}
714706
}
715707

716-
if err := r.Status().Update(ctx, nimCache); err != nil {
717-
return fmt.Errorf("failed to update status: %w", err)
718-
}
719-
720708
case job.Status.Failed > 0 && nimCache.Status.State != appsv1alpha1.NimCacheStatusFailed:
721709
logger.Info("Failed to cache NIM, job failed", "job", jobName)
722710
conditions.UpdateCondition(&nimCache.Status.Conditions, appsv1alpha1.NimCacheConditionJobCompleted, metav1.ConditionFalse, "JobFailed", "The Job to cache NIM has failed")
723711
nimCache.Status.State = appsv1alpha1.NimCacheStatusFailed
724712
nimCache.Status.Profiles = []v1alpha1.NIMProfile{}
725713

726-
if err := r.Status().Update(ctx, nimCache); err != nil {
727-
return fmt.Errorf("failed to update status: %w", err)
728-
}
729-
730714
case job.Status.Active > 0 && nimCache.Status.State != appsv1alpha1.NimCacheStatusInProgress:
731715
logger.Info("Caching NIM is in progress, job running", "job", jobName)
732716
conditions.UpdateCondition(&nimCache.Status.Conditions, appsv1alpha1.NimCacheConditionJobPending, metav1.ConditionFalse, "JobRunning", "The Job to cache NIM is in progress")
733717
nimCache.Status.State = appsv1alpha1.NimCacheStatusInProgress
734718
nimCache.Status.Profiles = []v1alpha1.NIMProfile{}
735719

736-
if err := r.Status().Update(ctx, nimCache); err != nil {
737-
return fmt.Errorf("failed to update status: %w", err)
738-
}
739-
740720
case job.Status.Active == 0 && nimCache.Status.State != appsv1alpha1.NimCacheStatusReady && nimCache.Status.State != appsv1alpha1.NimCacheStatusPending:
741721
logger.Info("Caching NIM is in progress, job pending", "job", jobName)
742722
conditions.UpdateCondition(&nimCache.Status.Conditions, appsv1alpha1.NimCacheConditionJobPending, metav1.ConditionTrue, "JobPending", "The Job to cache NIM is in pending state")
743723
nimCache.Status.State = appsv1alpha1.NimCacheStatusPending
744724
nimCache.Status.Profiles = []v1alpha1.NIMProfile{}
745725

746-
if err := r.Status().Update(ctx, nimCache); err != nil {
747-
return fmt.Errorf("failed to update status: %w", err)
748-
}
749726
}
750727

751728
return nil
@@ -815,9 +792,33 @@ func (r *NIMCacheReconciler) reconcileNIMCache(ctx context.Context, nimCache *ap
815792
logger.Error(err, "reconciliation of caching job failed", "job", getJobName(nimCache))
816793
return ctrl.Result{}, err
817794
}
795+
796+
conditions.IfPresentUpdateCondition(&nimCache.Status.Conditions, appsv1alpha1.NimCacheConditionReconcileFailed, metav1.ConditionFalse, "Reconciled", "")
797+
798+
err = r.updateNIMCacheStatus(ctx, nimCache)
799+
if err != nil {
800+
logger.Error(err, "Failed to update NIMCache status", "NIMCache", nimCache.Name)
801+
return ctrl.Result{}, err
802+
}
818803
return ctrl.Result{}, nil
819804
}
820805

806+
func (r *NIMCacheReconciler) updateNIMCacheStatus(ctx context.Context, nimCache *appsv1alpha1.NIMCache) error {
807+
logger := r.GetLogger()
808+
obj := &appsv1alpha1.NIMCache{}
809+
errGet := r.Get(ctx, types.NamespacedName{Name: nimCache.Name, Namespace: nimCache.GetNamespace()}, obj)
810+
if errGet != nil {
811+
logger.Error(errGet, "error getting NIMCache", "name", nimCache.Name)
812+
return errGet
813+
}
814+
obj.Status = nimCache.Status
815+
if err := r.Status().Update(ctx, obj); err != nil {
816+
logger.Error(err, "Failed to update status", "NIMCache", nimCache.Name)
817+
return err
818+
}
819+
return nil
820+
}
821+
821822
func getJobName(nimCache *appsv1alpha1.NIMCache) string {
822823
return fmt.Sprintf("%s-job", nimCache.GetName())
823824
}

0 commit comments

Comments
 (0)