From f1b52ba15a0538855c01b780a87bef751a49a75d Mon Sep 17 00:00:00 2001 From: "Shiva Krishna, Merla" Date: Mon, 19 Aug 2024 21:58:56 -0700 Subject: [PATCH 1/3] Extract NIM model manifest in all cases for model caching Signed-off-by: Shiva Krishna, Merla --- internal/controller/nimcache_controller.go | 153 +++++++++++------- .../controller/nimcache_controller_test.go | 25 +++ 2 files changed, 117 insertions(+), 61 deletions(-) diff --git a/internal/controller/nimcache_controller.go b/internal/controller/nimcache_controller.go index fe45a115..650e2683 100644 --- a/internal/controller/nimcache_controller.go +++ b/internal/controller/nimcache_controller.go @@ -495,73 +495,107 @@ func getSelectedProfiles(nimCache *appsv1alpha1.NIMCache) ([]string, error) { return nil, nil } -func (r *NIMCacheReconciler) reconcileModelSelection(ctx context.Context, nimCache *appsv1alpha1.NIMCache) (requeue bool, err error) { +func (r *NIMCacheReconciler) reconcileModelManifest(ctx context.Context, nimCache *appsv1alpha1.NIMCache) (requeue bool, err error) { logger := r.GetLogger() - // reconcile model selection pod - if isModelSelectionRequired(nimCache) && !isModelSelectionDone(nimCache) { - // Create a temporary pod for parsing model manifest - pod := constructPodSpec(nimCache) - // Add nimCache as owner for watching on status change - if err := controllerutil.SetControllerReference(nimCache, pod, r.GetScheme()); err != nil { - return false, err - } - err := r.createPod(ctx, pod) - if err != nil { - logger.Error(err, "failed to create", "pod", pod.Name) - return false, err - } + existingConfig := &corev1.ConfigMap{} + cmName := getManifestConfigName(nimCache) + err = r.Get(ctx, client.ObjectKey{Name: cmName, Namespace: nimCache.Namespace}, existingConfig) + if err != nil && client.IgnoreNotFound(err) != nil { + logger.Error(err, "failed to get configmap of the model manifest", "name", cmName) + return false, err + } - existingPod := &corev1.Pod{} - err = r.Get(ctx, client.ObjectKey{Name: pod.Name, Namespace: nimCache.Namespace}, existingPod) - if err != nil { - logger.Error(err, "failed to get pod for model selection", "pod", pod.Name) - return false, err - } + // No action if the configmap is already created + if err == nil { + return false, nil + } - if existingPod.Status.Phase != corev1.PodRunning { - // requeue request with delay until the pod is ready - return true, nil - } + // Create a configmap by extracting the model manifest + // Create a temporary pod for parsing model manifest + pod := constructPodSpec(nimCache) + // Add nimCache as owner for watching on status change + if err := controllerutil.SetControllerReference(nimCache, pod, r.GetScheme()); err != nil { + return false, err + } + err = r.createPod(ctx, pod) + if err != nil { + logger.Error(err, "failed to create", "pod", pod.Name) + return false, err + } - // Extract manifest file - output, err := r.getPodLogs(ctx, existingPod) - if err != nil { - logger.Error(err, "failed to get pod logs for parsing model manifest file", "pod", pod.Name) - return false, err - } + existingPod := &corev1.Pod{} + err = r.Get(ctx, client.ObjectKey{Name: pod.Name, Namespace: nimCache.Namespace}, existingPod) + if err != nil { + logger.Error(err, "failed to get pod for model selection", "pod", pod.Name) + return false, err + } - // Parse the file - manifest, err := nimparser.ParseModelManifestFromRawOutput([]byte(output)) - if err != nil { - logger.Error(err, "Failed to parse model manifest from the pod") - return false, err - } - logger.V(2).Info("manifest file", "nimcache", nimCache.Name, "manifest", manifest) + if existingPod.Status.Phase != corev1.PodRunning { + // requeue request with delay until the pod is ready + return true, nil + } - // Create a ConfigMap with the model manifest file for re-use - err = r.createManifestConfigMap(ctx, nimCache, manifest) - if err != nil { - logger.Error(err, "Failed to create model manifest config map") - return false, err - } + // Extract manifest file + output, err := r.getPodLogs(ctx, existingPod) + if err != nil { + logger.Error(err, "failed to get pod logs for parsing model manifest file", "pod", pod.Name) + return false, err + } + // Parse the file + manifest, err := nimparser.ParseModelManifestFromRawOutput([]byte(output)) + if err != nil { + logger.Error(err, "Failed to parse model manifest from the pod") + return false, err + } + logger.V(2).Info("manifest file", "nimcache", nimCache.Name, "manifest", manifest) + + // Create a ConfigMap with the model manifest file for re-use + err = r.createManifestConfigMap(ctx, nimCache, manifest) + if err != nil { + logger.Error(err, "Failed to create model manifest config map") + return false, err + } + + // Model manifest is successfully extracted, cleanup temporary pod + err = r.Delete(ctx, existingPod) + if err != nil && !errors.IsNotFound(err) { + logger.Error(err, "failed to delete", "pod", pod.Name) + // requeue request with delay until the pod is cleaned up + // this is required as NIM containers are resource heavy + return true, err + } + return false, nil +} + +func (r *NIMCacheReconciler) reconcileModelSelection(ctx context.Context, nimCache *appsv1alpha1.NIMCache) error { + logger := r.GetLogger() + + // reconcile model selection pod + if isModelSelectionRequired(nimCache) && !isModelSelectionDone(nimCache) { var discoveredGPUs []string // If no specific GPUs are provided, then auto-detect GPUs in the cluster for profile selection if len(nimCache.Spec.Source.NGC.Model.GPUs) == 0 { gpusByNode, err := r.GetNodeGPUProducts(ctx) if err != nil { logger.Error(err, "Failed to get gpus in the cluster") - return false, err + return err } discoveredGPUs = getUniqueGPUProducts(gpusByNode) } + // Get the model manifest from the config + nimManifest, err := r.extractNIMManifest(ctx, getManifestConfigName(nimCache), nimCache.GetNamespace()) + if err != nil { + return fmt.Errorf("failed to get model manifest config file: %w", err) + } + // Match profiles with user input - profiles, err := nimparser.MatchProfiles(nimCache.Spec.Source.NGC.Model, *manifest, discoveredGPUs) + profiles, err := nimparser.MatchProfiles(nimCache.Spec.Source.NGC.Model, *nimManifest, discoveredGPUs) if err != nil { logger.Error(err, "Failed to match profiles for given model parameters") - return false, err + return err } // Add the annotation to the NIMCache object @@ -572,25 +606,16 @@ func (r *NIMCacheReconciler) reconcileModelSelection(ctx context.Context, nimCac profilesJSON, err := json.Marshal(profiles) if err != nil { logger.Error(err, "unable to marshal profiles to JSON") - return false, err + return err } nimCache.Annotations[SelectedNIMProfilesAnnotationKey] = string(profilesJSON) if err := r.Update(ctx, nimCache); err != nil { logger.Error(err, "unable to update NIMCache with selected profiles annotation") - return false, err - } - - // Selected profiles updated, cleanup temporary pod - err = r.Delete(ctx, existingPod) - if err != nil && !errors.IsNotFound(err) { - logger.Error(err, "failed to delete", "pod", pod.Name) - // requeue request with delay until the pod is cleaned up - // this is required as NIM containers are resource heavy - return true, err + return err } } - return false, nil + return nil } func (r *NIMCacheReconciler) reconcileJob(ctx context.Context, nimCache *appsv1alpha1.NIMCache) error { @@ -755,10 +780,9 @@ func (r *NIMCacheReconciler) reconcileNIMCache(ctx context.Context, nimCache *ap return ctrl.Result{}, err } - // Reconcile NIM model selection - requeue, err := r.reconcileModelSelection(ctx, nimCache) + requeue, err := r.reconcileModelManifest(ctx, nimCache) if err != nil { - logger.Error(err, "reconciliation of model selection failed", "pod", getPodName(nimCache)) + logger.Error(err, "reconciliation of model manifest failed", "pod", getPodName(nimCache)) return ctrl.Result{}, err } @@ -767,6 +791,13 @@ func (r *NIMCacheReconciler) reconcileNIMCache(ctx context.Context, nimCache *ap return ctrl.Result{RequeueAfter: time.Second * 30}, err } + // Reconcile NIM model selection + err = r.reconcileModelSelection(ctx, nimCache) + if err != nil { + logger.Error(err, "reconciliation of model selection failed", "pod", getPodName(nimCache)) + return ctrl.Result{}, err + } + // Reconcile caching Job err = r.reconcileJob(ctx, nimCache) if err != nil { diff --git a/internal/controller/nimcache_controller_test.go b/internal/controller/nimcache_controller_test.go index bbc28c4f..6a5be768 100644 --- a/internal/controller/nimcache_controller_test.go +++ b/internal/controller/nimcache_controller_test.go @@ -60,6 +60,31 @@ var _ = Describe("NIMCache Controller", func() { Client: client, scheme: scheme, } + + nimCache := &appsv1alpha1.NIMCache{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-nimcache", + Namespace: "default", + }, + Spec: appsv1alpha1.NIMCacheSpec{ + Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret"}}, + }, + } + + // Create a model manifest configmap, as we cannot run a sample NIM container to extract for tests + filePath := filepath.Join("testdata", "manifest_trtllm.yaml") + manifestData, err := nimparser.ParseModelManifest(filePath) + Expect(err).NotTo(HaveOccurred()) + Expect(*manifestData).To(HaveLen(2)) + + err = reconciler.createManifestConfigMap(context.TODO(), nimCache, manifestData) + Expect(err).NotTo(HaveOccurred()) + + // Verify that the ConfigMap was created + createdConfigMap := &corev1.ConfigMap{} + err = client.Get(context.TODO(), types.NamespacedName{Name: getManifestConfigName(nimCache), Namespace: "default"}, createdConfigMap) + Expect(err).NotTo(HaveOccurred()) + Expect(createdConfigMap.Data).To(HaveKey("model_manifest.yaml")) }) AfterEach(func() { From 9a205b3fc315a27c8a90195fd031bf3b21086a2d Mon Sep 17 00:00:00 2001 From: "Shiva Krishna, Merla" Date: Mon, 19 Aug 2024 22:02:53 -0700 Subject: [PATCH 2/3] Update log statement Signed-off-by: Shiva Krishna, Merla --- internal/controller/nimcache_controller.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/controller/nimcache_controller.go b/internal/controller/nimcache_controller.go index 650e2683..420890f1 100644 --- a/internal/controller/nimcache_controller.go +++ b/internal/controller/nimcache_controller.go @@ -782,7 +782,7 @@ func (r *NIMCacheReconciler) reconcileNIMCache(ctx context.Context, nimCache *ap requeue, err := r.reconcileModelManifest(ctx, nimCache) if err != nil { - logger.Error(err, "reconciliation of model manifest failed", "pod", getPodName(nimCache)) + logger.Error(err, "reconciliation to extract model manifest failed", "pod", getPodName(nimCache)) return ctrl.Result{}, err } @@ -794,7 +794,7 @@ func (r *NIMCacheReconciler) reconcileNIMCache(ctx context.Context, nimCache *ap // Reconcile NIM model selection err = r.reconcileModelSelection(ctx, nimCache) if err != nil { - logger.Error(err, "reconciliation of model selection failed", "pod", getPodName(nimCache)) + logger.Error(err, "reconciliation of model selection failed") return ctrl.Result{}, err } From 12e3447c39072a4f845dc6cb8c54482b135b55c0 Mon Sep 17 00:00:00 2001 From: "Shiva Krishna, Merla" Date: Tue, 20 Aug 2024 16:38:44 -0700 Subject: [PATCH 3/3] Extract model manifest config only from NGC model pullers (NIM images) Signed-off-by: Shiva Krishna, Merla --- internal/controller/nimcache_controller.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/internal/controller/nimcache_controller.go b/internal/controller/nimcache_controller.go index 420890f1..21700d23 100644 --- a/internal/controller/nimcache_controller.go +++ b/internal/controller/nimcache_controller.go @@ -498,6 +498,11 @@ func getSelectedProfiles(nimCache *appsv1alpha1.NIMCache) ([]string, error) { func (r *NIMCacheReconciler) reconcileModelManifest(ctx context.Context, nimCache *appsv1alpha1.NIMCache) (requeue bool, err error) { logger := r.GetLogger() + // Model manifest is available only for NGC model pullers + if nimCache.Spec.Source.NGC == nil { + return false, nil + } + existingConfig := &corev1.ConfigMap{} cmName := getManifestConfigName(nimCache) err = r.Get(ctx, client.ObjectKey{Name: cmName, Namespace: nimCache.Namespace}, existingConfig)