Skip to content

Commit

Permalink
Adding parser for 1.4 model manifest (#236)
Browse files Browse the repository at this point in the history
* Adding parser for 1.4 model manifest

Signed-off-by: Vishesh Tanksale <[email protected]>

* Updating the profile selection logic

Signed-off-by: Vishesh Tanksale <[email protected]>

* Updating the profile selection logic

Signed-off-by: Vishesh Tanksale <[email protected]>

* Updating description of new field

Signed-off-by: Vishesh Tanksale <[email protected]>

* Updating description of new field

Signed-off-by: Vishesh Tanksale <[email protected]>

---------

Signed-off-by: Vishesh Tanksale <[email protected]>
  • Loading branch information
visheshtanksale authored Nov 21, 2024
1 parent d3e8a73 commit edb985e
Show file tree
Hide file tree
Showing 12 changed files with 667 additions and 273 deletions.
2 changes: 2 additions & 0 deletions api/apps/v1alpha1/nimcache_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ type ModelSpec struct {
GPUs []GPUSpec `json:"gpus,omitempty"`
// Lora indicates a fine-tuned model with LoRA adapters
Lora *bool `json:"lora,omitempty"`
// Buildable indicates generic model profiles that can be optimized with an NVIDIA engine for any GPUs
Buildable *bool `json:"buildable,omitempty"`
}

// GPUSpec is the spec required to cache models for selected gpu type
Expand Down
5 changes: 5 additions & 0 deletions api/apps/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions bundle/manifests/apps.nvidia.com_nimcaches.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,11 @@ spec:
model:
description: Model spec for caching
properties:
buildable:
description: Buildable indicates generic model profiles
that can be optimized with an NVIDIA engine for any
GPUs
type: boolean
engine:
description: Engine is the backend engine (tensorrt_llm,
vllm)
Expand Down
5 changes: 5 additions & 0 deletions config/crd/bases/apps.nvidia.com_nimcaches.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,11 @@ spec:
model:
description: Model spec for caching
properties:
buildable:
description: Buildable indicates generic model profiles
that can be optimized with an NVIDIA engine for any
GPUs
type: boolean
engine:
description: Engine is the backend engine (tensorrt_llm,
vllm)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,11 @@ spec:
model:
description: Model spec for caching
properties:
buildable:
description: Buildable indicates generic model profiles
that can be optimized with an NVIDIA engine for any
GPUs
type: boolean
engine:
description: Engine is the backend engine (tensorrt_llm,
vllm)
Expand Down
24 changes: 14 additions & 10 deletions internal/controller/nimcache_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (
platform "github.com/NVIDIA/k8s-nim-operator/internal/controller/platform"
"github.com/NVIDIA/k8s-nim-operator/internal/k8sutil"
"github.com/NVIDIA/k8s-nim-operator/internal/nimparser"
nimparserutils "github.com/NVIDIA/k8s-nim-operator/internal/nimparser/utils"
"github.com/NVIDIA/k8s-nim-operator/internal/render"
"github.com/NVIDIA/k8s-nim-operator/internal/shared"
"github.com/NVIDIA/k8s-nim-operator/internal/utils"
Expand Down Expand Up @@ -621,16 +622,17 @@ func (r *NIMCacheReconciler) reconcileModelManifest(ctx context.Context, nimCach
return true, nil
}

parser := nimparserutils.GetNIMParser([]byte(output))
// Parse the file
manifest, err := nimparser.ParseModelManifestFromRawOutput([]byte(output))
manifest, err := parser.ParseModelManifestFromRawOutput([]byte(output))
if err != nil {
logger.Error(err, "Failed to parse model manifest from the pod")
return false, err
}
logger.V(2).Info("manifest file", "nimcache", nimCache.Name, "manifest", manifest)

// Create a ConfigMap with the model manifest file for re-use
err = r.createManifestConfigMap(ctx, nimCache, manifest)
err = r.createManifestConfigMap(ctx, nimCache, &manifest)
if err != nil {
logger.Error(err, "Failed to create model manifest config map")
return false, err
Expand Down Expand Up @@ -670,7 +672,7 @@ func (r *NIMCacheReconciler) reconcileModelSelection(ctx context.Context, nimCac
}

// Match profiles with user input
profiles, err := nimparser.MatchProfiles(nimCache.Spec.Source.NGC.Model, *nimManifest, discoveredGPUs)
profiles, err := nimManifest.MatchProfiles(nimCache.Spec.Source.NGC.Model, discoveredGPUs)
if err != nil {
logger.Error(err, "Failed to match profiles for given model parameters")
return err
Expand Down Expand Up @@ -758,17 +760,18 @@ func (r *NIMCacheReconciler) reconcileJobStatus(ctx context.Context, nimCache *a
logger.V(2).Info("model manifest config", "manifest", nimManifest)

// for selected profiles, update relevant info for status
for profileName, profileData := range *nimManifest {
for _, profileName := range nimManifest.GetProfilesList() {
for _, selectedProfile := range selectedProfiles {
if profileName == selectedProfile {
nimCache.Status.Profiles = append(nimCache.Status.Profiles, appsv1alpha1.NIMProfile{
Name: profileName,
Model: profileData.Model,
Config: profileData.Tags,
Release: profileData.Release,
Model: nimManifest.GetProfileModel(profileName),
Config: nimManifest.GetProfileTags(profileName),
Release: nimManifest.GetProfileRelease(profileName),
})
}
}

}
}

Expand Down Expand Up @@ -1241,7 +1244,7 @@ func (r *NIMCacheReconciler) createCertVolumesAndMounts(ctx context.Context, nim
}

// extractNIMManifest extracts the NIMManifest from the ConfigMap data
func (r *NIMCacheReconciler) extractNIMManifest(ctx context.Context, configName, namespace string) (*nimparser.NIMManifest, error) {
func (r *NIMCacheReconciler) extractNIMManifest(ctx context.Context, configName, namespace string) (nimparser.NIMManifestInterface, error) {
configMap, err := r.getConfigMap(ctx, configName, namespace)
if err != nil {
return nil, fmt.Errorf("unable to get ConfigMap %s: %w", configName, err)
Expand All @@ -1252,15 +1255,16 @@ func (r *NIMCacheReconciler) extractNIMManifest(ctx context.Context, configName,
return nil, fmt.Errorf("model_manifest.yaml not found in ConfigMap")
}

manifest, err := nimparser.ParseModelManifestFromRawOutput([]byte(data))
parser := nimparserutils.GetNIMParser([]byte(data))
manifest, err := parser.ParseModelManifestFromRawOutput([]byte(data))
if err != nil {
return nil, fmt.Errorf("failed to unmarshal manifest data: %w", err)
}
return manifest, nil
}

// createManifestConfigMap creates a ConfigMap with the given model manifest data
func (r *NIMCacheReconciler) createManifestConfigMap(ctx context.Context, nimCache *appsv1alpha1.NIMCache, manifestData *nimparser.NIMManifest) error {
func (r *NIMCacheReconciler) createManifestConfigMap(ctx context.Context, nimCache *appsv1alpha1.NIMCache, manifestData *nimparser.NIMManifestInterface) error {
// Convert manifestData to YAML
manifestBytes, err := yaml.Marshal(manifestData)
if err != nil {
Expand Down
16 changes: 9 additions & 7 deletions internal/controller/nimcache_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ import (

appsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1"
"github.com/NVIDIA/k8s-nim-operator/internal/k8sutil"
"github.com/NVIDIA/k8s-nim-operator/internal/nimparser"
nimparserv1 "github.com/NVIDIA/k8s-nim-operator/internal/nimparser/v1"
)

var _ = Describe("NIMCache Controller", func() {
Expand Down Expand Up @@ -79,11 +79,11 @@ var _ = Describe("NIMCache Controller", func() {

// Create a model manifest configmap, as we cannot run a sample NIM container to extract for tests
filePath := filepath.Join("testdata", "manifest_trtllm.yaml")
nimparser := nimparserv1.NIMParser{}
manifestData, err := nimparser.ParseModelManifest(filePath)
Expect(err).NotTo(HaveOccurred())
Expect(*manifestData).To(HaveLen(2))

err = reconciler.createManifestConfigMap(context.TODO(), nimCache, manifestData)
err = reconciler.createManifestConfigMap(context.TODO(), nimCache, &manifestData)
Expect(err).NotTo(HaveOccurred())

// Verify that the ConfigMap was created
Expand Down Expand Up @@ -601,11 +601,12 @@ var _ = Describe("NIMCache Controller", func() {
}

filePath := filepath.Join("testdata", "manifest_trtllm.yaml")

nimparser := nimparserv1.NIMParser{}
manifestData, err := nimparser.ParseModelManifest(filePath)
Expect(err).NotTo(HaveOccurred())
Expect(*manifestData).To(HaveLen(2))

err = reconciler.createManifestConfigMap(ctx, nimCache, manifestData)
err = reconciler.createManifestConfigMap(ctx, nimCache, &manifestData)
Expect(err).NotTo(HaveOccurred())

// Verify that the ConfigMap was created
Expand All @@ -618,8 +619,9 @@ var _ = Describe("NIMCache Controller", func() {
extractedManifest, err := reconciler.extractNIMManifest(ctx, createdConfigMap.Name, createdConfigMap.Namespace)
Expect(err).NotTo(HaveOccurred())
Expect(extractedManifest).NotTo(BeNil())
Expect(*extractedManifest).To(HaveLen(2))
profile, exists := (*extractedManifest)["03fdb4d11f01be10c31b00e7c0540e2835e89a0079b483ad2dd3c25c8cc29b61"]
nimManifest := extractedManifest.(nimparserv1.NIMManifest)

profile, exists := (nimManifest)["03fdb4d11f01be10c31b00e7c0540e2835e89a0079b483ad2dd3c25c8cc29b61"]
Expect(exists).To(BeTrue())
Expect(profile.Model).To(Equal("meta/llama3-70b-instruct"))
Expect(profile.Tags["llm_engine"]).To(Equal("tensorrt_llm"))
Expand Down
Loading

0 comments on commit edb985e

Please sign in to comment.