From 28d1c71fb67c97ba90fd2019098420db6e1133fb Mon Sep 17 00:00:00 2001
From: Mike McKiernan
Date: Tue, 6 Aug 2024 09:21:09 -0400
Subject: [PATCH] Add developer install and caching

Signed-off-by: Mike McKiernan
---
 docs/nimcache.md   | 180 +++++++++++++++++++++++++++++++++++++++++++++
 docs/nimservice.md | 143 +++++++++++++++++++++++++++++++++++
 2 files changed, 323 insertions(+)
 create mode 100644 docs/nimcache.md
 create mode 100644 docs/nimservice.md

diff --git a/docs/nimcache.md b/docs/nimcache.md
new file mode 100644
index 000000000..aa4a9f61a
--- /dev/null
+++ b/docs/nimcache.md
@@ -0,0 +1,180 @@

# Caching NIM Models

Follow these steps to cache NIM models in a persistent volume.

## Prerequisites

* NVIDIA GPU Operator is installed.
* NVIDIA NIM Operator is installed.
* You have an active subscription to an NVIDIA AI Enterprise product or are an
  [NVIDIA Developer Program member](https://build.nvidia.com/explore/discover?integrate_nim=true&developer_enroll=true&self_hosted_api=true&signin=true).
  Access to the containers and models for NVIDIA NIM microservices is restricted.

* A persistent volume provisioner is installed.

  The Local Path Provisioner from Rancher is acceptable for development on a single-node cluster.

## 1. Create a Namespace for Running NIM Microservices

```sh
kubectl create ns nim-service
```

## 2. Create an Image Pull Secret for the NIM Container

Replace `<ngc-api-key>` with your NGC CLI API key:

```sh
kubectl create secret -n nim-service docker-registry ngc-secret \
    --docker-server=nvcr.io \
    --docker-username='$oauthtoken' \
    --docker-password=<ngc-api-key>
```

The `NIMCache` resource in the next step also references a generic secret, `ngc-api-secret`, that the model puller uses to authenticate with NGC.
Create it from the same API key:

```sh
kubectl create secret -n nim-service generic ngc-api-secret \
    --from-literal=NGC_API_KEY=<ngc-api-key>
```

## 3. Create the NIM Cache Instance and Enable Model Auto-Detection

Update the `NIMCache` custom resource (CR) with appropriate values for model selection, such as `model.precision`, `model.engine`, `model.qosProfile`, `model.gpu.product`, and `model.gpu.ids`.
From these values, the NIM Operator detects the supported model profiles and uses them for caching.
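Conceptually, auto-detection filters the list of profiles that ship with a NIM against the constraints in the `model` fields. The following sketch is purely illustrative — the profile records and field names are hypothetical stand-ins, not the NIM Operator's actual data model:

```python
# Illustrative sketch only: select the profiles whose attributes match the
# constraints from a NIMCache spec (hypothetical data, not the operator's API).
profiles = [
    {"precision": "fp8", "engine": "tensorrt_llm", "qos": "throughput",
     "gpu_device": "26b5:10de", "tp": "1"},
    {"precision": "fp16", "engine": "vllm", "qos": "latency",
     "gpu_device": "20b2:10de", "tp": "2"},
]

# Constraints taken from the example CR: fp8, TensorRT-LLM, throughput, L40S (26b5).
constraints = {"precision": "fp8", "engine": "tensorrt_llm", "qos": "throughput"}
gpu_id = "26b5"

selected = [
    p for p in profiles
    if all(p[key] == value for key, value in constraints.items())
    and p["gpu_device"].split(":")[0] == gpu_id  # PCI device ID, vendor part dropped
]
print(selected)  # only the fp8 / tensorrt_llm / throughput / 26b5 profile remains
```

A profile is selected only when every requested attribute matches, including the PCI device ID given in `model.gpu.ids`.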
Alternatively, if you specify `model.profiles`, the model puller downloads and caches only those model profiles.

Create a file, such as `nimcache.yaml`, with contents like the following example:

```yaml
apiVersion: apps.nvidia.com/v1alpha1
kind: NIMCache
metadata:
  labels:
    app.kubernetes.io/name: k8s-nim-operator
    app.kubernetes.io/managed-by: kustomize
  name: meta-llama3-8b-instruct
spec:
  source:
    ngc:
      modelPuller: nvcr.io/nim/meta/llama3-8b-instruct:1.0.0
      pullSecret: ngc-secret
      authSecret: ngc-api-secret
      model:
        profiles: []
        autoDetect: true
        precision: "fp8"
        engine: "tensorrt_llm"
        qosProfile: "throughput"
        gpu:
          product: "l40s"
          ids:
          - "26b5"
        tensorParallelism: "1"
  storage:
    pvc:
      create: true
      storageClass: "local-path"
      size: "50Gi"
      volumeAccessMode: ReadWriteOnce
```

## 4. Create the CR

```sh
kubectl create -f nimcache.yaml -n nim-service
```

## 5. Verify the Progress of NIM Model Caching

Verify that the NIM Operator started the caching job, and track the status through the CR:

```sh
kubectl get nimcache -n nim-service -o wide
```

```output
NAME                      STATUS   PVC                           AGE
meta-llama3-8b-instruct   ready    meta-llama3-8b-instruct-pvc   2024-07-04T23:22:13Z
```

Get the NIM cache so that you can view the detailed status:

```sh
kubectl get nimcache -n nim-service -o yaml
```

```output
apiVersion: apps.nvidia.com/v1alpha1
kind: NIMCache
metadata:
  annotations:
    nvidia.com/selected-profiles: '["09e2f8e68f78ce94bf79d15b40a21333cea5d09dbe01ede63f6c957f4fcfab7b"]'
  creationTimestamp: "2024-07-04T23:22:13Z"
  finalizers:
  - finalizer.nimcache.apps.nvidia.com
  generation: 2
  labels:
    app.kubernetes.io/managed-by: kustomize
    app.kubernetes.io/name: k8s-nim-operator
  name: meta-llama3-8b-instruct
  namespace: nim-service
  resourceVersion: "16539047"
  uid: 81bda896-5ce2-4d63-b082-27c9a963250a
spec:
  source:
    ngc:
      authSecret: ngc-api-secret
      model:
        autoDetect: true
        engine: tensorrt_llm
        gpu:
          ids:
          - 26b5
          product: l40s
        precision: fp8
        qosProfile: throughput
        tensorParallelism: "1"
      modelPuller: nvcr.io/nim/meta/llama3-8b-instruct:1.0.3
      pullSecret: ngc-secret
  storage:
    pvc:
      create: true
      size: 50Gi
      storageClass: local-path
      volumeAccessMode: ReadWriteOnce
status:
  conditions:
  - lastTransitionTime: "2024-07-04T23:22:13Z"
    message: The PVC has been created for caching NIM
    reason: PVCCreated
    status: "True"
    type: NIM_CACHE_PVC_CREATED
  - lastTransitionTime: "2024-07-05T22:13:11Z"
    message: The Job to cache NIM has been created
    reason: JobCreated
    status: "True"
    type: NIM_CACHE_JOB_CREATED
  - lastTransitionTime: "2024-07-05T22:13:27Z"
    message: The Job to cache NIM is in pending state
    reason: JobPending
    status: "True"
    type: NIM_CACHE_JOB_PENDING
  - lastTransitionTime: "2024-07-05T22:13:27Z"
    message: The Job to cache NIM has successfully completed
    reason: JobCompleted
    status: "True"
    type: NIM_CACHE_JOB_COMPLETED
  profiles:
  - model: meta/llama3-8b-instruct
    release: 1.0.0
    tags:
      feat_lora: "false"
      gpu: A100
      gpu_device: 20b2:10de
      llm_engine: tensorrt_llm
      pp: "1"
      precision: fp16
      profile: latency
      tp: "2"
  pvc: meta-llama3-8b-instruct-pvc
  state: ready
```
diff --git a/docs/nimservice.md b/docs/nimservice.md
new file mode 100644
index 000000000..8cd46b17d
--- /dev/null
+++ b/docs/nimservice.md
@@ -0,0 +1,143 @@

# Create a NIM Service

## Prerequisites

* A `NIMCache` instance in the namespace `nim-service`.

## 1. Create the NIM Service Instance

Create a file, such as `nimservice.yaml`, with contents like the following example:

```yaml
apiVersion: apps.nvidia.com/v1alpha1
kind: NIMService
metadata:
  labels:
    app.kubernetes.io/name: k8s-nim-operator
    app.kubernetes.io/managed-by: kustomize
  name: meta-llama3-8b-instruct
spec:
  image:
    repository: nvcr.io/nim/meta/llama3-8b-instruct
    tag: 1.0.0
    pullPolicy: IfNotPresent
    pullSecrets:
    - ngc-secret
  authSecret: ngc-api-secret
  nimCache:
    name: meta-llama3-8b-instruct
    profile: ''
  scale:
    minReplicas: 1
  resources:
    limits:
      nvidia.com/gpu: 1
  expose:
    service:
      type: ClusterIP
      openaiPort: 8000
```

Apply the manifest:

```sh
kubectl create -f nimservice.yaml -n nim-service
```

## 2. Check the Status of the NIM Service Deployment

```sh
kubectl get nimservice -n nim-service
```

```output
NAME                      STATUS   AGE
meta-llama3-8b-instruct   Ready    115m
```

```sh
kubectl get pods -n nim-service
```

```output
NAME                                      READY   STATUS      RESTARTS   AGE
meta-llama3-8b-instruct-db9d899fd-mfmq2   1/1     Running     0          108m
meta-llama3-8b-instruct-job-xktnk         0/1     Completed   0          4m38s
```

## 3. Verify That the Microservice Is Running

Create a file, such as `verify-pod.yaml`, with contents like the following example:

```yaml
---
apiVersion: v1
kind: Pod
metadata:
  name: verify-streaming-chat
spec:
  containers:
  - name: curl
    image: curlimages/curl:8.6.0
    command: ['curl']
    args:
    - -X
    - "POST"
    - 'http://meta-llama3-8b-instruct:8000/v1/chat/completions'
    - -H
    - 'accept: application/json'
    - -H
    - 'Content-Type: application/json'
    - --fail-with-body
    - -d
    - |
      {
        "model": "meta/llama3-8b-instruct",
        "messages": [
          {
            "role":"user",
            "content":"Hello there how are you?",
            "name": "aleks"
          },
          {
            "role":"assistant",
            "content":"How may I help you?"
          },
          {
            "role":"user",
            "content":"Do something for me?"
+ } + ], + "top_p": 1, + "n": 1, + "max_tokens": 15, + "stream": true, + "frequency_penalty": 1.0, + "stop": ["hello"] + } + restartPolicy: Never +``` + +Apply the manifest: + +```sh +kubectl create -f test-pod.yaml -n nim-service +``` + +Confirm the verification pod ran to completion: + +```sh +kubectl get pods -n nim-service +``` + +```console +NAME READY STATUS RESTARTS AGE +meta-llama3-8b-instruct-latest-db9d899fd-mfmq2 1/1 Running 0 112m +meta-llama3-8b-instruct-latest-job-xktnk 0/1 Completed 0 8m8s +verify-streaming-chat 0/1 Completed 0 99m +```