From 8029c4099f2cfa43d0a17f5dc22acdfdee34a580 Mon Sep 17 00:00:00 2001 From: Shobhit Date: Sun, 21 Jul 2024 19:52:57 +0800 Subject: [PATCH 1/2] Added demo chart. Version is functional on single GPU system pending testing. --- examples/kubernetes/README.md | 91 +++++++--- examples/kubernetes/llama-cpp/Chart.yaml | 6 - .../kubernetes/llama-cpp/templates/NOTES.txt | 28 --- .../llama-cpp/templates/deployment.yaml | 102 ----------- .../kubernetes/llama-cpp/templates/hpa.yaml | 32 ---- .../templates/ingress-completions.yaml | 64 ------- .../templates/ingress-embeddings.yaml | 64 ------- .../kubernetes/llama-cpp/templates/jobs.yaml | 66 ------- .../llama-cpp/templates/pod-monitor.yaml | 16 -- .../kubernetes/llama-cpp/templates/pvc.yaml | 17 -- .../llama-cpp/templates/service.yaml | 15 -- .../templates/tests/test-connection.yaml | 15 -- examples/kubernetes/llama-cpp/values.yaml | 121 ------------- .../{llama-cpp => llamacpp}/.helmignore | 0 examples/kubernetes/llamacpp/Chart.lock | 9 + examples/kubernetes/llamacpp/Chart.yaml | 32 ++++ .../llamacpp/charts/embedding/.helmignore | 23 +++ .../llamacpp/charts/embedding/Chart.yaml | 24 +++ .../charts/embedding/templates/NOTES.txt | 0 .../embedding/templates/PersistentVolume.yaml | 21 +++ .../charts/embedding}/templates/_helpers.tpl | 20 +-- .../charts/embedding/templates/configMap.yaml | 8 + .../embedding/templates/deployment.yaml | 161 ++++++++++++++++++ .../charts/embedding/templates/hpa.yaml | 24 +++ .../templates/persistentvolumeclaim.yaml | 18 ++ .../charts/embedding/templates/service.yaml | 22 +++ .../templates/tests/test-connection.yaml | 15 ++ .../llamacpp/charts/embedding/values.yaml | 16 ++ .../llamacpp/charts/modelRunner/.helmignore | 23 +++ .../llamacpp/charts/modelRunner/Chart.yaml | 24 +++ .../charts/modelRunner/templates/NOTES.txt | 0 .../templates/PersistentVolume.yaml | 21 +++ .../charts/modelRunner/templates/_helpers.tpl | 64 +++++++ .../modelRunner/templates/configMap.yaml | 8 + 
.../modelRunner/templates/deployment.yaml | 161 ++++++++++++++++++ .../charts/modelRunner/templates/hpa.yaml | 24 +++ .../templates/persistentvolumeclaim.yaml | 18 ++ .../charts/modelRunner/templates/service.yaml | 22 +++ .../charts/modelRunner/templates/sidecar.yaml | 29 ++++ .../llamacpp/charts/modelRunner/values.yaml | 17 ++ .../kubernetes/llamacpp/templates/NOTES.txt | 1 + .../llamacpp/templates/_helpers.tpl | 66 +++++++ .../llamacpp/templates/ingress.yaml | 85 +++++++++ examples/kubernetes/llamacpp/values.yaml | 108 ++++++++++++ 44 files changed, 1118 insertions(+), 583 deletions(-) delete mode 100644 examples/kubernetes/llama-cpp/Chart.yaml delete mode 100644 examples/kubernetes/llama-cpp/templates/NOTES.txt delete mode 100644 examples/kubernetes/llama-cpp/templates/deployment.yaml delete mode 100644 examples/kubernetes/llama-cpp/templates/hpa.yaml delete mode 100644 examples/kubernetes/llama-cpp/templates/ingress-completions.yaml delete mode 100644 examples/kubernetes/llama-cpp/templates/ingress-embeddings.yaml delete mode 100644 examples/kubernetes/llama-cpp/templates/jobs.yaml delete mode 100644 examples/kubernetes/llama-cpp/templates/pod-monitor.yaml delete mode 100644 examples/kubernetes/llama-cpp/templates/pvc.yaml delete mode 100644 examples/kubernetes/llama-cpp/templates/service.yaml delete mode 100644 examples/kubernetes/llama-cpp/templates/tests/test-connection.yaml delete mode 100644 examples/kubernetes/llama-cpp/values.yaml rename examples/kubernetes/{llama-cpp => llamacpp}/.helmignore (100%) create mode 100644 examples/kubernetes/llamacpp/Chart.lock create mode 100644 examples/kubernetes/llamacpp/Chart.yaml create mode 100644 examples/kubernetes/llamacpp/charts/embedding/.helmignore create mode 100644 examples/kubernetes/llamacpp/charts/embedding/Chart.yaml create mode 100644 examples/kubernetes/llamacpp/charts/embedding/templates/NOTES.txt create mode 100644 examples/kubernetes/llamacpp/charts/embedding/templates/PersistentVolume.yaml rename 
examples/kubernetes/{llama-cpp => llamacpp/charts/embedding}/templates/_helpers.tpl (72%) create mode 100644 examples/kubernetes/llamacpp/charts/embedding/templates/configMap.yaml create mode 100644 examples/kubernetes/llamacpp/charts/embedding/templates/deployment.yaml create mode 100644 examples/kubernetes/llamacpp/charts/embedding/templates/hpa.yaml create mode 100644 examples/kubernetes/llamacpp/charts/embedding/templates/persistentvolumeclaim.yaml create mode 100644 examples/kubernetes/llamacpp/charts/embedding/templates/service.yaml create mode 100644 examples/kubernetes/llamacpp/charts/embedding/templates/tests/test-connection.yaml create mode 100644 examples/kubernetes/llamacpp/charts/embedding/values.yaml create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/.helmignore create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/Chart.yaml create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/templates/NOTES.txt create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/templates/PersistentVolume.yaml create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/templates/_helpers.tpl create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/templates/configMap.yaml create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/templates/deployment.yaml create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/templates/hpa.yaml create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/templates/persistentvolumeclaim.yaml create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/templates/service.yaml create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/templates/sidecar.yaml create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/values.yaml create mode 100644 examples/kubernetes/llamacpp/templates/NOTES.txt create mode 100644 examples/kubernetes/llamacpp/templates/_helpers.tpl create mode 100644 
examples/kubernetes/llamacpp/templates/ingress.yaml create mode 100644 examples/kubernetes/llamacpp/values.yaml diff --git a/examples/kubernetes/README.md b/examples/kubernetes/README.md index 5a0806977ae77..1a9b10fd5ce76 100644 --- a/examples/kubernetes/README.md +++ b/examples/kubernetes/README.md @@ -1,5 +1,21 @@ # llama.cpp/example/kubernetes + +## Setup kubernetes + +You can use microk8s to setup a kubernetes cluster on your local machine. + +Once downloaded enable the following addons for the cluster: + +```shell +microk8s enable dns storage registry helm3 gpu +``` + +You can also set up your system to use the microk8s kubectl [here](https://microk8s.io/docs/working-with-kubectl). + + +## Usage + This example demonstrates how to deploy [llama.cpp server](../server) on a [kubernetes cluster](https://kubernetes.io). ![llama.cpp.kubernetes.png](llama.cpp.kubernetes.png) @@ -10,19 +26,48 @@ We provide an [Helm chart](https://helm.sh/) repository to deploy llama.cpp at helm repo add llama.cpp https://ggerganov.github.io/llama.cpp helm repo update -helm install example llama-cpp --namespace llama-cpp --create-namespace +helm install example llamacpp --namespace llama-cpp --create-namespace ``` -## Prerequisites +This chart features 2 subcharts that can be deployed independently: +1. modelRunner: Responsible for completion +2. 
embedding: Responsible for embeddings + +In order to set the various parameters for the deployment, you can use the `values.yaml` file: + +```yaml + +modelRunner: + fullname: "modelrunner" + service: + type: ClusterIP + port: 8080 + modelPath: + val: + models: { + "model1":{ + "enabled": true, + "download": true, + "replicas": 3, + "device": "cpu", + "autoScale": { + "enabled": false, + "minReplicas": 1, + "maxReplicas": 100, + "targetCPUUtilizationPercentage": 80 + }, + "url": "https://huggingface.co/TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF/resolve/main/capybarahermes-2.5-mistral-7b.Q4_0.gguf", + "image": "ghcr.io/ggerganov/llama.cpp:server", + "endpoint": "/model1" + } + } -Obviously you need a kubernetes cluster. +``` -Required access to an API server with the following `roles`: +Adjust the model path to a local directory that stores the models. The models are downloaded from the provided URL and stored in the local directory. The models are then mounted to the pod. -- verbs: `["get", "list", "watch", "create", "update", "patch", "delete"]` -- resources: `["pods", "deployments", "services", "pvc", "jobs", "ingresses]` +You can also adjust the number of replicas, the device, the image, the endpoint, and the autoscaling parameters. -If you do not have a real k8s cluster, you can give a try to [kind](https://kind.sigs.k8s.io/). 
### Metrics monitoring @@ -38,29 +83,21 @@ helm install \ --namespace monitoring ``` -## Goals -Deploy a production ready LLM API over kubernetes, including: -- High availability -- multi models -- support of embeddings and completions models -- load balancing -- Auto scaling -- Security +## Feature set for the Helm chart + +- [x] High availability +- [x] Multi models +- [x] Support of embeddings and completions models +- [ ] Load balancing +- [x] Auto scaling +- [x] CUDA support +- [x] Downloading functionality -### Limitations -This example does not cover [NVidia based docker engine](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html), the target architecture remains the same, just switch to [cuda based images](../../.devops/server-cuda.Dockerfile). +## Pending testing -## Proposed architectures +- [ ] Load balancing +- [ ] multi GPU support using MiG for kubernetes [docs](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html) & [microk8s](https://microk8s.io/docs/addon-gpu) -**Constraints:** -- llama.cpp server is mono model -- GGUF models files are heavy (even quantized) -**Approach** -1. Models file are downloaded once on a `PV` by a `Job` when the stack is deployed -2. Server `Deployment` is using an init containers to verify if the model is downloaded -3. `Ingress` rules are routing incoming request to the target models -3. `Probes` are used to monitor the `pods` healthiness -4. 
[Prometheus](https://prometheus.io/) is used as the metrics server diff --git a/examples/kubernetes/llama-cpp/Chart.yaml b/examples/kubernetes/llama-cpp/Chart.yaml deleted file mode 100644 index 02cce93ef28c3..0000000000000 --- a/examples/kubernetes/llama-cpp/Chart.yaml +++ /dev/null @@ -1,6 +0,0 @@ -apiVersion: v2 -name: llama-cpp -description: llama.cpp Helm chart for Kubernetes -type: application -version: 0.0.1 -appVersion: "77d1ac7e00bf049b9f2bba1b5a310a78318c49c4" diff --git a/examples/kubernetes/llama-cpp/templates/NOTES.txt b/examples/kubernetes/llama-cpp/templates/NOTES.txt deleted file mode 100644 index 44d5a115a3d42..0000000000000 --- a/examples/kubernetes/llama-cpp/templates/NOTES.txt +++ /dev/null @@ -1,28 +0,0 @@ -1. Get the application URL by running these commands: -{{- if .Values.ingresses.completions.enabled }} -{{- range $host := .Values.ingresses.completions.hosts }} - {{- range .paths }} - http{{ if $.Values.ingresses.completions.tls }}s{{ end }}://{{ if .host }}{{ .host }}{{else}}localhost{{ end }}{{ .path }} --data '{"messages": [{"role": "user", "message":"hello llama.cpp"}]}' - {{- end }} -{{- end }} -{{- else if .Values.ingresses.embeddings.enabled }} -{{- range $host := .Values.ingresses.embeddings.hosts }} - {{- range .paths }} - curl http{{ if $.Values.ingresses.embeddings.tls }}s{{ end }}://{{ $host.host }}{{ .path }} --data '{"input": "hello llama.cpp"}' -a {{- end }} -{{- end }} -{{- else if contains "NodePort" .Values.service.type }} - export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "server.llama.cpp.fullname" . }}) - export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") - echo http://$NODE_IP:$NODE_PORT -{{- else if contains "LoadBalancer" .Values.service.type }} - NOTE: It may take a few minutes for the LoadBalancer IP to be available. 
- You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "server.llama.cpp.fullname" . }}' - export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "server.llama.cpp.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") - echo http://$SERVICE_IP:{{ .Values.service.port }} -{{- else if contains "ClusterIP" .Values.service.type }} - export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "server.llama.cpp.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") - export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") - echo "Visit http://127.0.0.1:8080 to use your application" - kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT -{{- end }} diff --git a/examples/kubernetes/llama-cpp/templates/deployment.yaml b/examples/kubernetes/llama-cpp/templates/deployment.yaml deleted file mode 100644 index 223b5dd8a7d68..0000000000000 --- a/examples/kubernetes/llama-cpp/templates/deployment.yaml +++ /dev/null @@ -1,102 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "server.llama.cpp.fullname" . }} - labels: - {{- include "server.llama.cpp.labels" . | nindent 4 }} -spec: - {{- if not .Values.autoscaling.enabled }} - replicas: {{ .Values.replicaCount }} - {{- end }} - selector: - matchLabels: - {{- include "server.llama.cpp.selectorLabels" . | nindent 6 }} - template: - metadata: - annotations: - {{- include "server.llama.cpp.labels" . | nindent 8 }} - {{- if .Values.server.metrics }} - prometheus.io/scrape: 'true' - prometheus.io/port: '{{ .Values.server.port }}' - {{- end }} - {{- with .Values.podAnnotations }} - {{- toYaml . 
| nindent 8 }} - {{- end }} - labels: - prometheus.io/scrape: 'true' - {{- include "server.llama.cpp.labels" . | nindent 8 }} - {{- with .Values.podLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - spec: - {{- with .Values.imagePullSecrets }} - imagePullSecrets: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} - containers: - - name: {{ .Chart.Name }} - securityContext: - {{- toYaml .Values.securityContext | nindent 12 }} - image: "{{ .Values.images.server.repository }}:{{ .Values.images.server.name }}-{{ .Values.images.server.tag | default .Chart.AppVersion }}" - imagePullPolicy: {{ .Values.images.pullPolicy }} - resources: - {{- toYaml .Values.resources | nindent 12 }} - command: - - {{ .Values.server.command }} - args: - - --host - - {{ .Values.server.host }} - - --port - - "{{ .Values.server.port }}" - - --model - - {{ .Values.model.path }}/{{ regexReplaceAll "(.*/)?([^/]+).gguf" .Values.model.file "${2}.gguf" }} - - --cont-batching - - --alias - - {{ .Values.model.alias }} - - --ctx-size - - "{{ .Values.server.kvCache.size }}" - - --parallel - - "{{ .Values.server.slots }}" - {{- if .Values.server.embeddings }} - - --embedding - {{- end }} - {{- if .Values.server.metrics }} - - --metrics - {{- end }} - - --log-format - - {{ .Values.server.log.format }} - {{- if .Values.server.log.disabled }} - - --log-disable - {{- end }} - {{- with .Values.server.extraArgs }} - {{- toYaml . | nindent 12 }} - {{- end }} - ports: - - name: http - containerPort: {{ .Values.server.port }} - protocol: TCP - {{- with .Values.volumeMounts }} - volumeMounts: - {{- toYaml . 
| nindent 12 }} - {{- end }} - volumeMounts: - - mountPath: {{ .Values.model.path }} - name: models - readOnly: true - volumes: - - name: models - persistentVolumeClaim: - claimName: {{ include "server.llama.cpp.fullname" . }} - readOnly: true diff --git a/examples/kubernetes/llama-cpp/templates/hpa.yaml b/examples/kubernetes/llama-cpp/templates/hpa.yaml deleted file mode 100644 index ad8841bac27ce..0000000000000 --- a/examples/kubernetes/llama-cpp/templates/hpa.yaml +++ /dev/null @@ -1,32 +0,0 @@ -{{- if .Values.autoscaling.enabled }} -apiVersion: autoscaling/v2 -kind: HorizontalPodAutoscaler -metadata: - name: {{ include "server.llama.cpp.fullname" . }} - labels: - {{- include "server.llama.cpp.labels" . | nindent 4 }} -spec: - scaleTargetRef: - apiVersion: apps/v1 - kind: Deployment - name: {{ include "server.llama.cpp.fullname" . }} - minReplicas: {{ .Values.autoscaling.minReplicas }} - maxReplicas: {{ .Values.autoscaling.maxReplicas }} - metrics: - {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} - - type: Resource - resource: - name: cpu - target: - type: Utilization - averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} - {{- end }} - {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} - - type: Resource - resource: - name: memory - target: - type: Utilization - averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} - {{- end }} -{{- end }} diff --git a/examples/kubernetes/llama-cpp/templates/ingress-completions.yaml b/examples/kubernetes/llama-cpp/templates/ingress-completions.yaml deleted file mode 100644 index d1ef1bda4541c..0000000000000 --- a/examples/kubernetes/llama-cpp/templates/ingress-completions.yaml +++ /dev/null @@ -1,64 +0,0 @@ -{{- if and .Values.server.completions .Values.ingresses.completions.enabled -}} -{{- $fullName := include "server.llama.cpp.fullname" . 
-}} -{{- $svcPort := .Values.service.port -}} -{{- if and .Values.ingresses.completions.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} - {{- if not (hasKey .Values.ingresses.completions.annotations "kubernetes.io/ingress.class") }} - {{- $_ := set .Values.ingresses.completions.annotations "kubernetes.io/ingress.class" .Values.ingresses.completions.className}} - {{- end }} -{{- end }} -{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} -apiVersion: networking.k8s.io/v1 -{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} -apiVersion: networking.k8s.io/v1beta1 -{{- else -}} -apiVersion: extensions/v1beta1 -{{- end }} -kind: Ingress -metadata: - name: {{ $fullName }}-completions - labels: - {{- include "server.llama.cpp.labels" . | nindent 4 }} - {{- with .Values.ingresses.completions.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -spec: - {{- if and .Values.ingresses.completions.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} - ingressClassName: {{ .Values.ingresses.completions.className }} - {{- end }} - {{- if .Values.ingresses.completions.tls }} - tls: - {{- range .Values.ingresses.completions.tls }} - - hosts: - {{- range .hosts }} - - {{ . 
| quote }} - {{- end }} - secretName: {{ .secretName }} - {{- end }} - {{- end }} - rules: - {{- range .Values.ingresses.completions.hosts }} - - http: - paths: - {{- range .paths }} - - path: {{ .path }} - {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} - pathType: {{ .pathType }} - {{- end }} - backend: - {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} - service: - name: {{ $fullName }} - port: - number: {{ $svcPort }} - {{- else }} - serviceName: {{ $fullName }} - servicePort: {{ $svcPort }} - {{- end }} - {{- end }} - {{- end }} - {{- if .host }} - host: {{ .host | quote }} - {{- end }} - -{{- end }} diff --git a/examples/kubernetes/llama-cpp/templates/ingress-embeddings.yaml b/examples/kubernetes/llama-cpp/templates/ingress-embeddings.yaml deleted file mode 100644 index 1085d62580e46..0000000000000 --- a/examples/kubernetes/llama-cpp/templates/ingress-embeddings.yaml +++ /dev/null @@ -1,64 +0,0 @@ -{{- if and .Values.server.embeddings .Values.ingresses.embeddings.enabled -}} -{{- $fullName := include "server.llama.cpp.fullname" . -}} -{{- $svcPort := .Values.service.port -}} -{{- if and .Values.ingresses.embeddings.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} - {{- if not (hasKey .Values.ingresses.embeddings.annotations "kubernetes.io/ingress.class") }} - {{- $_ := set .Values.ingresses.embeddings.annotations "kubernetes.io/ingress.class" .Values.ingresses.embeddings.className}} - {{- end }} -{{- end }} -{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} -apiVersion: networking.k8s.io/v1 -{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} -apiVersion: networking.k8s.io/v1beta1 -{{- else -}} -apiVersion: extensions/v1beta1 -{{- end }} -kind: Ingress -metadata: - name: {{ $fullName }}-embeddings - labels: - {{- include "server.llama.cpp.labels" . 
| nindent 4 }} - {{- with .Values.ingresses.embeddings.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -spec: - {{- if and .Values.ingresses.embeddings.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} - ingressClassName: {{ .Values.ingresses.embeddings.className }} - {{- end }} - {{- if .Values.ingresses.embeddings.tls }} - tls: - {{- range .Values.ingresses.embeddings.tls }} - - hosts: - {{- range .hosts }} - - {{ . | quote }} - {{- end }} - secretName: {{ .secretName }} - {{- end }} - {{- end }} - rules: - {{- range .Values.ingresses.embeddings.hosts }} - - http: - paths: - {{- range .paths }} - - path: {{ .path }} - {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} - pathType: {{ .pathType }} - {{- end }} - backend: - {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} - service: - name: {{ $fullName }} - port: - number: {{ $svcPort }} - {{- else }} - serviceName: {{ $fullName }} - servicePort: {{ $svcPort }} - {{- end }} - {{- end }} - {{- end }} - {{- if .host }} - host: {{ .host | quote }} - {{- end }} - -{{- end }} diff --git a/examples/kubernetes/llama-cpp/templates/jobs.yaml b/examples/kubernetes/llama-cpp/templates/jobs.yaml deleted file mode 100644 index 9142bfbfa996d..0000000000000 --- a/examples/kubernetes/llama-cpp/templates/jobs.yaml +++ /dev/null @@ -1,66 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: {{ include "server.llama.cpp.fullname" . }}-download-model - labels: - {{- include "server.llama.cpp.labels" . | nindent 4 }} -spec: - template: - metadata: - name: {{ include "server.llama.cpp.fullname" . }}-download-model - {{- with .Values.podAnnotations }} - annotations: - {{- toYaml . | nindent 8 }} - {{- end }} - labels: - {{- include "server.llama.cpp.labels" . | nindent 8 }} - {{- with .Values.jobLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - spec: - containers: - - name: {{ include "server.llama.cpp.fullname" . 
}}-download-model - securityContext: - {{- toYaml .Values.securityContext | nindent 12 }} - image: {{ .Values.images.downloader.repository }}:{{ .Values.images.downloader.name }}{{if .Values.images.downloader.tag }}-{{end}}{{ .Values.images.downloader.tag }} - env: - - name: MODEL_PATH - value: {{ .Values.model.path }} - - name: MODEL_FILE - value: {{ regexReplaceAll "(.*/)?([^/]+).gguf" .Values.model.file "${2}.gguf" }} - - name: MODEL_SHA256 - value: {{ .Values.model.sha256 }} - - name: MODEL_DOWNLOAD_REPO - value: {{ .Values.model.repo }} - - name: MODEL_DOWNLOAD_FILE - value: {{ .Values.model.file }} - command: - - sh - - -c - args: - - > - set -eux; - if ! echo "${MODEL_SHA256} *${MODEL_PATH}/${MODEL_FILE}" | sha256sum -c -s - ; then - wget -q -c -O ${MODEL_PATH}/${MODEL_FILE} https://huggingface.co/${MODEL_DOWNLOAD_REPO}/resolve/main/${MODEL_DOWNLOAD_FILE}; - fi - volumeMounts: - - mountPath: {{ .Values.model.path }} - name: models - restartPolicy: OnFailure - volumes: - - name: models - persistentVolumeClaim: - claimName: {{ include "server.llama.cpp.fullname" . }} - readOnly: false - {{- with .Values.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} \ No newline at end of file diff --git a/examples/kubernetes/llama-cpp/templates/pod-monitor.yaml b/examples/kubernetes/llama-cpp/templates/pod-monitor.yaml deleted file mode 100644 index f2a9ba0ce29e4..0000000000000 --- a/examples/kubernetes/llama-cpp/templates/pod-monitor.yaml +++ /dev/null @@ -1,16 +0,0 @@ -{{- if .Values.server.metrics }} -apiVersion: monitoring.coreos.com/v1 -kind: PodMonitor -metadata: - name: {{ include "server.llama.cpp.fullname" . }} - labels: - {{- include "server.llama.cpp.labels" . | nindent 4 }} -spec: - selector: - matchLabels: - {{- include "server.llama.cpp.selectorLabels" . 
| nindent 6 }} - podMetricsEndpoints: - - port: http - interval: 30s - path: /metrics -{{end}} \ No newline at end of file diff --git a/examples/kubernetes/llama-cpp/templates/pvc.yaml b/examples/kubernetes/llama-cpp/templates/pvc.yaml deleted file mode 100644 index e2c40e5acaea4..0000000000000 --- a/examples/kubernetes/llama-cpp/templates/pvc.yaml +++ /dev/null @@ -1,17 +0,0 @@ -kind: PersistentVolumeClaim -apiVersion: v1 -metadata: - name: {{ include "server.llama.cpp.fullname" . }} - labels: - {{- include "server.llama.cpp.labels" . | nindent 4 }} - annotations: - helm.sh/resource-policy: "keep" -spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: {{ .Values.model.size | quote }} -{{- if .Values.persistence.storageClass }} - storageClassName: {{ .Values.persistence.storageClass }} -{{- end }} \ No newline at end of file diff --git a/examples/kubernetes/llama-cpp/templates/service.yaml b/examples/kubernetes/llama-cpp/templates/service.yaml deleted file mode 100644 index 09cab5f400968..0000000000000 --- a/examples/kubernetes/llama-cpp/templates/service.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: {{ include "server.llama.cpp.fullname" . }} - labels: - {{- include "server.llama.cpp.labels" . | nindent 4 }} -spec: - type: {{ .Values.service.type }} - ports: - - port: {{ .Values.service.port }} - targetPort: {{ .Values.server.port }} - protocol: TCP - name: http - selector: - {{- include "server.llama.cpp.selectorLabels" . | nindent 4 }} diff --git a/examples/kubernetes/llama-cpp/templates/tests/test-connection.yaml b/examples/kubernetes/llama-cpp/templates/tests/test-connection.yaml deleted file mode 100644 index 5685bf3421180..0000000000000 --- a/examples/kubernetes/llama-cpp/templates/tests/test-connection.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: "{{ include "server.llama.cpp.fullname" . }}-test-connection" - labels: - {{- include "server.llama.cpp.labels" . 
| nindent 4 }} - annotations: - "helm.sh/hook": test -spec: - containers: - - name: wget - image: busybox - command: ['wget'] - args: ['{{ include "server.llama.cpp.fullname" . }}:{{ .Values.service.port }}/health'] - restartPolicy: Never diff --git a/examples/kubernetes/llama-cpp/values.yaml b/examples/kubernetes/llama-cpp/values.yaml deleted file mode 100644 index 2ac6ed35ad4cc..0000000000000 --- a/examples/kubernetes/llama-cpp/values.yaml +++ /dev/null @@ -1,121 +0,0 @@ -# Default values for server.llama.cpp. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. - -replicaCount: 2 - -images: - server: - repository: ghcr.io/ggerganov/llama.cpp - name: server - tag: - downloader: - repository: busybox - name: 1.36.1 - tag: "glibc" - - pullPolicy: IfNotPresent - # Overrides the image tag whose default is the chart appVersion. - -imagePullSecrets: [ ] -nameOverride: "" -fullnameOverride: "" - -podAnnotations: { } -podLabels: { } - -jobAnnotations: { } -jobLabels: { } - -podSecurityContext: - runAsNonRoot: true - -securityContext: - readOnlyRootFilesystem: false #FIXME - runAsNonRoot: true - runAsUser: 1000 - -model: - path: /tmp - alias: microsoft-phi2 - repo: ggml-org/models # TheBloke/phi-2-GGUF - file: tinyllamas/stories260K.gguf # phi-2.Q4_K_M.gguf - size: 2Mi # 1.8Gi - sha256: 047bf46455a544931cff6fef14d7910154c56afbc23ab1c5e56a72e69912c04b # 324356668fa5ba9f4135de348447bb2bbe2467eaa1b8fcfb53719de62fbd2499 - -server: - command: /server - host: 0.0.0.0 - port: 8080 - completions: true - embeddings: false - metrics: true - kvCache: - size: 64 - slots: 2 - log: - format: text - disabled: false - extraArgs: [] - -deployments: - init - -service: - type: ClusterIP - port: 80 - -ingresses: - completions: - enabled: true - className: "" - annotations: - kubernetes.io/ingress.class: nginx - - hosts: - - #host: llama-cpp.mydomain - paths: - - path: /v1/completions - pathType: Prefix - tls: [ ] - # - secretName: chart-example-tls - # 
hosts: - # - chart-example.local - - embeddings: - enabled: true - className: "" - annotations: - kubernetes.io/ingress.class: nginx - - hosts: - - #host: llama-cpp.mydomain - paths: - - path: /v1/embeddings - pathType: Prefix - tls: [ ] - # - secretName: chart-example-tls - # hosts: - # - chart-example.local - -resources: { } - -autoscaling: - enabled: true - minReplicas: 1 - maxReplicas: 4 - targetCPUUtilizationPercentage: 80 - targetMemoryUtilizationPercentage: 80 - -volumes: [ ] - -volumeMounts: [ ] - -nodeSelector: { } - -tolerations: [ ] - -affinity: { } - -persistence: - storageClass: \ No newline at end of file diff --git a/examples/kubernetes/llama-cpp/.helmignore b/examples/kubernetes/llamacpp/.helmignore similarity index 100% rename from examples/kubernetes/llama-cpp/.helmignore rename to examples/kubernetes/llamacpp/.helmignore diff --git a/examples/kubernetes/llamacpp/Chart.lock b/examples/kubernetes/llamacpp/Chart.lock new file mode 100644 index 0000000000000..599acdcd85512 --- /dev/null +++ b/examples/kubernetes/llamacpp/Chart.lock @@ -0,0 +1,9 @@ +dependencies: +- name: model-runner + repository: file://charts/model-runner + version: 0.1.0 +- name: embedding + repository: file://charts/embedding + version: 0.1.0 +digest: sha256:91f709ba2b6a0d17e8ebfe5ee93141115d1d85ae6d1fd3cf77bc6dfaec76d69c +generated: "2024-05-12T19:06:38.283833152+08:00" diff --git a/examples/kubernetes/llamacpp/Chart.yaml b/examples/kubernetes/llamacpp/Chart.yaml new file mode 100644 index 0000000000000..0e64676a079c0 --- /dev/null +++ b/examples/kubernetes/llamacpp/Chart.yaml @@ -0,0 +1,32 @@ +apiVersion: v2 +name: llamacpp +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. 
They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.16.0" + +dependencies: + - name: modelRunner + version: 0.1.0 + repository: "file://charts/modelRunner" + - name: embedding + version: 0.1.0 + repository: "file://charts/embedding" \ No newline at end of file diff --git a/examples/kubernetes/llamacpp/charts/embedding/.helmignore b/examples/kubernetes/llamacpp/charts/embedding/.helmignore new file mode 100644 index 0000000000000..0e8a0eb36f4ca --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/embedding/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/examples/kubernetes/llamacpp/charts/embedding/Chart.yaml b/examples/kubernetes/llamacpp/charts/embedding/Chart.yaml new file mode 100644 index 0000000000000..f0c186a519321 --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/embedding/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: embedding +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "1.16.0" diff --git a/examples/kubernetes/llamacpp/charts/embedding/templates/NOTES.txt b/examples/kubernetes/llamacpp/charts/embedding/templates/NOTES.txt new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/examples/kubernetes/llamacpp/charts/embedding/templates/PersistentVolume.yaml b/examples/kubernetes/llamacpp/charts/embedding/templates/PersistentVolume.yaml new file mode 100644 index 0000000000000..385d2aeae5e1f --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/embedding/templates/PersistentVolume.yaml @@ -0,0 +1,21 @@ +{{- range $modelName, $modelConfig := .Values.models }} + + +apiVersion: v1 +kind: PersistentVolume +metadata: + name: {{ include "embedding.fullname" $ | lower }}-pv-{{ $modelName }} + labels: + {{- include "embedding.labels" $ | nindent 4 }} + +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteMany + hostPath: + path: {{ $.Values.modelPath.val }} + +--- + +{{- end}} \ No newline at end of file diff --git a/examples/kubernetes/llama-cpp/templates/_helpers.tpl b/examples/kubernetes/llamacpp/charts/embedding/templates/_helpers.tpl similarity index 72% rename from examples/kubernetes/llama-cpp/templates/_helpers.tpl rename to examples/kubernetes/llamacpp/charts/embedding/templates/_helpers.tpl index 54bc8197f1f57..7522c671a87ca 100644 --- a/examples/kubernetes/llama-cpp/templates/_helpers.tpl +++ b/examples/kubernetes/llamacpp/charts/embedding/templates/_helpers.tpl @@ -1,7 +1,7 @@ {{/* Expand the name of the chart. */}} -{{- define "server.llama.cpp.name" -}} +{{- define "embedding.name" -}} {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} {{- end }} @@ -10,7 +10,7 @@ Create a default fully qualified app name. We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). If release name contains chart name it will be used as a full name. 
*/}} -{{- define "server.llama.cpp.fullname" -}} +{{- define "embedding.fullname" -}} {{- if .Values.fullnameOverride }} {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} {{- else }} @@ -26,16 +26,16 @@ If release name contains chart name it will be used as a full name. {{/* Create chart name and version as used by the chart label. */}} -{{- define "server.llama.cpp.chart" -}} +{{- define "embedding.chart" -}} {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} {{- end }} {{/* Common labels */}} -{{- define "server.llama.cpp.labels" -}} -helm.sh/chart: {{ include "server.llama.cpp.chart" . }} -{{ include "server.llama.cpp.selectorLabels" . }} +{{- define "embedding.labels" -}} +helm.sh/chart: {{ include "embedding.chart" . }} +{{ include "embedding.selectorLabels" . }} {{- if .Chart.AppVersion }} app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} {{- end }} @@ -45,17 +45,17 @@ app.kubernetes.io/managed-by: {{ .Release.Service }} {{/* Selector labels */}} -{{- define "server.llama.cpp.selectorLabels" -}} -app.kubernetes.io/name: {{ include "server.llama.cpp.name" . }} +{{- define "embedding.selectorLabels" -}} +app.kubernetes.io/name: {{ include "embedding.name" . }} app.kubernetes.io/instance: {{ .Release.Name }} {{- end }} {{/* Create the name of the service account to use */}} -{{- define "server.llama.cpp.serviceAccountName" -}} +{{- define "embedding.serviceAccountName" -}} {{- if .Values.serviceAccount.create }} -{{- default (include "server.llama.cpp.fullname" .) .Values.serviceAccount.name }} +{{- default (include "embedding.fullname" .) 
.Values.serviceAccount.name }} {{- else }} {{- default "default" .Values.serviceAccount.name }} {{- end }} diff --git a/examples/kubernetes/llamacpp/charts/embedding/templates/configMap.yaml b/examples/kubernetes/llamacpp/charts/embedding/templates/configMap.yaml new file mode 100644 index 0000000000000..9ee1acce3e149 --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/embedding/templates/configMap.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "embedding.fullname" . | lower }}-configmap +data: + {{- range $modelName, $modelConfig := .Values.models }} + {{ $modelName }}.status: "pending" + {{- end }} diff --git a/examples/kubernetes/llamacpp/charts/embedding/templates/deployment.yaml b/examples/kubernetes/llamacpp/charts/embedding/templates/deployment.yaml new file mode 100644 index 0000000000000..0e566eef234d9 --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/embedding/templates/deployment.yaml @@ -0,0 +1,161 @@ +{{ $port := .Values.service.port }} +{{- range $modelName, $modelConfig := .Values.models }} +{{- if $modelConfig.enabled }} + +{{- if $modelConfig.download }} + +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "embedding.fullname" $ | lower }}-download-{{ $modelName }} + labels: + app: {{ include "embedding.fullname" $ | lower }} +spec: + template: + spec: + initContainers: + - name: fix-permissions + image: busybox + command: ["sh", "-c", "chmod -R 777 /models"] + volumeMounts: + - name: {{ include "embedding.fullname" $ | lower }}-pv-{{ $modelName }} + mountPath: /models + + containers: + - name: init-job + image: alpine/k8s:1.27.11 + command: + - sh + - -c + - | + set -e + if curl -L {{ $modelConfig.url }} --output /models/{{ $modelName }}/{{ $modelName }}.gguf; then + kubectl patch configmap {{ include "embedding.fullname" $ | lower }}-configmap --type merge -p '{"data": {"{{ $modelName }}.status": "completed"}}' + echo "Download succeeded" + else + echo "Download failed" + exit 1 + fi + 
volumeMounts: + - name: {{ include "embedding.fullname" $ | lower }}-pv-{{ $modelName }} + mountPath: /models/{{ $modelName }} + - name: kubeconfig + mountPath: /.kube + restartPolicy: OnFailure + volumes: + - name: {{ include "embedding.fullname" $ | lower }}-pv-{{ $modelName }} + persistentVolumeClaim: + claimName: {{ include "embedding.fullname" $ | lower }}-pvc-{{ $modelName }} + - name: kubeconfig + hostPath: + path: /home/vadmin/.kube + backoffLimit: 4 + +--- + +{{- end }} + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "embedding.fullname" $ | lower }}-{{ $modelName }} + labels: + {{- include "embedding.labels" $ | nindent 4 }} + modelnameInternal: {{ $modelName }} + +spec: + {{- if ne (int $modelConfig.replicas) 0 }} + replicas: {{ $modelConfig.replicas }} + {{- end }} + selector: + matchLabels: + {{- include "embedding.labels" $ | nindent 6 }} + modelnameInternal: {{ $modelName }} + template: + metadata: + labels: + {{- include "embedding.labels" $ | nindent 8 }} + modelnameInternal: {{ $modelName }} + spec: + securityContext: + fsGroup: 2000 + initContainers: + - name: check-download-job + image: alpine/k8s:1.27.11 + command: + - sh + - -c + - | + while true; do + STATUS=$(kubectl get configmap {{ include "embedding.fullname" $ | lower }}-configmap -o jsonpath='{.data.{{ $modelName }}\.status}') + if [ "$STATUS" == "completed" ]; then + echo "Configmap updated" + sleep 5 + exit 0 + fi + kubectl get configmap {{ include "embedding.fullname" $ | lower }}-configmap -o jsonpath='{.data.{{ $modelName }}\.status}' + echo "Waiting for configmap update..." 
+ sleep 15 + done + volumeMounts: + - name: kubeconfig + mountPath: /.kube + containers: + - name: {{ $modelName }} + image: "{{ $modelConfig.image }}" + command: + {{- if eq $modelConfig.device "cuda" }} + - /server + {{- else }} + - /llama-server + {{- end }} + - --verbose + - -m + - /models/{{ $modelName }}.gguf + - --host + - 0.0.0.0 + {{- if eq $modelConfig.device "cuda" }} + - --n-gpu-layers + - "99" + {{- end }} + + volumeMounts: + - name: {{ include "embedding.fullname" $ | lower }}-pv-{{ $modelName }} + mountPath: /models + readOnly: false + securityContext: + runAsUser: 1001 + runAsGroup: 2000 + ports: + - name: http + containerPort: {{ $port }} + protocol: TCP + tolerations: + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + {{- if eq $modelConfig.device "cuda" }} + resources: + limits: + nvidia.com/gpu: 1 # Request 1 GPU + {{- end }} + livenessProbe: + {{- toYaml $.Values.livenessProbe | nindent 12 }} + readinessProbe: + {{- toYaml $.Values.readinessProbe | nindent 12 }} + + volumes: + - name: {{ include "embedding.fullname" $ | lower }}-pv-{{ $modelName }} + persistentVolumeClaim: + claimName: {{ include "embedding.fullname" $ | lower }}-pvc-{{ $modelName }} + - name: kubeconfig + hostPath: + path: /home/vadmin/.kube + + nodeSelector: + {{- if $modelConfig.nodeType }} + {{- end }} + +--- +{{- end }} +{{- end }} diff --git a/examples/kubernetes/llamacpp/charts/embedding/templates/hpa.yaml b/examples/kubernetes/llamacpp/charts/embedding/templates/hpa.yaml new file mode 100644 index 0000000000000..6efb26c85f14f --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/embedding/templates/hpa.yaml @@ -0,0 +1,24 @@ +{{- range $modelName, $modelConfig := .Values.models }} +{{ if $modelConfig.autoScale.enabled }} +apiVersion: autoscaling/v2beta2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "embedding.fullname" $ | lower }}-{{ $modelName }}-hpa + labels: + app: {{ include "embedding.name" . 
}}
+    chart: {{ include "embedding.chart" $ }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "embedding.fullname" $ | lower }}-{{ $modelName }}
+  minReplicas: {{ $modelConfig.autoScale.minReplicas }}
+  maxReplicas: {{ $modelConfig.autoScale.maxReplicas }}
+  metrics:
+  - type: Resource
+    resource:
+      name: cpu
+      targetAverageUtilization: {{ $modelConfig.autoScale.targetCPUUtilizationPercentage }}
+---
+{{- end }}
+{{- end }}
diff --git a/examples/kubernetes/llamacpp/charts/embedding/templates/persistentvolumeclaim.yaml b/examples/kubernetes/llamacpp/charts/embedding/templates/persistentvolumeclaim.yaml
new file mode 100644
index 0000000000000..783b737ee4aa5
--- /dev/null
+++ b/examples/kubernetes/llamacpp/charts/embedding/templates/persistentvolumeclaim.yaml
@@ -0,0 +1,18 @@
+{{- range $modelName, $modelConfig := .Values.models }}
+
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: {{ include "embedding.fullname" $ | lower }}-pvc-{{ $modelName }}
+  labels:
+    {{- include "embedding.labels" $ | nindent 4 }}
+    modelnameInternal: {{ $modelName }}
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 10Gi
+---
+
+{{- end}}
diff --git a/examples/kubernetes/llamacpp/charts/embedding/templates/service.yaml b/examples/kubernetes/llamacpp/charts/embedding/templates/service.yaml
new file mode 100644
index 0000000000000..4c5fe8558c322
--- /dev/null
+++ b/examples/kubernetes/llamacpp/charts/embedding/templates/service.yaml
@@ -0,0 +1,22 @@
+{{- range $modelName, $modelConfig := .Values.models }}
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "embedding.fullname" $ | lower }}-{{ $modelName }}-svc
+  labels:
+    {{- include "embedding.labels" $ | nindent 4 }}
+    modelnameInternal: {{ $modelName }}
+spec:
+  type: {{ $.Values.service.type }}
+  ports:
+    - port: {{ $.Values.service.port }}
+      targetPort: http
+      protocol: TCP
+      name: http
+  selector:
+    {{- include "embedding.selectorLabels" $ | nindent 4 }}
+
modelnameInternal: {{ $modelName }} + +--- +{{- end }} \ No newline at end of file diff --git a/examples/kubernetes/llamacpp/charts/embedding/templates/tests/test-connection.yaml b/examples/kubernetes/llamacpp/charts/embedding/templates/tests/test-connection.yaml new file mode 100644 index 0000000000000..21edb7b569748 --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/embedding/templates/tests/test-connection.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "embedding.fullname" . }}-test-connection" + labels: + {{- include "embedding.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test +spec: + containers: + - name: wget + image: busybox + command: ['wget'] + args: ['{{ include "embedding.fullname" . }}:{{ .Values.service.port }}'] + restartPolicy: Never diff --git a/examples/kubernetes/llamacpp/charts/embedding/values.yaml b/examples/kubernetes/llamacpp/charts/embedding/values.yaml new file mode 100644 index 0000000000000..fb92b9db27d05 --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/embedding/values.yaml @@ -0,0 +1,16 @@ + +livenessProbe: + httpGet: + path: / + port: http +readinessProbe: + httpGet: + path: / + port: http + + +models: { + +} + + diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/.helmignore b/examples/kubernetes/llamacpp/charts/modelRunner/.helmignore new file mode 100644 index 0000000000000..0e8a0eb36f4ca --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/modelRunner/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/Chart.yaml b/examples/kubernetes/llamacpp/charts/modelRunner/Chart.yaml new file mode 100644 index 0000000000000..6d3f6f44a0270 --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/modelRunner/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: modelRunner +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "1.16.0" diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/templates/NOTES.txt b/examples/kubernetes/llamacpp/charts/modelRunner/templates/NOTES.txt new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/templates/PersistentVolume.yaml b/examples/kubernetes/llamacpp/charts/modelRunner/templates/PersistentVolume.yaml new file mode 100644 index 0000000000000..545dd74a30cfb --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/modelRunner/templates/PersistentVolume.yaml @@ -0,0 +1,21 @@ +{{- range $modelName, $modelConfig := .Values.models }} + + +apiVersion: v1 +kind: PersistentVolume +metadata: + name: {{ include "modelRunner.fullname" $ | lower }}-pv-{{ $modelName }} + labels: + {{- include "modelRunner.labels" $ | nindent 4 }} + +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteMany + hostPath: + path: {{ $.Values.modelPath.val }}/{{ $modelName }} + +--- + +{{- end}} \ No newline at end of file diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/templates/_helpers.tpl b/examples/kubernetes/llamacpp/charts/modelRunner/templates/_helpers.tpl new file mode 100644 index 0000000000000..ec5864594b398 --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/modelRunner/templates/_helpers.tpl @@ -0,0 +1,64 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "modelRunner.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. 
+*/}} +{{- define "modelRunner.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "modelRunner.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "modelRunner.labels" -}} +helm.sh/chart: {{ include "modelRunner.chart" . }} +{{ include "modelRunner.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "modelRunner.selectorLabels" -}} +app.kubernetes.io/name: {{ include "modelRunner.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "modelRunner.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "modelRunner.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} + + diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/templates/configMap.yaml b/examples/kubernetes/llamacpp/charts/modelRunner/templates/configMap.yaml new file mode 100644 index 0000000000000..dec3586f85cce --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/modelRunner/templates/configMap.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "modelRunner.fullname" . 
| lower }}-configmap +data: + {{- range $modelName, $modelConfig := .Values.models }} + {{ $modelName }}.status: "pending" + {{- end }} diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/templates/deployment.yaml b/examples/kubernetes/llamacpp/charts/modelRunner/templates/deployment.yaml new file mode 100644 index 0000000000000..d9e4aa67e0ed8 --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/modelRunner/templates/deployment.yaml @@ -0,0 +1,161 @@ +{{ $port := .Values.service.port }} +{{- range $modelName, $modelConfig := .Values.models }} +{{- if $modelConfig.enabled }} + +{{- if $modelConfig.download }} + +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "modelRunner.fullname" $ | lower }}-download-{{ $modelName }} + labels: + app: {{ include "modelRunner.fullname" $ | lower }} +spec: + template: + spec: + initContainers: + - name: fix-permissions + image: busybox + command: ["sh", "-c", "chmod -R 777 /models"] + volumeMounts: + - name: {{ include "modelRunner.fullname" $ | lower }}-pv-{{ $modelName }} + mountPath: /models + + containers: + - name: init-job + image: alpine/k8s:1.27.11 + command: + - sh + - -c + - | + set -e + if curl -L {{ $modelConfig.url }} --output /models/{{ $modelName }}/{{ $modelName }}.gguf; then + kubectl patch configmap {{ include "modelRunner.fullname" $ | lower }}-configmap --type merge -p '{"data": {"{{ $modelName }}.status": "completed"}}' + echo "Download succeeded" + else + echo "Download failed" + exit 1 + fi + volumeMounts: + - name: {{ include "modelRunner.fullname" $ | lower }}-pv-{{ $modelName }} + mountPath: /models/{{ $modelName }} + - name: kubeconfig + mountPath: /.kube + restartPolicy: OnFailure + volumes: + - name: {{ include "modelRunner.fullname" $ | lower }}-pv-{{ $modelName }} + persistentVolumeClaim: + claimName: {{ include "modelRunner.fullname" $ | lower }}-pvc-{{ $modelName }} + - name: kubeconfig + hostPath: + path: /home/vadmin/.kube + backoffLimit: 4 + +--- + +{{- end }} + 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "modelRunner.fullname" $ | lower }}-{{ $modelName }} + labels: + {{- include "modelRunner.labels" $ | nindent 4 }} + modelnameInternal: {{ $modelName }} + +spec: + {{- if ne (int $modelConfig.replicas) 0 }} + replicas: {{ $modelConfig.replicas }} + {{- end }} + selector: + matchLabels: + {{- include "modelRunner.labels" $ | nindent 6 }} + modelnameInternal: {{ $modelName }} + template: + metadata: + labels: + {{- include "modelRunner.labels" $ | nindent 8 }} + modelnameInternal: {{ $modelName }} + spec: + securityContext: + fsGroup: 2000 + initContainers: + - name: check-download-job + image: alpine/k8s:1.27.11 + command: + - sh + - -c + - | + while true; do + STATUS=$(kubectl get configmap {{ include "modelRunner.fullname" $ | lower }}-configmap -o jsonpath='{.data.{{ $modelName }}\.status}') + if [ "$STATUS" == "completed" ]; then + echo "Configmap updated" + sleep 5 + exit 0 + fi + kubectl get configmap {{ include "modelRunner.fullname" $ | lower }}-configmap -o jsonpath='{.data.{{ $modelName }}\.status}' + echo "Waiting for configmap update..." 
+ sleep 15 + done + volumeMounts: + - name: kubeconfig + mountPath: /.kube + containers: + - name: {{ $modelName }} + image: "{{ $modelConfig.image }}" + command: + {{- if eq $modelConfig.device "cuda" }} + - /server + {{- else }} + - /llama-server + {{- end }} + - --verbose + - -m + - /models/{{ $modelName }}.gguf + - --host + - 0.0.0.0 + {{- if eq $modelConfig.device "cuda" }} + - --n-gpu-layers + - "99" + {{- end }} + + volumeMounts: + - name: {{ include "modelRunner.fullname" $ | lower }}-pv-{{ $modelName }} + mountPath: /models + readOnly: false + securityContext: + runAsUser: 1001 + runAsGroup: 2000 + ports: + - name: http + containerPort: {{ $port }} + protocol: TCP + tolerations: + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + {{- if eq $modelConfig.device "cuda" }} + resources: + limits: + nvidia.com/gpu: 1 # Request 1 GPU + {{- end }} + livenessProbe: + {{- toYaml $.Values.livenessProbe | nindent 12 }} + readinessProbe: + {{- toYaml $.Values.readinessProbe | nindent 12 }} + + volumes: + - name: {{ include "modelRunner.fullname" $ | lower }}-pv-{{ $modelName }} + persistentVolumeClaim: + claimName: {{ include "modelRunner.fullname" $ | lower }}-pvc-{{ $modelName }} + - name: kubeconfig + hostPath: + path: /home/vadmin/.kube + + nodeSelector: + {{- if $modelConfig.nodeType }} + {{- end }} + +--- +{{- end }} +{{- end }} diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/templates/hpa.yaml b/examples/kubernetes/llamacpp/charts/modelRunner/templates/hpa.yaml new file mode 100644 index 0000000000000..10c75b5d43db6 --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/modelRunner/templates/hpa.yaml @@ -0,0 +1,24 @@ +{{- range $modelName, $modelConfig := .Values.models }} +{{ if $modelConfig.autoScale.enabled }} +apiVersion: autoscaling/v2beta2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "modelRunner.fullname" $ | lower }}-{{ $modelName }}-hpa + labels: + app: {{ include "modelRunner.name" . 
}}
+    chart: {{ include "modelRunner.chart" $ }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "modelRunner.fullname" $ | lower }}-{{ $modelName }}
+  minReplicas: {{ $modelConfig.autoScale.minReplicas }}
+  maxReplicas: {{ $modelConfig.autoScale.maxReplicas }}
+  metrics:
+  - type: Resource
+    resource:
+      name: cpu
+      targetAverageUtilization: {{ $modelConfig.autoScale.targetCPUUtilizationPercentage }}
+---
+{{- end }}
+{{- end }}
diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/templates/persistentvolumeclaim.yaml b/examples/kubernetes/llamacpp/charts/modelRunner/templates/persistentvolumeclaim.yaml
new file mode 100644
index 0000000000000..a770a75b9ca78
--- /dev/null
+++ b/examples/kubernetes/llamacpp/charts/modelRunner/templates/persistentvolumeclaim.yaml
@@ -0,0 +1,18 @@
+{{- range $modelName, $modelConfig := .Values.models }}
+
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: {{ include "modelRunner.fullname" $ | lower }}-pvc-{{ $modelName }}
+  labels:
+    {{- include "modelRunner.labels" $ | nindent 4 }}
+    modelnameInternal: {{ $modelName }}
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 10Gi
+---
+
+{{- end}}
diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/templates/service.yaml b/examples/kubernetes/llamacpp/charts/modelRunner/templates/service.yaml
new file mode 100644
index 0000000000000..dd00212917420
--- /dev/null
+++ b/examples/kubernetes/llamacpp/charts/modelRunner/templates/service.yaml
@@ -0,0 +1,22 @@
+{{- range $modelName, $modelConfig := .Values.models }}
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "modelRunner.fullname" $ | lower }}-{{ $modelName }}-svc
+  labels:
+    {{- include "modelRunner.labels" $ | nindent 4 }}
+    modelnameInternal: {{ $modelName }}
+spec:
+  type: {{ $.Values.service.type }}
+  ports:
+    - port: {{ $.Values.service.port }}
+      targetPort: http
+      protocol: TCP
+      name: http
+  selector:
+    {{- include
"modelRunner.selectorLabels" $ | nindent 4 }} + modelnameInternal: {{ $modelName }} + +--- +{{- end }} \ No newline at end of file diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/templates/sidecar.yaml b/examples/kubernetes/llamacpp/charts/modelRunner/templates/sidecar.yaml new file mode 100644 index 0000000000000..cb2f025976de8 --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/modelRunner/templates/sidecar.yaml @@ -0,0 +1,29 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "modelRunner.fullname" $ | lower }}-sidecar + labels: + {{- include "modelRunner.labels" $ | nindent 4 }} + modelnameInternal: sidecar + +spec: + selector: + matchLabels: + {{- include "modelRunner.labels" $ | nindent 6 }} + template: + metadata: + labels: + {{- include "modelRunner.labels" $ | nindent 8 }} + modelnameInternal: sidecar + + spec: + securityContext: + fsGroup: 2000 + containers: + - name: curl-sidecar + image: curlimages/curl:7.79.1 + command: ["/bin/sh", "-c", "tail -f /dev/null"] + + +--- + diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/values.yaml b/examples/kubernetes/llamacpp/charts/modelRunner/values.yaml new file mode 100644 index 0000000000000..03c296ef0c677 --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/modelRunner/values.yaml @@ -0,0 +1,17 @@ + + +livenessProbe: + httpGet: + path: / + port: http +readinessProbe: + httpGet: + path: / + port: http + + +models: { + +} + + diff --git a/examples/kubernetes/llamacpp/templates/NOTES.txt b/examples/kubernetes/llamacpp/templates/NOTES.txt new file mode 100644 index 0000000000000..8b137891791fe --- /dev/null +++ b/examples/kubernetes/llamacpp/templates/NOTES.txt @@ -0,0 +1 @@ + diff --git a/examples/kubernetes/llamacpp/templates/_helpers.tpl b/examples/kubernetes/llamacpp/templates/_helpers.tpl new file mode 100644 index 0000000000000..9cf785f8a7dbd --- /dev/null +++ b/examples/kubernetes/llamacpp/templates/_helpers.tpl @@ -0,0 +1,66 @@ +{{/* +Expand the name of the 
chart. +*/}} +{{- define "llamacpp.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "llamacpp.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "llamacpp.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "llamacpp.labels" -}} +helm.sh/chart: {{ include "llamacpp.chart" . }} +{{ include "llamacpp.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "llamacpp.selectorLabels" -}} +app.kubernetes.io/name: {{ include "llamacpp.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + + +{{/* +Create the name of the service account to use +*/}} +{{- define "llamacpp.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "llamacpp.fullname" .) 
.Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} + + + diff --git a/examples/kubernetes/llamacpp/templates/ingress.yaml b/examples/kubernetes/llamacpp/templates/ingress.yaml new file mode 100644 index 0000000000000..75d42a2f4fcef --- /dev/null +++ b/examples/kubernetes/llamacpp/templates/ingress.yaml @@ -0,0 +1,85 @@ +{{- if .Values.ingress.enabled -}} +{{- $fullName := include "llamacpp.fullname" . -}} +{{- $svcPort := .Values.modelRunner.service.port -}} +{{- $modelRunner := .Values.modelRunner -}} +{{- $modelRunnerFullname := $modelRunner.fullname -}} +{{- $embedding := .Values.embedding -}} +{{- $embeddingFullname := $embedding.fullname -}} +{{- $embeddingSvcPort := .Values.embedding.service.port -}} + +{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} + {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }} + {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} + {{- end }} +{{- end }} +{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1 +{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1beta1 +{{- else -}} +apiVersion: extensions/v1beta1 +{{- end }} +kind: Ingress +metadata: + name: {{ $fullName }} + labels: + {{- include "llamacpp.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . 
| quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + {{ $pathtype := .pathtype }} + - host: {{ .host | quote }} + http: + paths: + {{- range $modelName, $modelConfig := $modelRunner.models }} + - path: {{ $modelConfig.endpoint }}(/|$)(.*) + {{- if and $pathtype (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} + pathType: {{ $pathtype }} + {{- end }} + backend: + {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} + service: + name: {{ $.Release.Name }}-{{ $modelRunnerFullname }}-{{ $modelName }}-svc + port: + number: {{ $svcPort }} + {{- else }} + serviceName: {{ $.Release.Name }}-{{ $modelRunnerFullname }}-{{ $modelName }}-svc + servicePort: {{ $svcPort }} + {{- end }} + {{- end }} + {{- range $modelName, $modelConfig := $embedding.models }} + - path: {{ $modelConfig.endpoint }}(/|$)(.*) + {{- if and $pathtype (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} + pathType: {{ $pathtype }} + {{- end }} + backend: + {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} + service: + name: {{ $.Release.Name }}-{{ $embeddingFullname }}-{{ $modelName }}-svc + port: + number: {{ $embeddingSvcPort }} + {{- else }} + serviceName: {{ $.Release.Name }}-{{ $embeddingFullname }}-{{ $modelName }}-svc + servicePort: {{ $embeddingSvcPort }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/examples/kubernetes/llamacpp/values.yaml b/examples/kubernetes/llamacpp/values.yaml new file mode 100644 index 0000000000000..afb693d5735e5 --- /dev/null +++ b/examples/kubernetes/llamacpp/values.yaml @@ -0,0 +1,108 @@ +# Default values for llamacpp. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates.
+ + + +podAnnotations: {} +podLabels: {} + +podSecurityContext: {} + +securityContext: {} + +ingress: + enabled: true + className: "" + annotations: { + nginx.ingress.kubernetes.io/rewrite-target: /$2, + nginx.ingress.kubernetes.io/use-regex: "true" + } + hosts: + - host: demo.local + pathtype: ImplementationSpecific + + tls: [] + + + + +livenessProbe: + httpGet: + path: / + port: http + +readinessProbe: + httpGet: + path: / + port: http + + + + +modelRunner: + fullname: "modelrunner" + service: + type: ClusterIP + port: 8080 + modelPath: + val: + models: { + "model1":{ + "enabled": true, + "download": true, + "replicas": 3, + "device": "cpu", + "autoScale": { + "enabled": false, + "minReplicas": 1, + "maxReplicas": 100, + "targetCPUUtilizationPercentage": 80 + }, + "url": "https://huggingface.co/TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF/resolve/main/capybarahermes-2.5-mistral-7b.Q4_0.gguf", + "image": "ghcr.io/ggerganov/llama.cpp:server", + "endpoint": "/model1" + }, + "model2": { + "enabled": true, + "replicas": 1, + "download": true, + "device": "cuda", + "autoScale": { + "enabled": false, + "minReplicas": 1, + "maxReplicas": 100, + "targetCPUUtilizationPercentage": 80 + }, + "url": "https://huggingface.co/TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF/resolve/main/capybarahermes-2.5-mistral-7b.Q4_0.gguf", + "image": "ghcr.io/ggerganov/llama.cpp:server-cuda", + "endpoint": "/model2" + }, + } + + +embedding: + fullname: "embedding" + service: + type: ClusterIP + port: 8080 + modelPath: + val: /models + models: { + "emod": { + "enabled": true, + "replicas": 1, + "download": true, + "device": "cpu", + "autoScale": { + "enabled": false, + "minReplicas": 1, + "maxReplicas": 100, + "targetCPUUtilizationPercentage": 80 + }, + "url": "https://huggingface.co/TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF/resolve/main/capybarahermes-2.5-mistral-7b.Q4_0.gguf", + "image": "ghcr.io/ggerganov/llama.cpp:server", + "endpoint": "/e" + }, + } + From 
0579fbee8086dd3f5060049b79902b0cb7090081 Mon Sep 17 00:00:00 2001 From: Shobhit Date: Sun, 21 Jul 2024 19:57:02 +0800 Subject: [PATCH 2/2] Updated readme with feature set --- examples/kubernetes/README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/examples/kubernetes/README.md b/examples/kubernetes/README.md index 1a9b10fd5ce76..0fd448275b9e4 100644 --- a/examples/kubernetes/README.md +++ b/examples/kubernetes/README.md @@ -68,6 +68,18 @@ Adjust the model path to a local directory that stores the models. The models ar You can also adjust the number of replicas, the device, the image, the endpoint, and the autoscaling parameters. +Ensure that the ingress is enabled on your cluster. You can use the following command to enable the ingress: + +```shell +microk8s enable ingress +``` + +And add the hostname to `/etc/hosts`: + +```shell +127.0.0.1 demo.local +``` + ### Metrics monitoring @@ -93,6 +105,7 @@ helm install \ - [x] Auto scaling - [x] CUDA support - [x] Downloading functionality +- [ ] Redownload on upgrade hook. (Currently the models are downloaded only on the first deployment, there is no redownload functionality on upgrade if required) ## Pending testing