From 8029c4099f2cfa43d0a17f5dc22acdfdee34a580 Mon Sep 17 00:00:00 2001 From: Shobhit Date: Sun, 21 Jul 2024 19:52:57 +0800 Subject: [PATCH 1/2] Added demo chart. Version is functional on single GPU system pending testing. --- examples/kubernetes/README.md | 91 +++++++--- examples/kubernetes/llama-cpp/Chart.yaml | 6 - .../kubernetes/llama-cpp/templates/NOTES.txt | 28 --- .../llama-cpp/templates/deployment.yaml | 102 ----------- .../kubernetes/llama-cpp/templates/hpa.yaml | 32 ---- .../templates/ingress-completions.yaml | 64 ------- .../templates/ingress-embeddings.yaml | 64 ------- .../kubernetes/llama-cpp/templates/jobs.yaml | 66 ------- .../llama-cpp/templates/pod-monitor.yaml | 16 -- .../kubernetes/llama-cpp/templates/pvc.yaml | 17 -- .../llama-cpp/templates/service.yaml | 15 -- .../templates/tests/test-connection.yaml | 15 -- examples/kubernetes/llama-cpp/values.yaml | 121 ------------- .../{llama-cpp => llamacpp}/.helmignore | 0 examples/kubernetes/llamacpp/Chart.lock | 9 + examples/kubernetes/llamacpp/Chart.yaml | 32 ++++ .../llamacpp/charts/embedding/.helmignore | 23 +++ .../llamacpp/charts/embedding/Chart.yaml | 24 +++ .../charts/embedding/templates/NOTES.txt | 0 .../embedding/templates/PersistentVolume.yaml | 21 +++ .../charts/embedding}/templates/_helpers.tpl | 20 +-- .../charts/embedding/templates/configMap.yaml | 8 + .../embedding/templates/deployment.yaml | 161 ++++++++++++++++++ .../charts/embedding/templates/hpa.yaml | 24 +++ .../templates/persistentvolumeclaim.yaml | 18 ++ .../charts/embedding/templates/service.yaml | 22 +++ .../templates/tests/test-connection.yaml | 15 ++ .../llamacpp/charts/embedding/values.yaml | 16 ++ .../llamacpp/charts/modelRunner/.helmignore | 23 +++ .../llamacpp/charts/modelRunner/Chart.yaml | 24 +++ .../charts/modelRunner/templates/NOTES.txt | 0 .../templates/PersistentVolume.yaml | 21 +++ .../charts/modelRunner/templates/_helpers.tpl | 64 +++++++ .../modelRunner/templates/configMap.yaml | 8 + 
.../modelRunner/templates/deployment.yaml | 161 ++++++++++++++++++ .../charts/modelRunner/templates/hpa.yaml | 24 +++ .../templates/persistentvolumeclaim.yaml | 18 ++ .../charts/modelRunner/templates/service.yaml | 22 +++ .../charts/modelRunner/templates/sidecar.yaml | 29 ++++ .../llamacpp/charts/modelRunner/values.yaml | 17 ++ .../kubernetes/llamacpp/templates/NOTES.txt | 1 + .../llamacpp/templates/_helpers.tpl | 66 +++++++ .../llamacpp/templates/ingress.yaml | 85 +++++++++ examples/kubernetes/llamacpp/values.yaml | 108 ++++++++++++ 44 files changed, 1118 insertions(+), 583 deletions(-) delete mode 100644 examples/kubernetes/llama-cpp/Chart.yaml delete mode 100644 examples/kubernetes/llama-cpp/templates/NOTES.txt delete mode 100644 examples/kubernetes/llama-cpp/templates/deployment.yaml delete mode 100644 examples/kubernetes/llama-cpp/templates/hpa.yaml delete mode 100644 examples/kubernetes/llama-cpp/templates/ingress-completions.yaml delete mode 100644 examples/kubernetes/llama-cpp/templates/ingress-embeddings.yaml delete mode 100644 examples/kubernetes/llama-cpp/templates/jobs.yaml delete mode 100644 examples/kubernetes/llama-cpp/templates/pod-monitor.yaml delete mode 100644 examples/kubernetes/llama-cpp/templates/pvc.yaml delete mode 100644 examples/kubernetes/llama-cpp/templates/service.yaml delete mode 100644 examples/kubernetes/llama-cpp/templates/tests/test-connection.yaml delete mode 100644 examples/kubernetes/llama-cpp/values.yaml rename examples/kubernetes/{llama-cpp => llamacpp}/.helmignore (100%) create mode 100644 examples/kubernetes/llamacpp/Chart.lock create mode 100644 examples/kubernetes/llamacpp/Chart.yaml create mode 100644 examples/kubernetes/llamacpp/charts/embedding/.helmignore create mode 100644 examples/kubernetes/llamacpp/charts/embedding/Chart.yaml create mode 100644 examples/kubernetes/llamacpp/charts/embedding/templates/NOTES.txt create mode 100644 examples/kubernetes/llamacpp/charts/embedding/templates/PersistentVolume.yaml rename 
examples/kubernetes/{llama-cpp => llamacpp/charts/embedding}/templates/_helpers.tpl (72%) create mode 100644 examples/kubernetes/llamacpp/charts/embedding/templates/configMap.yaml create mode 100644 examples/kubernetes/llamacpp/charts/embedding/templates/deployment.yaml create mode 100644 examples/kubernetes/llamacpp/charts/embedding/templates/hpa.yaml create mode 100644 examples/kubernetes/llamacpp/charts/embedding/templates/persistentvolumeclaim.yaml create mode 100644 examples/kubernetes/llamacpp/charts/embedding/templates/service.yaml create mode 100644 examples/kubernetes/llamacpp/charts/embedding/templates/tests/test-connection.yaml create mode 100644 examples/kubernetes/llamacpp/charts/embedding/values.yaml create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/.helmignore create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/Chart.yaml create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/templates/NOTES.txt create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/templates/PersistentVolume.yaml create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/templates/_helpers.tpl create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/templates/configMap.yaml create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/templates/deployment.yaml create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/templates/hpa.yaml create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/templates/persistentvolumeclaim.yaml create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/templates/service.yaml create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/templates/sidecar.yaml create mode 100644 examples/kubernetes/llamacpp/charts/modelRunner/values.yaml create mode 100644 examples/kubernetes/llamacpp/templates/NOTES.txt create mode 100644 examples/kubernetes/llamacpp/templates/_helpers.tpl create mode 100644 
examples/kubernetes/llamacpp/templates/ingress.yaml create mode 100644 examples/kubernetes/llamacpp/values.yaml diff --git a/examples/kubernetes/README.md b/examples/kubernetes/README.md index 5a0806977ae77..1a9b10fd5ce76 100644 --- a/examples/kubernetes/README.md +++ b/examples/kubernetes/README.md @@ -1,5 +1,21 @@ # llama.cpp/example/kubernetes + +## Setup kubernetes + +You can use microk8s to setup a kubernetes cluster on your local machine. + +Once downloaded enable the following addons for the cluster: + +```shell +microk8s enable dns storage registry helm3 gpu +``` + +You can also set up your system to use the microk8s kubectl [here](https://microk8s.io/docs/working-with-kubectl). + + +## Usage + This example demonstrates how to deploy [llama.cpp server](../server) on a [kubernetes cluster](https://kubernetes.io). ![llama.cpp.kubernetes.png](llama.cpp.kubernetes.png) @@ -10,19 +26,48 @@ We provide an [Helm chart](https://helm.sh/) repository to deploy llama.cpp at helm repo add llama.cpp https://ggerganov.github.io/llama.cpp helm repo update -helm install example llama-cpp --namespace llama-cpp --create-namespace +helm install example llamacpp --namespace llama-cpp --create-namespace ``` -## Prerequisites +This chart features 2 subcharts that can be deployed independently: +1. modelRunner: Responsible for completion +2. 
embedding: Responsible for embeddings + +In order to set the various parameters for the deployment, you can use the `values.yaml` file: + +```yaml + +modelRunner: + fullname: "modelrunner" + service: + type: ClusterIP + port: 8080 + modelPath: + val: + models: { + "model1":{ + "enabled": true, + "download": true, + "replicas": 3, + "device": "cpu", + "autoScale": { + "enabled": false, + "minReplicas": 1, + "maxReplicas": 100, + "targetCPUUtilizationPercentage": 80 + }, + "url": "https://huggingface.co/TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF/resolve/main/capybarahermes-2.5-mistral-7b.Q4_0.gguf", + "image": "ghcr.io/ggerganov/llama.cpp:server", + "endpoint": "/model1" + } + } -Obviously you need a kubernetes cluster. +``` -Required access to an API server with the following `roles`: +Adjust the model path to a local directory that stores the models. The models are downloaded from the provided URL and stored in the local directory. The models are then mounted to the pod. -- verbs: `["get", "list", "watch", "create", "update", "patch", "delete"]` -- resources: `["pods", "deployments", "services", "pvc", "jobs", "ingresses]` +You can also adjust the number of replicas, the device, the image, the endpoint, and the autoscaling parameters. -If you do not have a real k8s cluster, you can give a try to [kind](https://kind.sigs.k8s.io/). 
### Metrics monitoring @@ -38,29 +83,21 @@ helm install \ --namespace monitoring ``` -## Goals -Deploy a production ready LLM API over kubernetes, including: -- High availability -- multi models -- support of embeddings and completions models -- load balancing -- Auto scaling -- Security +## Feature set for the Helm chart + +- [x] High availability +- [x] Multi models +- [x] Support of embeddings and completions models +- [ ] Load balancing +- [x] Auto scaling +- [x] CUDA support +- [x] Downloading functionality -### Limitations -This example does not cover [NVidia based docker engine](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html), the target architecture remains the same, just switch to [cuda based images](../../.devops/server-cuda.Dockerfile). +## Pending testing -## Proposed architectures +- [ ] Load balancing +- [ ] multi GPU support using MiG for kubernetes [docs](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html) & [microk8s](https://microk8s.io/docs/addon-gpu) -**Constraints:** -- llama.cpp server is mono model -- GGUF models files are heavy (even quantized) -**Approach** -1. Models file are downloaded once on a `PV` by a `Job` when the stack is deployed -2. Server `Deployment` is using an init containers to verify if the model is downloaded -3. `Ingress` rules are routing incoming request to the target models -3. `Probes` are used to monitor the `pods` healthiness -4. 
[Prometheus](https://prometheus.io/) is used as the metrics server diff --git a/examples/kubernetes/llama-cpp/Chart.yaml b/examples/kubernetes/llama-cpp/Chart.yaml deleted file mode 100644 index 02cce93ef28c3..0000000000000 --- a/examples/kubernetes/llama-cpp/Chart.yaml +++ /dev/null @@ -1,6 +0,0 @@ -apiVersion: v2 -name: llama-cpp -description: llama.cpp Helm chart for Kubernetes -type: application -version: 0.0.1 -appVersion: "77d1ac7e00bf049b9f2bba1b5a310a78318c49c4" diff --git a/examples/kubernetes/llama-cpp/templates/NOTES.txt b/examples/kubernetes/llama-cpp/templates/NOTES.txt deleted file mode 100644 index 44d5a115a3d42..0000000000000 --- a/examples/kubernetes/llama-cpp/templates/NOTES.txt +++ /dev/null @@ -1,28 +0,0 @@ -1. Get the application URL by running these commands: -{{- if .Values.ingresses.completions.enabled }} -{{- range $host := .Values.ingresses.completions.hosts }} - {{- range .paths }} - http{{ if $.Values.ingresses.completions.tls }}s{{ end }}://{{ if .host }}{{ .host }}{{else}}localhost{{ end }}{{ .path }} --data '{"messages": [{"role": "user", "message":"hello llama.cpp"}]}' - {{- end }} -{{- end }} -{{- else if .Values.ingresses.embeddings.enabled }} -{{- range $host := .Values.ingresses.embeddings.hosts }} - {{- range .paths }} - curl http{{ if $.Values.ingresses.embeddings.tls }}s{{ end }}://{{ $host.host }}{{ .path }} --data '{"input": "hello llama.cpp"}' -a {{- end }} -{{- end }} -{{- else if contains "NodePort" .Values.service.type }} - export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "server.llama.cpp.fullname" . }}) - export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") - echo http://$NODE_IP:$NODE_PORT -{{- else if contains "LoadBalancer" .Values.service.type }} - NOTE: It may take a few minutes for the LoadBalancer IP to be available. 
- You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "server.llama.cpp.fullname" . }}' - export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "server.llama.cpp.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") - echo http://$SERVICE_IP:{{ .Values.service.port }} -{{- else if contains "ClusterIP" .Values.service.type }} - export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "server.llama.cpp.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") - export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") - echo "Visit http://127.0.0.1:8080 to use your application" - kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT -{{- end }} diff --git a/examples/kubernetes/llama-cpp/templates/deployment.yaml b/examples/kubernetes/llama-cpp/templates/deployment.yaml deleted file mode 100644 index 223b5dd8a7d68..0000000000000 --- a/examples/kubernetes/llama-cpp/templates/deployment.yaml +++ /dev/null @@ -1,102 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "server.llama.cpp.fullname" . }} - labels: - {{- include "server.llama.cpp.labels" . | nindent 4 }} -spec: - {{- if not .Values.autoscaling.enabled }} - replicas: {{ .Values.replicaCount }} - {{- end }} - selector: - matchLabels: - {{- include "server.llama.cpp.selectorLabels" . | nindent 6 }} - template: - metadata: - annotations: - {{- include "server.llama.cpp.labels" . | nindent 8 }} - {{- if .Values.server.metrics }} - prometheus.io/scrape: 'true' - prometheus.io/port: '{{ .Values.server.port }}' - {{- end }} - {{- with .Values.podAnnotations }} - {{- toYaml . 
| nindent 8 }} - {{- end }} - labels: - prometheus.io/scrape: 'true' - {{- include "server.llama.cpp.labels" . | nindent 8 }} - {{- with .Values.podLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - spec: - {{- with .Values.imagePullSecrets }} - imagePullSecrets: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} - containers: - - name: {{ .Chart.Name }} - securityContext: - {{- toYaml .Values.securityContext | nindent 12 }} - image: "{{ .Values.images.server.repository }}:{{ .Values.images.server.name }}-{{ .Values.images.server.tag | default .Chart.AppVersion }}" - imagePullPolicy: {{ .Values.images.pullPolicy }} - resources: - {{- toYaml .Values.resources | nindent 12 }} - command: - - {{ .Values.server.command }} - args: - - --host - - {{ .Values.server.host }} - - --port - - "{{ .Values.server.port }}" - - --model - - {{ .Values.model.path }}/{{ regexReplaceAll "(.*/)?([^/]+).gguf" .Values.model.file "${2}.gguf" }} - - --cont-batching - - --alias - - {{ .Values.model.alias }} - - --ctx-size - - "{{ .Values.server.kvCache.size }}" - - --parallel - - "{{ .Values.server.slots }}" - {{- if .Values.server.embeddings }} - - --embedding - {{- end }} - {{- if .Values.server.metrics }} - - --metrics - {{- end }} - - --log-format - - {{ .Values.server.log.format }} - {{- if .Values.server.log.disabled }} - - --log-disable - {{- end }} - {{- with .Values.server.extraArgs }} - {{- toYaml . | nindent 12 }} - {{- end }} - ports: - - name: http - containerPort: {{ .Values.server.port }} - protocol: TCP - {{- with .Values.volumeMounts }} - volumeMounts: - {{- toYaml . 
| nindent 12 }} - {{- end }} - volumeMounts: - - mountPath: {{ .Values.model.path }} - name: models - readOnly: true - volumes: - - name: models - persistentVolumeClaim: - claimName: {{ include "server.llama.cpp.fullname" . }} - readOnly: true diff --git a/examples/kubernetes/llama-cpp/templates/hpa.yaml b/examples/kubernetes/llama-cpp/templates/hpa.yaml deleted file mode 100644 index ad8841bac27ce..0000000000000 --- a/examples/kubernetes/llama-cpp/templates/hpa.yaml +++ /dev/null @@ -1,32 +0,0 @@ -{{- if .Values.autoscaling.enabled }} -apiVersion: autoscaling/v2 -kind: HorizontalPodAutoscaler -metadata: - name: {{ include "server.llama.cpp.fullname" . }} - labels: - {{- include "server.llama.cpp.labels" . | nindent 4 }} -spec: - scaleTargetRef: - apiVersion: apps/v1 - kind: Deployment - name: {{ include "server.llama.cpp.fullname" . }} - minReplicas: {{ .Values.autoscaling.minReplicas }} - maxReplicas: {{ .Values.autoscaling.maxReplicas }} - metrics: - {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} - - type: Resource - resource: - name: cpu - target: - type: Utilization - averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} - {{- end }} - {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} - - type: Resource - resource: - name: memory - target: - type: Utilization - averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} - {{- end }} -{{- end }} diff --git a/examples/kubernetes/llama-cpp/templates/ingress-completions.yaml b/examples/kubernetes/llama-cpp/templates/ingress-completions.yaml deleted file mode 100644 index d1ef1bda4541c..0000000000000 --- a/examples/kubernetes/llama-cpp/templates/ingress-completions.yaml +++ /dev/null @@ -1,64 +0,0 @@ -{{- if and .Values.server.completions .Values.ingresses.completions.enabled -}} -{{- $fullName := include "server.llama.cpp.fullname" . 
-}} -{{- $svcPort := .Values.service.port -}} -{{- if and .Values.ingresses.completions.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} - {{- if not (hasKey .Values.ingresses.completions.annotations "kubernetes.io/ingress.class") }} - {{- $_ := set .Values.ingresses.completions.annotations "kubernetes.io/ingress.class" .Values.ingresses.completions.className}} - {{- end }} -{{- end }} -{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} -apiVersion: networking.k8s.io/v1 -{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} -apiVersion: networking.k8s.io/v1beta1 -{{- else -}} -apiVersion: extensions/v1beta1 -{{- end }} -kind: Ingress -metadata: - name: {{ $fullName }}-completions - labels: - {{- include "server.llama.cpp.labels" . | nindent 4 }} - {{- with .Values.ingresses.completions.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -spec: - {{- if and .Values.ingresses.completions.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} - ingressClassName: {{ .Values.ingresses.completions.className }} - {{- end }} - {{- if .Values.ingresses.completions.tls }} - tls: - {{- range .Values.ingresses.completions.tls }} - - hosts: - {{- range .hosts }} - - {{ . 
| quote }} - {{- end }} - secretName: {{ .secretName }} - {{- end }} - {{- end }} - rules: - {{- range .Values.ingresses.completions.hosts }} - - http: - paths: - {{- range .paths }} - - path: {{ .path }} - {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} - pathType: {{ .pathType }} - {{- end }} - backend: - {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} - service: - name: {{ $fullName }} - port: - number: {{ $svcPort }} - {{- else }} - serviceName: {{ $fullName }} - servicePort: {{ $svcPort }} - {{- end }} - {{- end }} - {{- end }} - {{- if .host }} - host: {{ .host | quote }} - {{- end }} - -{{- end }} diff --git a/examples/kubernetes/llama-cpp/templates/ingress-embeddings.yaml b/examples/kubernetes/llama-cpp/templates/ingress-embeddings.yaml deleted file mode 100644 index 1085d62580e46..0000000000000 --- a/examples/kubernetes/llama-cpp/templates/ingress-embeddings.yaml +++ /dev/null @@ -1,64 +0,0 @@ -{{- if and .Values.server.embeddings .Values.ingresses.embeddings.enabled -}} -{{- $fullName := include "server.llama.cpp.fullname" . -}} -{{- $svcPort := .Values.service.port -}} -{{- if and .Values.ingresses.embeddings.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} - {{- if not (hasKey .Values.ingresses.embeddings.annotations "kubernetes.io/ingress.class") }} - {{- $_ := set .Values.ingresses.embeddings.annotations "kubernetes.io/ingress.class" .Values.ingresses.embeddings.className}} - {{- end }} -{{- end }} -{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} -apiVersion: networking.k8s.io/v1 -{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} -apiVersion: networking.k8s.io/v1beta1 -{{- else -}} -apiVersion: extensions/v1beta1 -{{- end }} -kind: Ingress -metadata: - name: {{ $fullName }}-embeddings - labels: - {{- include "server.llama.cpp.labels" . 
| nindent 4 }} - {{- with .Values.ingresses.embeddings.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -spec: - {{- if and .Values.ingresses.embeddings.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} - ingressClassName: {{ .Values.ingresses.embeddings.className }} - {{- end }} - {{- if .Values.ingresses.embeddings.tls }} - tls: - {{- range .Values.ingresses.embeddings.tls }} - - hosts: - {{- range .hosts }} - - {{ . | quote }} - {{- end }} - secretName: {{ .secretName }} - {{- end }} - {{- end }} - rules: - {{- range .Values.ingresses.embeddings.hosts }} - - http: - paths: - {{- range .paths }} - - path: {{ .path }} - {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} - pathType: {{ .pathType }} - {{- end }} - backend: - {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} - service: - name: {{ $fullName }} - port: - number: {{ $svcPort }} - {{- else }} - serviceName: {{ $fullName }} - servicePort: {{ $svcPort }} - {{- end }} - {{- end }} - {{- end }} - {{- if .host }} - host: {{ .host | quote }} - {{- end }} - -{{- end }} diff --git a/examples/kubernetes/llama-cpp/templates/jobs.yaml b/examples/kubernetes/llama-cpp/templates/jobs.yaml deleted file mode 100644 index 9142bfbfa996d..0000000000000 --- a/examples/kubernetes/llama-cpp/templates/jobs.yaml +++ /dev/null @@ -1,66 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: {{ include "server.llama.cpp.fullname" . }}-download-model - labels: - {{- include "server.llama.cpp.labels" . | nindent 4 }} -spec: - template: - metadata: - name: {{ include "server.llama.cpp.fullname" . }}-download-model - {{- with .Values.podAnnotations }} - annotations: - {{- toYaml . | nindent 8 }} - {{- end }} - labels: - {{- include "server.llama.cpp.labels" . | nindent 8 }} - {{- with .Values.jobLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - spec: - containers: - - name: {{ include "server.llama.cpp.fullname" . 
}}-download-model - securityContext: - {{- toYaml .Values.securityContext | nindent 12 }} - image: {{ .Values.images.downloader.repository }}:{{ .Values.images.downloader.name }}{{if .Values.images.downloader.tag }}-{{end}}{{ .Values.images.downloader.tag }} - env: - - name: MODEL_PATH - value: {{ .Values.model.path }} - - name: MODEL_FILE - value: {{ regexReplaceAll "(.*/)?([^/]+).gguf" .Values.model.file "${2}.gguf" }} - - name: MODEL_SHA256 - value: {{ .Values.model.sha256 }} - - name: MODEL_DOWNLOAD_REPO - value: {{ .Values.model.repo }} - - name: MODEL_DOWNLOAD_FILE - value: {{ .Values.model.file }} - command: - - sh - - -c - args: - - > - set -eux; - if ! echo "${MODEL_SHA256} *${MODEL_PATH}/${MODEL_FILE}" | sha256sum -c -s - ; then - wget -q -c -O ${MODEL_PATH}/${MODEL_FILE} https://huggingface.co/${MODEL_DOWNLOAD_REPO}/resolve/main/${MODEL_DOWNLOAD_FILE}; - fi - volumeMounts: - - mountPath: {{ .Values.model.path }} - name: models - restartPolicy: OnFailure - volumes: - - name: models - persistentVolumeClaim: - claimName: {{ include "server.llama.cpp.fullname" . }} - readOnly: false - {{- with .Values.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} \ No newline at end of file diff --git a/examples/kubernetes/llama-cpp/templates/pod-monitor.yaml b/examples/kubernetes/llama-cpp/templates/pod-monitor.yaml deleted file mode 100644 index f2a9ba0ce29e4..0000000000000 --- a/examples/kubernetes/llama-cpp/templates/pod-monitor.yaml +++ /dev/null @@ -1,16 +0,0 @@ -{{- if .Values.server.metrics }} -apiVersion: monitoring.coreos.com/v1 -kind: PodMonitor -metadata: - name: {{ include "server.llama.cpp.fullname" . }} - labels: - {{- include "server.llama.cpp.labels" . | nindent 4 }} -spec: - selector: - matchLabels: - {{- include "server.llama.cpp.selectorLabels" . 
| nindent 6 }} - podMetricsEndpoints: - - port: http - interval: 30s - path: /metrics -{{end}} \ No newline at end of file diff --git a/examples/kubernetes/llama-cpp/templates/pvc.yaml b/examples/kubernetes/llama-cpp/templates/pvc.yaml deleted file mode 100644 index e2c40e5acaea4..0000000000000 --- a/examples/kubernetes/llama-cpp/templates/pvc.yaml +++ /dev/null @@ -1,17 +0,0 @@ -kind: PersistentVolumeClaim -apiVersion: v1 -metadata: - name: {{ include "server.llama.cpp.fullname" . }} - labels: - {{- include "server.llama.cpp.labels" . | nindent 4 }} - annotations: - helm.sh/resource-policy: "keep" -spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: {{ .Values.model.size | quote }} -{{- if .Values.persistence.storageClass }} - storageClassName: {{ .Values.persistence.storageClass }} -{{- end }} \ No newline at end of file diff --git a/examples/kubernetes/llama-cpp/templates/service.yaml b/examples/kubernetes/llama-cpp/templates/service.yaml deleted file mode 100644 index 09cab5f400968..0000000000000 --- a/examples/kubernetes/llama-cpp/templates/service.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: {{ include "server.llama.cpp.fullname" . }} - labels: - {{- include "server.llama.cpp.labels" . | nindent 4 }} -spec: - type: {{ .Values.service.type }} - ports: - - port: {{ .Values.service.port }} - targetPort: {{ .Values.server.port }} - protocol: TCP - name: http - selector: - {{- include "server.llama.cpp.selectorLabels" . | nindent 4 }} diff --git a/examples/kubernetes/llama-cpp/templates/tests/test-connection.yaml b/examples/kubernetes/llama-cpp/templates/tests/test-connection.yaml deleted file mode 100644 index 5685bf3421180..0000000000000 --- a/examples/kubernetes/llama-cpp/templates/tests/test-connection.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: "{{ include "server.llama.cpp.fullname" . }}-test-connection" - labels: - {{- include "server.llama.cpp.labels" . 
| nindent 4 }} - annotations: - "helm.sh/hook": test -spec: - containers: - - name: wget - image: busybox - command: ['wget'] - args: ['{{ include "server.llama.cpp.fullname" . }}:{{ .Values.service.port }}/health'] - restartPolicy: Never diff --git a/examples/kubernetes/llama-cpp/values.yaml b/examples/kubernetes/llama-cpp/values.yaml deleted file mode 100644 index 2ac6ed35ad4cc..0000000000000 --- a/examples/kubernetes/llama-cpp/values.yaml +++ /dev/null @@ -1,121 +0,0 @@ -# Default values for server.llama.cpp. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. - -replicaCount: 2 - -images: - server: - repository: ghcr.io/ggerganov/llama.cpp - name: server - tag: - downloader: - repository: busybox - name: 1.36.1 - tag: "glibc" - - pullPolicy: IfNotPresent - # Overrides the image tag whose default is the chart appVersion. - -imagePullSecrets: [ ] -nameOverride: "" -fullnameOverride: "" - -podAnnotations: { } -podLabels: { } - -jobAnnotations: { } -jobLabels: { } - -podSecurityContext: - runAsNonRoot: true - -securityContext: - readOnlyRootFilesystem: false #FIXME - runAsNonRoot: true - runAsUser: 1000 - -model: - path: /tmp - alias: microsoft-phi2 - repo: ggml-org/models # TheBloke/phi-2-GGUF - file: tinyllamas/stories260K.gguf # phi-2.Q4_K_M.gguf - size: 2Mi # 1.8Gi - sha256: 047bf46455a544931cff6fef14d7910154c56afbc23ab1c5e56a72e69912c04b # 324356668fa5ba9f4135de348447bb2bbe2467eaa1b8fcfb53719de62fbd2499 - -server: - command: /server - host: 0.0.0.0 - port: 8080 - completions: true - embeddings: false - metrics: true - kvCache: - size: 64 - slots: 2 - log: - format: text - disabled: false - extraArgs: [] - -deployments: - init - -service: - type: ClusterIP - port: 80 - -ingresses: - completions: - enabled: true - className: "" - annotations: - kubernetes.io/ingress.class: nginx - - hosts: - - #host: llama-cpp.mydomain - paths: - - path: /v1/completions - pathType: Prefix - tls: [ ] - # - secretName: chart-example-tls - # 
hosts: - # - chart-example.local - - embeddings: - enabled: true - className: "" - annotations: - kubernetes.io/ingress.class: nginx - - hosts: - - #host: llama-cpp.mydomain - paths: - - path: /v1/embeddings - pathType: Prefix - tls: [ ] - # - secretName: chart-example-tls - # hosts: - # - chart-example.local - -resources: { } - -autoscaling: - enabled: true - minReplicas: 1 - maxReplicas: 4 - targetCPUUtilizationPercentage: 80 - targetMemoryUtilizationPercentage: 80 - -volumes: [ ] - -volumeMounts: [ ] - -nodeSelector: { } - -tolerations: [ ] - -affinity: { } - -persistence: - storageClass: \ No newline at end of file diff --git a/examples/kubernetes/llama-cpp/.helmignore b/examples/kubernetes/llamacpp/.helmignore similarity index 100% rename from examples/kubernetes/llama-cpp/.helmignore rename to examples/kubernetes/llamacpp/.helmignore diff --git a/examples/kubernetes/llamacpp/Chart.lock b/examples/kubernetes/llamacpp/Chart.lock new file mode 100644 index 0000000000000..599acdcd85512 --- /dev/null +++ b/examples/kubernetes/llamacpp/Chart.lock @@ -0,0 +1,9 @@ +dependencies: +- name: model-runner + repository: file://charts/model-runner + version: 0.1.0 +- name: embedding + repository: file://charts/embedding + version: 0.1.0 +digest: sha256:91f709ba2b6a0d17e8ebfe5ee93141115d1d85ae6d1fd3cf77bc6dfaec76d69c +generated: "2024-05-12T19:06:38.283833152+08:00" diff --git a/examples/kubernetes/llamacpp/Chart.yaml b/examples/kubernetes/llamacpp/Chart.yaml new file mode 100644 index 0000000000000..0e64676a079c0 --- /dev/null +++ b/examples/kubernetes/llamacpp/Chart.yaml @@ -0,0 +1,32 @@ +apiVersion: v2 +name: llamacpp +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. 
They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.16.0" + +dependencies: + - name: modelRunner + version: 0.1.0 + repository: "file://charts/modelRunner" + - name: embedding + version: 0.1.0 + repository: "file://charts/embedding" \ No newline at end of file diff --git a/examples/kubernetes/llamacpp/charts/embedding/.helmignore b/examples/kubernetes/llamacpp/charts/embedding/.helmignore new file mode 100644 index 0000000000000..0e8a0eb36f4ca --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/embedding/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/examples/kubernetes/llamacpp/charts/embedding/Chart.yaml b/examples/kubernetes/llamacpp/charts/embedding/Chart.yaml new file mode 100644 index 0000000000000..f0c186a519321 --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/embedding/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: embedding +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "1.16.0" diff --git a/examples/kubernetes/llamacpp/charts/embedding/templates/NOTES.txt b/examples/kubernetes/llamacpp/charts/embedding/templates/NOTES.txt new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/examples/kubernetes/llamacpp/charts/embedding/templates/PersistentVolume.yaml b/examples/kubernetes/llamacpp/charts/embedding/templates/PersistentVolume.yaml new file mode 100644 index 0000000000000..385d2aeae5e1f --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/embedding/templates/PersistentVolume.yaml @@ -0,0 +1,21 @@ +{{- range $modelName, $modelConfig := .Values.models }} + + +apiVersion: v1 +kind: PersistentVolume +metadata: + name: {{ include "embedding.fullname" $ | lower }}-pv-{{ $modelName }} + labels: + {{- include "embedding.labels" $ | nindent 4 }} + +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteMany + hostPath: + path: {{ $.Values.modelPath.val }} + +--- + +{{- end}} \ No newline at end of file diff --git a/examples/kubernetes/llama-cpp/templates/_helpers.tpl b/examples/kubernetes/llamacpp/charts/embedding/templates/_helpers.tpl similarity index 72% rename from examples/kubernetes/llama-cpp/templates/_helpers.tpl rename to examples/kubernetes/llamacpp/charts/embedding/templates/_helpers.tpl index 54bc8197f1f57..7522c671a87ca 100644 --- a/examples/kubernetes/llama-cpp/templates/_helpers.tpl +++ b/examples/kubernetes/llamacpp/charts/embedding/templates/_helpers.tpl @@ -1,7 +1,7 @@ {{/* Expand the name of the chart. */}} -{{- define "server.llama.cpp.name" -}} +{{- define "embedding.name" -}} {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} {{- end }} @@ -10,7 +10,7 @@ Create a default fully qualified app name. We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). If release name contains chart name it will be used as a full name. 
*/}} -{{- define "server.llama.cpp.fullname" -}} +{{- define "embedding.fullname" -}} {{- if .Values.fullnameOverride }} {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} {{- else }} @@ -26,16 +26,16 @@ If release name contains chart name it will be used as a full name. {{/* Create chart name and version as used by the chart label. */}} -{{- define "server.llama.cpp.chart" -}} +{{- define "embedding.chart" -}} {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} {{- end }} {{/* Common labels */}} -{{- define "server.llama.cpp.labels" -}} -helm.sh/chart: {{ include "server.llama.cpp.chart" . }} -{{ include "server.llama.cpp.selectorLabels" . }} +{{- define "embedding.labels" -}} +helm.sh/chart: {{ include "embedding.chart" . }} +{{ include "embedding.selectorLabels" . }} {{- if .Chart.AppVersion }} app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} {{- end }} @@ -45,17 +45,17 @@ app.kubernetes.io/managed-by: {{ .Release.Service }} {{/* Selector labels */}} -{{- define "server.llama.cpp.selectorLabels" -}} -app.kubernetes.io/name: {{ include "server.llama.cpp.name" . }} +{{- define "embedding.selectorLabels" -}} +app.kubernetes.io/name: {{ include "embedding.name" . }} app.kubernetes.io/instance: {{ .Release.Name }} {{- end }} {{/* Create the name of the service account to use */}} -{{- define "server.llama.cpp.serviceAccountName" -}} +{{- define "embedding.serviceAccountName" -}} {{- if .Values.serviceAccount.create }} -{{- default (include "server.llama.cpp.fullname" .) .Values.serviceAccount.name }} +{{- default (include "embedding.fullname" .) 
.Values.serviceAccount.name }} {{- else }} {{- default "default" .Values.serviceAccount.name }} {{- end }} diff --git a/examples/kubernetes/llamacpp/charts/embedding/templates/configMap.yaml b/examples/kubernetes/llamacpp/charts/embedding/templates/configMap.yaml new file mode 100644 index 0000000000000..9ee1acce3e149 --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/embedding/templates/configMap.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "embedding.fullname" . | lower }}-configmap +data: + {{- range $modelName, $modelConfig := .Values.models }} + {{ $modelName }}.status: "pending" + {{- end }} diff --git a/examples/kubernetes/llamacpp/charts/embedding/templates/deployment.yaml b/examples/kubernetes/llamacpp/charts/embedding/templates/deployment.yaml new file mode 100644 index 0000000000000..0e566eef234d9 --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/embedding/templates/deployment.yaml @@ -0,0 +1,161 @@ +{{ $port := .Values.service.port }} +{{- range $modelName, $modelConfig := .Values.models }} +{{- if $modelConfig.enabled }} + +{{- if $modelConfig.download }} + +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "embedding.fullname" $ | lower }}-download-{{ $modelName }} + labels: + app: {{ include "embedding.fullname" $ | lower }} +spec: + template: + spec: + initContainers: + - name: fix-permissions + image: busybox + command: ["sh", "-c", "chmod -R 777 /models"] + volumeMounts: + - name: {{ include "embedding.fullname" $ | lower }}-pv-{{ $modelName }} + mountPath: /models + + containers: + - name: init-job + image: alpine/k8s:1.27.11 + command: + - sh + - -c + - | + set -e + if curl -L {{ $modelConfig.url }} --output /models/{{ $modelName }}/{{ $modelName }}.gguf; then + kubectl patch configmap {{ include "embedding.fullname" $ | lower }}-configmap --type merge -p '{"data": {"{{ $modelName }}.status": "completed"}}' + echo "Download succeeded" + else + echo "Download failed" + exit 1 + fi + 
volumeMounts: + - name: {{ include "embedding.fullname" $ | lower }}-pv-{{ $modelName }} + mountPath: /models/{{ $modelName }} + - name: kubeconfig + mountPath: /.kube + restartPolicy: OnFailure + volumes: + - name: {{ include "embedding.fullname" $ | lower }}-pv-{{ $modelName }} + persistentVolumeClaim: + claimName: {{ include "embedding.fullname" $ | lower }}-pvc-{{ $modelName }} + - name: kubeconfig + hostPath: + path: /home/vadmin/.kube + backoffLimit: 4 + +--- + +{{- end }} + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "embedding.fullname" $ | lower }}-{{ $modelName }} + labels: + {{- include "embedding.labels" $ | nindent 4 }} + modelnameInternal: {{ $modelName }} + +spec: + {{- if ne (int $modelConfig.replicas) 0 }} + replicas: {{ $modelConfig.replicas }} + {{- end }} + selector: + matchLabels: + {{- include "embedding.labels" $ | nindent 6 }} + modelnameInternal: {{ $modelName }} + template: + metadata: + labels: + {{- include "embedding.labels" $ | nindent 8 }} + modelnameInternal: {{ $modelName }} + spec: + securityContext: + fsGroup: 2000 + initContainers: + - name: check-download-job + image: alpine/k8s:1.27.11 + command: + - sh + - -c + - | + while true; do + STATUS=$(kubectl get configmap {{ include "embedding.fullname" $ | lower }}-configmap -o jsonpath='{.data.{{ $modelName }}\.status}') + if [ "$STATUS" == "completed" ]; then + echo "Configmap updated" + sleep 5 + exit 0 + fi + kubectl get configmap {{ include "embedding.fullname" $ | lower }}-configmap -o jsonpath='{.data.{{ $modelName }}\.status}' + echo "Waiting for configmap update..." 
+ sleep 15 + done + volumeMounts: + - name: kubeconfig + mountPath: /.kube + containers: + - name: {{ $modelName }} + image: "{{ $modelConfig.image }}" + command: + {{- if eq $modelConfig.device "cuda" }} + - /server + {{- else }} + - /llama-server + {{- end }} + - --verbose + - -m + - /models/{{ $modelName }}.gguf + - --host + - 0.0.0.0 + {{- if eq $modelConfig.device "cuda" }} + - --n-gpu-layers + - "99" + {{- end }} + + volumeMounts: + - name: {{ include "embedding.fullname" $ | lower }}-pv-{{ $modelName }} + mountPath: /models + readOnly: false + securityContext: + runAsUser: 1001 + runAsGroup: 2000 + ports: + - name: http + containerPort: {{ $port }} + protocol: TCP + tolerations: + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + {{- if eq $modelConfig.device "cuda" }} + resources: + limits: + nvidia.com/gpu: 1 # Request 1 GPU + {{- end }} + livenessProbe: + {{- toYaml $.Values.livenessProbe | nindent 12 }} + readinessProbe: + {{- toYaml $.Values.readinessProbe | nindent 12 }} + + volumes: + - name: {{ include "embedding.fullname" $ | lower }}-pv-{{ $modelName }} + persistentVolumeClaim: + claimName: {{ include "embedding.fullname" $ | lower }}-pvc-{{ $modelName }} + - name: kubeconfig + hostPath: + path: /home/vadmin/.kube + + nodeSelector: + {{- if $modelConfig.nodeType }} + {{- end }} + +--- +{{- end }} +{{- end }} diff --git a/examples/kubernetes/llamacpp/charts/embedding/templates/hpa.yaml b/examples/kubernetes/llamacpp/charts/embedding/templates/hpa.yaml new file mode 100644 index 0000000000000..6efb26c85f14f --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/embedding/templates/hpa.yaml @@ -0,0 +1,24 @@ +{{- range $modelName, $modelConfig := .Values.models }} +{{ if $modelConfig.autoScale.enabled }} +apiVersion: autoscaling/v2beta2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "embedding.fullname" $ | lower }}-{{ $modelName }}-hpa + labels: + app: {{ include "embedding.name" . 
}}
+    chart: {{ include "embedding.chart" $ }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "embedding.fullname" $ | lower }}-{{ $modelName }}
+  minReplicas: {{ $modelConfig.autoScale.minReplicas }}
+  maxReplicas: {{ $modelConfig.autoScale.maxReplicas }}
+  metrics:
+  - type: Resource
+    resource:
+      name: cpu
+      targetAverageUtilization: {{ $modelConfig.autoScale.targetCPUUtilizationPercentage }}
+---
+{{- end }}
+{{- end }}
diff --git a/examples/kubernetes/llamacpp/charts/embedding/templates/persistentvolumeclaim.yaml b/examples/kubernetes/llamacpp/charts/embedding/templates/persistentvolumeclaim.yaml
new file mode 100644
index 0000000000000..783b737ee4aa5
--- /dev/null
+++ b/examples/kubernetes/llamacpp/charts/embedding/templates/persistentvolumeclaim.yaml
@@ -0,0 +1,18 @@
+{{- range $modelName, $modelConfig := .Values.models }}
+
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: {{ include "embedding.fullname" $ | lower }}-pvc-{{ $modelName }}
+  labels:
+    {{- include "embedding.labels" $ | nindent 4 }}
+    modelnameInternal: {{ $modelName }}
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 10Gi
+---
+
+{{- end}}
diff --git a/examples/kubernetes/llamacpp/charts/embedding/templates/service.yaml b/examples/kubernetes/llamacpp/charts/embedding/templates/service.yaml
new file mode 100644
index 0000000000000..4c5fe8558c322
--- /dev/null
+++ b/examples/kubernetes/llamacpp/charts/embedding/templates/service.yaml
@@ -0,0 +1,22 @@
+{{- range $modelName, $modelConfig := .Values.models }}
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "embedding.fullname" $ | lower }}-{{ $modelName }}-svc
+  labels:
+    {{- include "embedding.labels" $ | nindent 4 }}
+    modelnameInternal: {{ $modelName }}
+spec:
+  type: {{ $.Values.service.type }}
+  ports:
+    - port: {{ $.Values.service.port }}
+      targetPort: http
+      protocol: TCP
+      name: http
+  selector:
+    {{- include "embedding.selectorLabels" $ | nindent 4 }}
+
modelnameInternal: {{ $modelName }} + +--- +{{- end }} \ No newline at end of file diff --git a/examples/kubernetes/llamacpp/charts/embedding/templates/tests/test-connection.yaml b/examples/kubernetes/llamacpp/charts/embedding/templates/tests/test-connection.yaml new file mode 100644 index 0000000000000..21edb7b569748 --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/embedding/templates/tests/test-connection.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "embedding.fullname" . }}-test-connection" + labels: + {{- include "embedding.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test +spec: + containers: + - name: wget + image: busybox + command: ['wget'] + args: ['{{ include "embedding.fullname" . }}:{{ .Values.service.port }}'] + restartPolicy: Never diff --git a/examples/kubernetes/llamacpp/charts/embedding/values.yaml b/examples/kubernetes/llamacpp/charts/embedding/values.yaml new file mode 100644 index 0000000000000..fb92b9db27d05 --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/embedding/values.yaml @@ -0,0 +1,16 @@ + +livenessProbe: + httpGet: + path: / + port: http +readinessProbe: + httpGet: + path: / + port: http + + +models: { + +} + + diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/.helmignore b/examples/kubernetes/llamacpp/charts/modelRunner/.helmignore new file mode 100644 index 0000000000000..0e8a0eb36f4ca --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/modelRunner/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/Chart.yaml b/examples/kubernetes/llamacpp/charts/modelRunner/Chart.yaml new file mode 100644 index 0000000000000..6d3f6f44a0270 --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/modelRunner/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: modelRunner +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "1.16.0" diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/templates/NOTES.txt b/examples/kubernetes/llamacpp/charts/modelRunner/templates/NOTES.txt new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/templates/PersistentVolume.yaml b/examples/kubernetes/llamacpp/charts/modelRunner/templates/PersistentVolume.yaml new file mode 100644 index 0000000000000..545dd74a30cfb --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/modelRunner/templates/PersistentVolume.yaml @@ -0,0 +1,21 @@ +{{- range $modelName, $modelConfig := .Values.models }} + + +apiVersion: v1 +kind: PersistentVolume +metadata: + name: {{ include "modelRunner.fullname" $ | lower }}-pv-{{ $modelName }} + labels: + {{- include "modelRunner.labels" $ | nindent 4 }} + +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteMany + hostPath: + path: {{ $.Values.modelPath.val }}/{{ $modelName }} + +--- + +{{- end}} \ No newline at end of file diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/templates/_helpers.tpl b/examples/kubernetes/llamacpp/charts/modelRunner/templates/_helpers.tpl new file mode 100644 index 0000000000000..ec5864594b398 --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/modelRunner/templates/_helpers.tpl @@ -0,0 +1,64 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "modelRunner.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. 
+*/}} +{{- define "modelRunner.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "modelRunner.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "modelRunner.labels" -}} +helm.sh/chart: {{ include "modelRunner.chart" . }} +{{ include "modelRunner.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "modelRunner.selectorLabels" -}} +app.kubernetes.io/name: {{ include "modelRunner.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "modelRunner.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "modelRunner.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} + + diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/templates/configMap.yaml b/examples/kubernetes/llamacpp/charts/modelRunner/templates/configMap.yaml new file mode 100644 index 0000000000000..dec3586f85cce --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/modelRunner/templates/configMap.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "modelRunner.fullname" . 
| lower }}-configmap +data: + {{- range $modelName, $modelConfig := .Values.models }} + {{ $modelName }}.status: "pending" + {{- end }} diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/templates/deployment.yaml b/examples/kubernetes/llamacpp/charts/modelRunner/templates/deployment.yaml new file mode 100644 index 0000000000000..d9e4aa67e0ed8 --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/modelRunner/templates/deployment.yaml @@ -0,0 +1,161 @@ +{{ $port := .Values.service.port }} +{{- range $modelName, $modelConfig := .Values.models }} +{{- if $modelConfig.enabled }} + +{{- if $modelConfig.download }} + +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "modelRunner.fullname" $ | lower }}-download-{{ $modelName }} + labels: + app: {{ include "modelRunner.fullname" $ | lower }} +spec: + template: + spec: + initContainers: + - name: fix-permissions + image: busybox + command: ["sh", "-c", "chmod -R 777 /models"] + volumeMounts: + - name: {{ include "modelRunner.fullname" $ | lower }}-pv-{{ $modelName }} + mountPath: /models + + containers: + - name: init-job + image: alpine/k8s:1.27.11 + command: + - sh + - -c + - | + set -e + if curl -L {{ $modelConfig.url }} --output /models/{{ $modelName }}/{{ $modelName }}.gguf; then + kubectl patch configmap {{ include "modelRunner.fullname" $ | lower }}-configmap --type merge -p '{"data": {"{{ $modelName }}.status": "completed"}}' + echo "Download succeeded" + else + echo "Download failed" + exit 1 + fi + volumeMounts: + - name: {{ include "modelRunner.fullname" $ | lower }}-pv-{{ $modelName }} + mountPath: /models/{{ $modelName }} + - name: kubeconfig + mountPath: /.kube + restartPolicy: OnFailure + volumes: + - name: {{ include "modelRunner.fullname" $ | lower }}-pv-{{ $modelName }} + persistentVolumeClaim: + claimName: {{ include "modelRunner.fullname" $ | lower }}-pvc-{{ $modelName }} + - name: kubeconfig + hostPath: + path: /home/vadmin/.kube + backoffLimit: 4 + +--- + +{{- end }} + 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "modelRunner.fullname" $ | lower }}-{{ $modelName }} + labels: + {{- include "modelRunner.labels" $ | nindent 4 }} + modelnameInternal: {{ $modelName }} + +spec: + {{- if ne (int $modelConfig.replicas) 0 }} + replicas: {{ $modelConfig.replicas }} + {{- end }} + selector: + matchLabels: + {{- include "modelRunner.labels" $ | nindent 6 }} + modelnameInternal: {{ $modelName }} + template: + metadata: + labels: + {{- include "modelRunner.labels" $ | nindent 8 }} + modelnameInternal: {{ $modelName }} + spec: + securityContext: + fsGroup: 2000 + initContainers: + - name: check-download-job + image: alpine/k8s:1.27.11 + command: + - sh + - -c + - | + while true; do + STATUS=$(kubectl get configmap {{ include "modelRunner.fullname" $ | lower }}-configmap -o jsonpath='{.data.{{ $modelName }}\.status}') + if [ "$STATUS" == "completed" ]; then + echo "Configmap updated" + sleep 5 + exit 0 + fi + kubectl get configmap {{ include "modelRunner.fullname" $ | lower }}-configmap -o jsonpath='{.data.{{ $modelName }}\.status}' + echo "Waiting for configmap update..." 
+ sleep 15 + done + volumeMounts: + - name: kubeconfig + mountPath: /.kube + containers: + - name: {{ $modelName }} + image: "{{ $modelConfig.image }}" + command: + {{- if eq $modelConfig.device "cuda" }} + - /server + {{- else }} + - /llama-server + {{- end }} + - --verbose + - -m + - /models/{{ $modelName }}.gguf + - --host + - 0.0.0.0 + {{- if eq $modelConfig.device "cuda" }} + - --n-gpu-layers + - "99" + {{- end }} + + volumeMounts: + - name: {{ include "modelRunner.fullname" $ | lower }}-pv-{{ $modelName }} + mountPath: /models + readOnly: false + securityContext: + runAsUser: 1001 + runAsGroup: 2000 + ports: + - name: http + containerPort: {{ $port }} + protocol: TCP + tolerations: + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + {{- if eq $modelConfig.device "cuda" }} + resources: + limits: + nvidia.com/gpu: 1 # Request 1 GPU + {{- end }} + livenessProbe: + {{- toYaml $.Values.livenessProbe | nindent 12 }} + readinessProbe: + {{- toYaml $.Values.readinessProbe | nindent 12 }} + + volumes: + - name: {{ include "modelRunner.fullname" $ | lower }}-pv-{{ $modelName }} + persistentVolumeClaim: + claimName: {{ include "modelRunner.fullname" $ | lower }}-pvc-{{ $modelName }} + - name: kubeconfig + hostPath: + path: /home/vadmin/.kube + + nodeSelector: + {{- if $modelConfig.nodeType }} + {{- end }} + +--- +{{- end }} +{{- end }} diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/templates/hpa.yaml b/examples/kubernetes/llamacpp/charts/modelRunner/templates/hpa.yaml new file mode 100644 index 0000000000000..10c75b5d43db6 --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/modelRunner/templates/hpa.yaml @@ -0,0 +1,24 @@ +{{- range $modelName, $modelConfig := .Values.models }} +{{ if $modelConfig.autoScale.enabled }} +apiVersion: autoscaling/v2beta2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "modelRunner.fullname" $ | lower }}-{{ $modelName }}-hpa + labels: + app: {{ include "modelRunner.name" . 
}}
+    chart: {{ include "modelRunner.chart" $ }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "modelRunner.fullname" $ | lower }}-{{ $modelName }}
+  minReplicas: {{ $modelConfig.autoScale.minReplicas }}
+  maxReplicas: {{ $modelConfig.autoScale.maxReplicas }}
+  metrics:
+  - type: Resource
+    resource:
+      name: cpu
+      targetAverageUtilization: {{ $modelConfig.autoScale.targetCPUUtilizationPercentage }}
+---
+{{- end }}
+{{- end }}
diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/templates/persistentvolumeclaim.yaml b/examples/kubernetes/llamacpp/charts/modelRunner/templates/persistentvolumeclaim.yaml
new file mode 100644
index 0000000000000..a770a75b9ca78
--- /dev/null
+++ b/examples/kubernetes/llamacpp/charts/modelRunner/templates/persistentvolumeclaim.yaml
@@ -0,0 +1,18 @@
+{{- range $modelName, $modelConfig := .Values.models }}
+
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: {{ include "modelRunner.fullname" $ | lower }}-pvc-{{ $modelName }}
+  labels:
+    {{- include "modelRunner.labels" $ | nindent 4 }}
+    modelnameInternal: {{ $modelName }}
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 10Gi
+---
+
+{{- end}}
diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/templates/service.yaml b/examples/kubernetes/llamacpp/charts/modelRunner/templates/service.yaml
new file mode 100644
index 0000000000000..dd00212917420
--- /dev/null
+++ b/examples/kubernetes/llamacpp/charts/modelRunner/templates/service.yaml
@@ -0,0 +1,22 @@
+{{- range $modelName, $modelConfig := .Values.models }}
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "modelRunner.fullname" $ | lower }}-{{ $modelName }}-svc
+  labels:
+    {{- include "modelRunner.labels" $ | nindent 4 }}
+    modelnameInternal: {{ $modelName }}
+spec:
+  type: {{ $.Values.service.type }}
+  ports:
+    - port: {{ $.Values.service.port }}
+      targetPort: http
+      protocol: TCP
+      name: http
+  selector:
+    {{- include
"modelRunner.selectorLabels" $ | nindent 4 }} + modelnameInternal: {{ $modelName }} + +--- +{{- end }} \ No newline at end of file diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/templates/sidecar.yaml b/examples/kubernetes/llamacpp/charts/modelRunner/templates/sidecar.yaml new file mode 100644 index 0000000000000..cb2f025976de8 --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/modelRunner/templates/sidecar.yaml @@ -0,0 +1,29 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "modelRunner.fullname" $ | lower }}-sidecar + labels: + {{- include "modelRunner.labels" $ | nindent 4 }} + modelnameInternal: sidecar + +spec: + selector: + matchLabels: + {{- include "modelRunner.labels" $ | nindent 6 }} + template: + metadata: + labels: + {{- include "modelRunner.labels" $ | nindent 8 }} + modelnameInternal: sidecar + + spec: + securityContext: + fsGroup: 2000 + containers: + - name: curl-sidecar + image: curlimages/curl:7.79.1 + command: ["/bin/sh", "-c", "tail -f /dev/null"] + + +--- + diff --git a/examples/kubernetes/llamacpp/charts/modelRunner/values.yaml b/examples/kubernetes/llamacpp/charts/modelRunner/values.yaml new file mode 100644 index 0000000000000..03c296ef0c677 --- /dev/null +++ b/examples/kubernetes/llamacpp/charts/modelRunner/values.yaml @@ -0,0 +1,17 @@ + + +livenessProbe: + httpGet: + path: / + port: http +readinessProbe: + httpGet: + path: / + port: http + + +models: { + +} + + diff --git a/examples/kubernetes/llamacpp/templates/NOTES.txt b/examples/kubernetes/llamacpp/templates/NOTES.txt new file mode 100644 index 0000000000000..8b137891791fe --- /dev/null +++ b/examples/kubernetes/llamacpp/templates/NOTES.txt @@ -0,0 +1 @@ + diff --git a/examples/kubernetes/llamacpp/templates/_helpers.tpl b/examples/kubernetes/llamacpp/templates/_helpers.tpl new file mode 100644 index 0000000000000..9cf785f8a7dbd --- /dev/null +++ b/examples/kubernetes/llamacpp/templates/_helpers.tpl @@ -0,0 +1,66 @@ +{{/* +Expand the name of the 
chart. +*/}} +{{- define "llamacpp.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "llamacpp.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "llamacpp.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "llamacpp.labels" -}} +helm.sh/chart: {{ include "llamacpp.chart" . }} +{{ include "llamacpp.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "llamacpp.selectorLabels" -}} +app.kubernetes.io/name: {{ include "llamacpp.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + + +{{/* +Create the name of the service account to use +*/}} +{{- define "llamacpp.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "llamacpp.fullname" .) 
.Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} + + + diff --git a/examples/kubernetes/llamacpp/templates/ingress.yaml b/examples/kubernetes/llamacpp/templates/ingress.yaml new file mode 100644 index 0000000000000..75d42a2f4fcef --- /dev/null +++ b/examples/kubernetes/llamacpp/templates/ingress.yaml @@ -0,0 +1,85 @@ +{{- if .Values.ingress.enabled -}} +{{- $fullName := include "llamacpp.fullname" . -}} +{{- $svcPort := .Values.modelRunner.service.port -}} +{{- $modelRunner := .Values.modelRunner -}} +{{- $modelRunnerFullname := $modelRunner.fullname -}} +{{- $embedding := .Values.embedding -}} +{{- $embeddingFullname := $embedding.fullname -}} +{{- $embeddingSvcPort := .Values.embedding.service.port -}} + +{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} + {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }} + {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} + {{- end }} +{{- end }} +{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1 +{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1beta1 +{{- else -}} +apiVersion: extensions/v1beta1 +{{- end }} +kind: Ingress +metadata: + name: {{ $fullName }} + labels: + {{- include "llamacpp.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . 
| quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + {{ $pathtype := .pathtype }} + - host: {{ .host | quote }} + http: + paths: + {{- range $modelName, $modelConfig := $modelRunner.models }} + - path: {{ $modelConfig.endpoint }}(/|$)(.*) + {{- if and $pathtype (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} + pathType: {{ $pathtype }} + {{- end }} + backend: + {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} + service: + name: {{ $.Release.Name }}-{{ $modelRunnerFullname }}-{{ $modelName }}-svc + port: + number: {{ $svcPort }} + {{- else }} + serviceName: {{ $.Release.Name }}-{{ $modelRunnerFullname }}-{{ $modelName }}-svc + servicePort: {{ $svcPort }} + {{- end }} + {{- end }} + {{- range $modelName, $modelConfig := $embedding.models }} + - path: {{ $modelConfig.endpoint }}(/|$)(.*) + {{- if and $pathtype (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} + pathType: {{ $pathtype }} + {{- end }} + backend: + {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} + service: + name: {{ $.Release.Name }}-{{ $embeddingFullname }}-{{ $modelName }}-svc + port: + number: {{ $embeddingSvcPort }} + {{- else }} + serviceName: {{ $.Release.Name }}-{{ $embeddingFullname }}-{{ $modelName }}-svc + servicePort: {{ $embeddingSvcPort }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/examples/kubernetes/llamacpp/values.yaml b/examples/kubernetes/llamacpp/values.yaml new file mode 100644 index 0000000000000..afb693d5735e5 --- /dev/null +++ b/examples/kubernetes/llamacpp/values.yaml @@ -0,0 +1,108 @@ +# Default values for llamacpp. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates.
+ + + +podAnnotations: {} +podLabels: {} + +podSecurityContext: {} + +securityContext: {} + +ingress: + enabled: true + className: "" + annotations: { + nginx.ingress.kubernetes.io/rewrite-target: /$2, + nginx.ingress.kubernetes.io/use-regex: "true" + } + hosts: + - host: demo.local + pathtype: ImplementationSpecific + + tls: [] + + + + +livenessProbe: + httpGet: + path: / + port: http + +readinessProbe: + httpGet: + path: / + port: http + + + + +modelRunner: + fullname: "modelrunner" + service: + type: ClusterIP + port: 8080 + modelPath: + val: + models: { + "model1":{ + "enabled": true, + "download": true, + "replicas": 3, + "device": "cpu", + "autoScale": { + "enabled": false, + "minReplicas": 1, + "maxReplicas": 100, + "targetCPUUtilizationPercentage": 80 + }, + "url": "https://huggingface.co/TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF/resolve/main/capybarahermes-2.5-mistral-7b.Q4_0.gguf", + "image": "ghcr.io/ggerganov/llama.cpp:server", + "endpoint": "/model1" + }, + "model2": { + "enabled": true, + "replicas": 1, + "download": true, + "device": "cuda", + "autoScale": { + "enabled": false, + "minReplicas": 1, + "maxReplicas": 100, + "targetCPUUtilizationPercentage": 80 + }, + "url": "https://huggingface.co/TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF/resolve/main/capybarahermes-2.5-mistral-7b.Q4_0.gguf", + "image": "ghcr.io/ggerganov/llama.cpp:server-cuda", + "endpoint": "/model2" + }, + } + + +embedding: + fullname: "embedding" + service: + type: ClusterIP + port: 8080 + modelPath: + val: /models + models: { + "emod": { + "enabled": true, + "replicas": 1, + "download": true, + "device": "cpu", + "autoScale": { + "enabled": false, + "minReplicas": 1, + "maxReplicas": 100, + "targetCPUUtilizationPercentage": 80 + }, + "url": "https://huggingface.co/TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF/resolve/main/capybarahermes-2.5-mistral-7b.Q4_0.gguf", + "image": "ghcr.io/ggerganov/llama.cpp:server", + "endpoint": "/e" + }, + } + From 
0579fbee8086dd3f5060049b79902b0cb7090081 Mon Sep 17 00:00:00 2001 From: Shobhit Date: Sun, 21 Jul 2024 19:57:02 +0800 Subject: [PATCH 2/2] Updated readme with feature set --- examples/kubernetes/README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/examples/kubernetes/README.md b/examples/kubernetes/README.md index 1a9b10fd5ce76..0fd448275b9e4 100644 --- a/examples/kubernetes/README.md +++ b/examples/kubernetes/README.md @@ -68,6 +68,18 @@ Adjust the model path to a local directory that stores the models. The models ar You can also adjust the number of replicas, the device, the image, the endpoint, and the autoscaling parameters. +Ensure that the ingress is enabled on your cluster. You can use the following command to enable the ingress: + +```shell +microk8s enable ingress +``` + +And add the hostname to `/etc/hosts`: + +```shell +127.0.0.1 demo.local +``` + ### Metrics monitoring @@ -93,6 +105,7 @@ helm install \ - [x] Auto scaling - [x] CUDA support - [x] Downloading functionality +- [ ] Redownload on upgrade hook. (Currently the models are downloaded only on the first deployment, there is no redownload functionality on upgrade if required) ## Pending testing