
Commit d8dd1f6

Merge pull request #3 from stackhpc/feat/langchain

Re-write frontend using LangChain + other QoL improvements

2 parents 1569398 + 71a0666

File tree: 14 files changed (+291 -112 lines changed)


.gitignore

Lines changed: 6 additions & 2 deletions
@@ -1,11 +1,15 @@
 **/kubeconfig.y[a]ml
 *kubeconfig*.y[a]ml
-venv/
 .vscode/
 __pycache__/
 **/*.secret
 
 # Ignore local dev helpers
 test-values.y[a]ml
 chart/web-app/settings.yml
-gradio-client-test.py
+gradio-client-test.py
+venv*/
+
+# Helm chart stuff
+chart/Chart.lock
+chart/charts

chart/Chart.yaml

Lines changed: 6 additions & 1 deletion
@@ -26,4 +26,9 @@ appVersion: "1.16.0"
 icon: https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.svg
 
 annotations:
-  azimuth.stackhpc.com/label: HuggingFace LLM
+  azimuth.stackhpc.com/label: HuggingFace LLM
+
+dependencies:
+  - name: reloader
+    version: 1.0.63
+    repository: https://stakater.github.io/stakater-charts

chart/azimuth-ui.schema.yaml

Lines changed: 20 additions & 1 deletion
@@ -1,4 +1,23 @@
 controls:
   /huggingface/model:
     type: TextControl
-    placeholder: tiiuae/falcon-7b-instruct
+  /huggingface/token:
+    type: TextControl
+    secret: true
+  /ui/appSettings/model_instruction:
+    type: TextControl
+  /ui/appSettings/llm_max_tokens:
+    type: NumberControl
+  /ui/appSettings/llm_temperature:
+    type: NumberControl
+  /ui/appSettings/llm_top_p:
+    type: NumberControl
+  /ui/appSettings/llm_frequency_penalty:
+    type: NumberControl
+  /ui/appSettings/llm_presence_penalty:
+    type: NumberControl
+  # Use mirror to mimic yaml anchor in base Helm chart
+  /ui/appSettings/model_name:
+    type: MirrorControl
+    path: /huggingface/model
+    visuallyHidden: true

chart/templates/NOTES.txt

Lines changed: 12 additions & 1 deletion
@@ -1 +1,12 @@
-The LLM app allows users to deploy machine learning models using [vLLM](https://docs.vllm.ai/en/latest/) as a model serving backend and [gradio](https://github.com/gradio-app/gradio) as a web interface.
+The LLM chatbot app allows users to deploy machine learning models from [Huggingface](https://huggingface.co/models) and interact with them through a simple web interface.
+
+Note: The target Kubernetes cluster must have a GPU worker node group configured, otherwise the app will remain in an 'Installing' state until a GPU node becomes available for scheduling.
+
+On deployment of a new model, the app must first download the model's weights from Huggingface.
+This can take a significant amount of time depending on model choice and network speeds.
+Download progress can be monitored by inspecting the logs for the LLM API pod(s) via the Kubernetes Dashboard for the target cluster.
+
+The app uses [vLLM](https://docs.vllm.ai/en/latest/) as a model serving backend and [gradio](https://github.com/gradio-app/gradio) + [LangChain](https://python.langchain.com/docs/get_started/introduction) to provide the web interface.
+The official list of Huggingface models supported by vLLM can be found [here](https://docs.vllm.ai/en/latest/models/supported_models.html), though some of these may not be compatible with the LangChain prompt format.
+See [this documentation](https://github.com/stackhpc/azimuth-llm/) for a non-exhaustive list of language models against which the app has been tested.
+

chart/templates/api/deployment.yml

Lines changed: 3 additions & 8 deletions
@@ -25,11 +25,7 @@ spec:
       volumeMounts:
         - name: data
           mountPath: /root/.cache/huggingface
-      command:
-        - python3
       args:
-        - -m
-        - vllm.entrypoints.api_server
         - --model
         - {{ .Values.huggingface.model }}
       {{- if .Values.api.extraArgs -}}
@@ -47,15 +43,14 @@ spec:
       {{- fail "Either secretName or token value must be set for Llama and other gated models" }}
       {{- end }}
       readinessProbe:
-        tcpSocket:
+        httpGet:
           port: 8000
-        initialDelaySeconds: 15
-        periodSeconds: 10
+          path: /health
+        periodSeconds: 60
       resources:
         limits:
           nvidia.com/gpu: {{ .Values.api.gpus | int }}
     volumes:
-      # TODO: Make this configurable (e.g. hostPath or PV)
       - name: data
         {{- .Values.api.cacheVolume | toYaml | nindent 10 }}
   # Suggested in vLLM docs
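
Two things to note in this diff: dropping the explicit command means the container presumably falls back to the vllm/vllm-openai image's default entrypoint (the OpenAI-compatible server rather than the plain vllm.entrypoints.api_server), and the readiness probe now checks vLLM's /health endpoint instead of merely testing that the TCP port accepts connections. A minimal sketch of the equivalent check, assuming the API pod has been forwarded to localhost:8000 (e.g. via kubectl port-forward):

    # Hedged sketch of what the httpGet readinessProbe now does.
    import requests

    resp = requests.get("http://localhost:8000/health", timeout=5)
    # 200 means the server is actually serving; while model weights are still
    # downloading or loading, this fails and the pod is kept out of rotation.
    print(resp.status_code)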

chart/templates/api/zenith-reservation.yml

Lines changed: 2 additions & 0 deletions
@@ -3,6 +3,8 @@ apiVersion: zenith.stackhpc.com/v1alpha1
 kind: Reservation
 metadata:
   name: {{ .Release.Name }}-api
+  labels:
+    {{- include "azimuth-llm.labels" . | nindent 4 }}
   annotations:
     azimuth.stackhpc.com/service-label: {{ quote .Values.api.service.zenith.label }}
     azimuth.stackhpc.com/service-icon-url: {{ .Values.api.service.zenith.iconUrl }}

chart/templates/ui/deployment.yml

Lines changed: 3 additions & 0 deletions
@@ -4,6 +4,9 @@ metadata:
   name: {{ .Release.Name }}-ui
   labels:
     {{- include "azimuth-llm.labels" . | nindent 4 }}
+  annotations:
+    # Make sure UI is reloaded when app settings are updated
+    reloader.stakater.com/auto: "true"
 spec:
   replicas: 1
   selector:

chart/values.schema.json

Lines changed: 48 additions & 2 deletions
@@ -8,8 +8,7 @@
         "model": {
           "type": "string",
           "title": "Model",
-          "description": "The HuggingFace model to deploy.",
-          "default": "tiiuae/falcon-7b-instruct"
+          "description": "The HuggingFace model to deploy (Hint: For a simple, lightweight demo try ise-uiuc/Magicoder-S-DS-6.7B)"
         },
         "token": {
           "type": "string",
@@ -19,6 +18,53 @@
         }
       },
       "required": ["model"]
+    },
+    "ui": {
+      "type": "object",
+      "properties": {
+        "appSettings": {
+          "type": "object",
+          "properties": {
+            "model_name": {
+              "type": "string",
+              "title": "Model Name",
+              "description": "Model name supplied to OpenAI client in frontend web app. Should match huggingface.model above."
+            },
+            "model_instruction": {
+              "type": "string",
+              "title": "Model Instruction",
+              "description": "The initial model prompt (i.e. the hidden instructions) to use when generating responses."
+            },
+            "llm_max_tokens": {
+              "type": "number",
+              "title": "LLM Max Tokens",
+              "description": "The maximum number of new [tokens](https://platform.openai.com/docs/api-reference/chat/create#chat-create-max_tokens) to generate for each LLM response."
+            },
+            "llm_temperature": {
+              "type": "number",
+              "title": "LLM Temperature",
+              "description": "The '[temperature](https://platform.openai.com/docs/api-reference/chat/create#chat-create-temperature)' value to use when generating LLM responses."
+            },
+            "llm_top_p": {
+              "type": "number",
+              "title": "LLM Top P",
+              "description": "The [top p](https://platform.openai.com/docs/api-reference/chat/create#chat-create-top_p) value to use when generating LLM responses."
+            },
+            "llm_presence_penalty": {
+              "type": "number",
+              "title": "LLM Presence Penalty",
+              "description": "The [presence penalty](https://platform.openai.com/docs/api-reference/chat/create#chat-create-presence_penalty) to use when generating LLM responses."
+            },
+            "llm_frequency_penalty": {
+              "type": "number",
+              "title": "LLM Frequency Penalty",
+              "description": "The [frequency penalty](https://platform.openai.com/docs/api-reference/chat/create#chat-create-frequency_penalty) to use when generating LLM responses."
+            }
+
+          },
+          "required": ["model_name"]
+        }
+      }
     }
   },
   "required": ["huggingface"]

chart/values.yaml

Lines changed: 12 additions & 12 deletions
@@ -4,12 +4,8 @@
 
 huggingface:
   # The name of the HuggingFace model to use
-  model: tiiuae/falcon-7b-instruct
-  # Other (partially tested) options:
-  # (some of which may not fit on a single GPU and will take a long time to download)
-  # - meta-llama/Llama-2-7b-chat-hf # Requires licence token
-  # - tiiuae/falcon-40b # Weights ~160GB disk size
-  # - bigscience/bloom # Weights were trending towards ~360GB disk size
+  # Use a yaml anchor to avoid duplication elsewhere
+  model: &model-name ise-uiuc/Magicoder-S-DS-6.7B
 
   # For private/gated huggingface models (e.g. Meta's Llama models)
   # you must provide your own huggingface token, for details see:
@@ -30,7 +26,7 @@ api:
   # Container image config
   image:
     repository: vllm/vllm-openai
-    version: v0.2.4
+    version: v0.2.7
   # Service config
   service:
     name: llm-backend
@@ -39,7 +35,7 @@ api:
       enabled: false
       skipAuth: false
       label: Inference API
-      iconUrl:
+      iconUrl: https://raw.githubusercontent.com/vllm-project/vllm/v0.2.7/docs/source/assets/logos/vllm-logo-only-light.png
       description: |
         The raw inference API endpoints for the deployed LLM.
   # Config for huggingface model cache volume
@@ -70,24 +66,28 @@ ui:
   # The values to be written to settings.yml for parsing as frontend app setting
   # (see example_app.py and config.py for example using pydantic-settings to configure app)
   appSettings:
-    prompt_template: ""
+    model_name: *model-name
+    model_instruction: "You are a helpful AI assistant. Please respond appropriately."
   # Container image config
   image:
     repository: ghcr.io/stackhpc/azimuth-llm-ui-base
-    version: de4324c
+    version: "984c499"
   # Service config
   service:
     name: web-app
     type: ClusterIP
     zenith:
       enabled: true
       skipAuth: false
-      label: Web Interface
+      label: Chat Interface
       iconUrl: https://raw.githubusercontent.com/gradio-app/gradio/5524e590577769b0444a5332b8d444aafb0c5c12/js/app/public/static/img/logo.svg
       description: |
         A web-based user interface for interacting with the deployed LLM.
   # The update strategy to use for the deployment
   updateStrategy:
     rollingUpdate:
       maxSurge: 25%
-      maxUnavailable: 25%
+      maxUnavailable: 25%
+
+reloader:
+  watchGlobally: false
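
The &model-name anchor means the model only needs to be set in one place: when Helm parses values.yaml, the *model-name alias under ui.appSettings resolves to the same string (the MirrorControl in azimuth-ui.schema.yaml exists to mimic this for UI-driven overrides, which do not pass through the YAML parser). A quick illustrative check with PyYAML:

    import yaml

    # The anchor and alias resolve to the same scalar when the values are parsed.
    values = yaml.safe_load("""
    huggingface:
      model: &model-name ise-uiuc/Magicoder-S-DS-6.7B
    ui:
      appSettings:
        model_name: *model-name
    """)
    assert values["ui"]["appSettings"]["model_name"] == values["huggingface"]["model"]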

chart/web-app/api_startup_check.py

Lines changed: 0 additions & 19 deletions
This file was deleted.
