Commit 943195f

Add benchmark automation tool
1 parent 12bcc9a commit 943195f


67 files changed: +4855 −28 lines
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
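
Since these patterns support negation, a later `!` rule can re-include a file matched by an earlier glob; a small illustrative sketch (these two patterns are not part of this commit):

```
# Hypothetical: exclude all markdown files except the chart README
*.md
!README.md
```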
Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
apiVersion: v2
name: inferencemodel
description: A Helm chart for InferenceModel

# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application

# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "1.16.0"
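
As the comments note, `version` tracks the chart itself under SemVer while `appVersion` tracks the deployed application; a hedged sketch of how the two fields might move on a later release (the numbers are illustrative, not part of this commit):

```yaml
version: 0.2.0        # chart templates changed, so bump per SemVer
appVersion: "1.17.0"  # the application image moved forward; quoted as recommended
```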
Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
{{/*
Common labels
*/}}
{{- define "gateway-api-inference-extension.labels" -}}
app.kubernetes.io/name: {{ include "gateway-api-inference-extension.name" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
{{- end }}

{{/*
Inference extension name
*/}}
{{- define "gateway-api-inference-extension.name" -}}
{{- $base := .Values.inferencePool.name | default "default-pool" | lower | trim | trunc 40 -}}
{{ $base }}-epp
{{- end -}}

{{/*
Selector labels
*/}}
{{- define "gateway-api-inference-extension.selectorLabels" -}}
app: {{ include "gateway-api-inference-extension.name" . }}
{{- end -}}
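
These named templates are consumed elsewhere in the chart via Helm's `include`; a minimal sketch of a template fragment using them (the fragment is illustrative, not part of this commit):

```yaml
metadata:
  name: {{ include "gateway-api-inference-extension.name" . }}
  labels:
    {{- include "gateway-api-inference-extension.labels" . | nindent 4 }}
spec:
  selector:
    matchLabels:
      {{- include "gateway-api-inference-extension.selectorLabels" . | nindent 6 }}
```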
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: inferencemodel-sample
spec:
  modelName: tweet-summary
  criticality: Critical
  poolRef:
    name: vllm-llama2-7b
  targetModels:
  - name: tweet-summary-1
    weight: 100

---
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: inferencemodel-base-model
spec:
  modelName: meta-llama/Llama-2-7b-hf
  criticality: Critical
  poolRef:
    name: vllm-llama2-7b

---
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
  name: inferencemodel-base-model-cpu
spec:
  modelName: Qwen/Qwen2.5-1.5B-Instruct
  criticality: Critical
  poolRef:
    name: vllm-llama2-7b
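
In the first sample, all traffic for `tweet-summary` resolves to the single target with weight 100; the `targetModels` weights drive weighted traffic splitting within the pool. A hedged sketch of splitting requests across two versions (the second target name is hypothetical):

```yaml
  targetModels:
  - name: tweet-summary-1
    weight: 90
  - name: tweet-summary-2  # hypothetical second fine-tuned version
    weight: 10
```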
Lines changed: 107 additions & 0 deletions
@@ -0,0 +1,107 @@
# Default values for inferencemodel.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

replicaCount: 1

image:
  repository: nginx
  pullPolicy: IfNotPresent
  # Overrides the image tag whose default is the chart appVersion.
  tag: ""

imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""

serviceAccount:
  # Specifies whether a service account should be created
  create: true
  # Automatically mount a ServiceAccount's API credentials?
  automount: true
  # Annotations to add to the service account
  annotations: {}
  # The name of the service account to use.
  # If not set and create is true, a name is generated using the fullname template
  name: ""

podAnnotations: {}
podLabels: {}

podSecurityContext: {}
  # fsGroup: 2000

securityContext: {}
  # capabilities:
  #   drop:
  #   - ALL
  # readOnlyRootFilesystem: true
  # runAsNonRoot: true
  # runAsUser: 1000

service:
  type: ClusterIP
  port: 80

ingress:
  enabled: false
  className: ""
  annotations: {}
    # kubernetes.io/ingress.class: nginx
    # kubernetes.io/tls-acme: "true"
  hosts:
    - host: chart-example.local
      paths:
        - path: /
          pathType: ImplementationSpecific
  tls: []
  #  - secretName: chart-example-tls
  #    hosts:
  #      - chart-example.local

resources: {}
  # We usually recommend not to specify default resources and to leave this as a conscious
  # choice for the user. This also increases chances charts run on environments with little
  # resources, such as Minikube. If you do want to specify resources, uncomment the following
  # lines, adjust them as necessary, and remove the curly braces after 'resources:'.
  # limits:
  #   cpu: 100m
  #   memory: 128Mi
  # requests:
  #   cpu: 100m
  #   memory: 128Mi

livenessProbe:
  httpGet:
    path: /
    port: http
readinessProbe:
  httpGet:
    path: /
    port: http

autoscaling:
  enabled: false
  minReplicas: 1
  maxReplicas: 100
  targetCPUUtilizationPercentage: 80
  # targetMemoryUtilizationPercentage: 80

# Additional volumes on the output Deployment definition.
volumes: []
# - name: foo
#   secret:
#     secretName: mysecret
#     optional: false

# Additional volumeMounts on the output Deployment definition.
volumeMounts: []
# - name: foo
#   mountPath: "/etc/foo"
#   readOnly: true

nodeSelector: {}

tolerations: []

affinity: {}
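
Any of these defaults can be overridden at install time; a minimal sketch, assuming the chart is installed from a local `./inferencemodel` directory (the release name and chart path are assumptions, not part of this commit):

```bash
# Hypothetical install overriding scaffold defaults; inferencePool.name
# feeds the name helper in the chart's _helpers.tpl
helm install benchmark-epp ./inferencemodel \
  --set replicaCount=2 \
  --set inferencePool.name=vllm-llama2-7b
```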

config/manifests/benchmark/model-server-service.yaml

Lines changed: 0 additions & 12 deletions
This file was deleted.

site-src/performance/benchmark/index.md

Lines changed: 17 additions & 15 deletions
@@ -5,30 +5,26 @@ inference extension, and a Kubernetes service as the load balancing strategy. Th
 benchmark uses the [Latency Profile Generator](https://github.com/AI-Hypercomputer/inference-benchmark) (LPG)
 tool to generate load and collect results.
 
-## Prerequisites
+## Run benchmarks manually
 
-### Deploy the inference extension and sample model server
+### Prerequisite: have an endpoint ready to serve inference traffic
 
-Follow this user guide https://gateway-api-inference-extension.sigs.k8s.io/guides/ to deploy the
-sample vLLM application, and the inference extension.
+To serve via a Gateway using the inference extension, follow this [user guide](https://gateway-api-inference-extension.sigs.k8s.io/guides/)
+to deploy the sample vLLM application and the inference extension.
 
-### [Optional] Scale the sample vLLM deployment
-
-You will more likely to see the benefits of the inference extension when there are a decent number of replicas to make the optimal routing decision.
+You are more likely to see the benefits of the inference extension when there are enough replicas for it to make optimal routing decisions, so consider scaling the sample application up:
 
 ```bash
 kubectl scale --replicas=8 -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml
 ```
 
-### Expose the model server via a k8s service
-
-As the baseline, let's also expose the vLLM deployment as a k8s service:
+To serve via a Kubernetes LoadBalancer service as a baseline comparison, expose the sample application:
 
 ```bash
 kubectl expose -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml --port=8081 --target-port=8000 --type=LoadBalancer
 ```
 
-## Run benchmark
+### Run benchmark
 
 The LPG benchmark tool works by sending traffic to the specified target IP and port and collecting results. Follow the steps below to run a single benchmark. You can deploy multiple LPG instances if you want to run benchmarks in parallel against different targets.
 
@@ -60,18 +56,24 @@ to specify what this benchmark is for. For instance, `inference-extension` or `k
 the script below will watch for that log line and then start downloading results.
 
 ```bash
-benchmark_id='my-benchmark' ./tools/benchmark/download-benchmark-results.bash
+benchmark_id='my-benchmark' ./tools/benchmark/scripts/download-benchmark-results.bash
 ```
 
 1. After the script finishes, you should see benchmark results under the `./tools/benchmark/output/default-run/my-benchmark/results/json` folder.
 
-### Tips
+#### Tips
 
-* You can specify `run_id="runX"` environment variable when running the `./download-benchmark-results.bash` script.
+* You can specify the `run_id="runX"` environment variable when running the `download-benchmark-results.bash` script.
   This is useful when you run benchmarks multiple times to get more statistically meaningful results and group them accordingly.
 * Update the `request_rates` to best suit your benchmark environment.
 
-### Advanced Benchmark Configurations
+## Run benchmarks automatically
+
+The [benchmark automation tool](https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/tools/benchmark) enables defining benchmarks via a config file and running them
+automatically. It's currently experimental; to try it, refer to its [user guide](https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/tools/benchmark).
+
+
+## Advanced Benchmark Configurations
 
 Please refer to the [LPG user guide](https://github.com/AI-Hypercomputer/inference-benchmark?tab=readme-ov-file#configuring-the-benchmark) for a detailed list of configuration knobs.
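
The `run_id` tip above pairs naturally with `benchmark_id`; a hedged sketch of grouping two repeated runs of the same benchmark (the `run_id` values are illustrative):

```bash
# Hypothetical: repeated downloads grouped under separate run folders
benchmark_id='my-benchmark' run_id='run1' ./tools/benchmark/scripts/download-benchmark-results.bash
benchmark_id='my-benchmark' run_id='run2' ./tools/benchmark/scripts/download-benchmark-results.bash
```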

tools/benchmark/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
output/
