Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion config/manifests/gateway/gke/gcp-backend-policy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ spec:
targetRef:
group: "inference.networking.x-k8s.io"
kind: InferencePool
-name: vllm-llama3-8b-instruct
+name: llama3-8b-instruct
default:
timeoutSec: 300
logging:
Expand Down
2 changes: 1 addition & 1 deletion config/manifests/gateway/gke/healthcheck.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ spec:
targetRef:
group: "inference.networking.x-k8s.io"
kind: InferencePool
-name: vllm-llama3-8b-instruct
+name: llama3-8b-instruct
default:
config:
type: HTTP
Expand Down
2 changes: 1 addition & 1 deletion config/manifests/gateway/gke/httproute.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ spec:
- backendRefs:
- group: inference.networking.x-k8s.io
kind: InferencePool
-name: vllm-llama3-8b-instruct
+name: llama3-8b-instruct
matches:
- path:
type: PathPrefix
Expand Down
6 changes: 3 additions & 3 deletions config/manifests/inferencemodel.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ spec:
modelName: food-review
criticality: Standard
poolRef:
-name: vllm-llama3-8b-instruct
+name: llama3-8b-instruct
targetModels:
- name: food-review-1
weight: 100
Expand All @@ -19,7 +19,7 @@ spec:
modelName: meta-llama/Llama-3.1-8B-Instruct
criticality: Critical
poolRef:
-name: vllm-llama3-8b-instruct
+name: llama3-8b-instruct
---
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
Expand All @@ -29,4 +29,4 @@ spec:
modelName: Qwen/Qwen2.5-1.5B-Instruct
criticality: Critical
poolRef:
-name: vllm-llama3-8b-instruct
+name: llama3-8b-instruct
20 changes: 10 additions & 10 deletions config/manifests/inferencepool-resources.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,22 @@ apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
metadata:
labels:
-name: vllm-llama3-8b-instruct
+name: llama3-8b-instruct
spec:
targetPortNumber: 8000
selector:
-app: vllm-llama3-8b-instruct
+app: vllm-llama3-8b-instruct # Change this to target a different Model Server Deployment
extensionRef:
-name: vllm-llama3-8b-instruct-epp
+name: llama3-8b-instruct-epp
---
apiVersion: v1
kind: Service
metadata:
-name: vllm-llama3-8b-instruct-epp
+name: llama3-8b-instruct-epp
namespace: default
spec:
selector:
-app: vllm-llama3-8b-instruct-epp
+app: llama3-8b-instruct-epp
ports:
- protocol: TCP
port: 9002
Expand All @@ -31,19 +31,19 @@ spec:
apiVersion: apps/v1
kind: Deployment
metadata:
-name: vllm-llama3-8b-instruct-epp
+name: llama3-8b-instruct-epp
namespace: default
labels:
-app: vllm-llama3-8b-instruct-epp
+app: llama3-8b-instruct-epp
spec:
replicas: 1
selector:
matchLabels:
-app: vllm-llama3-8b-instruct-epp
+app: llama3-8b-instruct-epp
template:
metadata:
labels:
-app: vllm-llama3-8b-instruct-epp
+app: llama3-8b-instruct-epp
spec:
# Conservatively, this timeout should mirror the longest grace period of the pods within the pool
terminationGracePeriodSeconds: 130
Expand All @@ -53,7 +53,7 @@ spec:
imagePullPolicy: Always
args:
- -poolName
-- "vllm-llama3-8b-instruct"
+- "llama3-8b-instruct"
- -v
- "4"
- --zap-encoder
Expand Down