diff --git a/Makefile b/Makefile index 44759e457..e34a1a922 100644 --- a/Makefile +++ b/Makefile @@ -105,9 +105,8 @@ vet: ## Run go vet against code. test: manifests generate fmt vet envtest ## Run tests. KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -coverprofile cover.out -# Utilize Kind or modify the e2e tests to load the image locally, enabling compatibility with other vendors. -.PHONY: test-e2e # Run the e2e tests against a Kind k8s instance that is spun up. -test-e2e: +.PHONY: test-e2e +test-e2e: ## Run end-to-end tests against an existing Kubernetes cluster with at least 3 available GPUs. go test ./test/e2e/ -v -ginkgo.v .PHONY: lint diff --git a/README.md b/README.md index 85185a2bf..a15e9542d 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,10 @@ This project is currently in development. Follow this [README](./pkg/README.md) to get the inference-extension up and running on your cluster! +## End-to-End Tests + +Follow this [README](./test/e2e/README.md) to learn more about running the inference-extension end-to-end test suite on your cluster. + ## Website Detailed documentation is available on our website: https://gateway-api-inference-extension.sigs.k8s.io/ diff --git a/go.mod b/go.mod index 2ae6913b5..42d606a8c 100644 --- a/go.mod +++ b/go.mod @@ -20,11 +20,13 @@ require ( google.golang.org/grpc v1.70.0 google.golang.org/protobuf v1.36.4 k8s.io/api v0.32.1 + k8s.io/apiextensions-apiserver v0.32.1 k8s.io/apimachinery v0.32.1 k8s.io/client-go v0.32.1 k8s.io/code-generator v0.32.1 k8s.io/component-base v0.32.1 k8s.io/klog/v2 v2.130.1 + k8s.io/utils v0.0.0-20241210054802-24370beab758 sigs.k8s.io/controller-runtime v0.20.1 sigs.k8s.io/structured-merge-diff/v4 v4.5.0 ) @@ -73,6 +75,7 @@ require ( github.com/google/gofuzz v1.2.0 // indirect github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect github.com/google/uuid v1.6.0 // indirect + github.com/gorilla/websocket v1.5.0 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect github.com/huandu/xstrings v1.3.3 // indirect github.com/imdario/mergo v0.3.11 // indirect @@ -87,9 +90,11 @@ require ( github.com/mattn/go-isatty v0.0.20 // indirect github.com/mitchellh/copystructure v1.2.0 // indirect github.com/mitchellh/reflectwalk v1.0.2 // indirect + github.com/moby/spdystream v0.5.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect github.com/pkg/errors v0.9.1 // indirect github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect @@ -99,6 +104,7 @@ require ( github.com/spf13/cobra v1.8.1 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/stoewer/go-strcase v1.3.0 // indirect + github.com/stretchr/objx v0.5.2 // indirect github.com/x448/float16 v0.8.4 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 // indirect go.opentelemetry.io/otel v1.32.0 // indirect @@ -128,11 +134,9 @@ require ( gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiextensions-apiserver v0.32.0 // indirect - k8s.io/apiserver v0.32.0 // indirect + k8s.io/apiserver v0.32.1 // indirect 
k8s.io/gengo/v2 v2.0.0-20240911193312-2b36238f13e9 // indirect k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f // indirect - k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 // indirect sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 // indirect sigs.k8s.io/controller-tools v0.14.0 // indirect sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect diff --git a/go.sum b/go.sum index d346a198a..f8584f8fe 100644 --- a/go.sum +++ b/go.sum @@ -19,6 +19,8 @@ github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 h1:JYp7IbQjafo github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a h1:idn718Q4B6AGu/h5Sxe66HYVdqdGu2l9Iebqhi/AEoA= github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= @@ -116,6 +118,8 @@ github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad/go.mod h1:vavhavw2zAx github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc= +github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 h1:bkypFPDjIYGfCYD5mRBvpqxfYX1YCS1PXdKYWi8FsN0= github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0/go.mod h1:P+Lt/0by1T8bfcF3z737NnSbmxQAppXMRziHUxPOC8k= github.com/huandu/xstrings v1.3.3 h1:/Gcsuc1x8JVbJ9/rlye4xZnVAbEkGauT8lbebqcQws4= @@ -160,6 +164,8 @@ github.com/mitchellh/copystructure v1.2.0/go.mod h1:qLl+cE2AmVv+CoeAwDPye/v+N2HK github.com/mitchellh/reflectwalk v1.0.0/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ= github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= +github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU= +github.com/moby/spdystream v0.5.0/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -167,6 +173,8 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod 
h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= +github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= @@ -207,6 +215,8 @@ github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8w github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= @@ -341,12 +351,12 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= k8s.io/api v0.32.1 h1:f562zw9cy+GvXzXf0CKlVQ7yHJVYzLfL6JAS4kOAaOc= k8s.io/api v0.32.1/go.mod h1:/Yi/BqkuueW1BgpoePYBRdDYfjPF5sgTr5+YqDZra5k= -k8s.io/apiextensions-apiserver v0.32.0 h1:S0Xlqt51qzzqjKPxfgX1xh4HBZE+p8KKBq+k2SWNOE0= -k8s.io/apiextensions-apiserver v0.32.0/go.mod h1:86hblMvN5yxMvZrZFX2OhIHAuFIMJIZ19bTvzkP+Fmw= +k8s.io/apiextensions-apiserver v0.32.1 h1:hjkALhRUeCariC8DiVmb5jj0VjIc1N0DREP32+6UXZw= +k8s.io/apiextensions-apiserver v0.32.1/go.mod h1:sxWIGuGiYov7Io1fAS2X06NjMIk5CbRHc2StSmbaQto= k8s.io/apimachinery v0.32.1 h1:683ENpaCBjma4CYqsmZyhEzrGz6cjn1MY/X2jB2hkZs= k8s.io/apimachinery v0.32.1/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE= -k8s.io/apiserver v0.32.0 h1:VJ89ZvQZ8p1sLeiWdRJpRD6oLozNZD2+qVSLi+ft5Qs= -k8s.io/apiserver v0.32.0/go.mod h1:HFh+dM1/BE/Hm4bS4nTXHVfN6Z6tFIZPi649n83b4Ag= +k8s.io/apiserver v0.32.1 h1:oo0OozRos66WFq87Zc5tclUX2r0mymoVHRq8JmR7Aak= +k8s.io/apiserver v0.32.1/go.mod h1:UcB9tWjBY7aryeI5zAgzVJB/6k7E97bkr1RgqDz0jPw= k8s.io/client-go v0.32.1 h1:otM0AxdhdBIaQh7l1Q0jQpmo7WOFIk5FFa4bg6YMdUU= k8s.io/client-go v0.32.1/go.mod h1:aTTKZY7MdxUaJ/KiUs8D+GssR9zJZi77ZqtzcGXIiDg= k8s.io/code-generator v0.32.1 h1:4lw1kFNDuFYXquTkB7Sl5EwPMUP2yyW9hh6BnFfRZFY= @@ -359,8 +369,8 @@ k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f h1:GA7//TjRY9yWGy1poLzYYJJ4JRdzg3+O6e8I+e+8T5Y= k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f/go.mod h1:R/HEjbvWI0qdfb8viZUeVZm0X6IZnxAydC7YU42CMw4= -k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 h1:M3sRQVHv7vB20Xc2ybTt7ODCeFj6JSWYFzOFnYeS6Ro= -k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/utils v0.0.0-20241210054802-24370beab758 h1:sdbE21q2nlQtFh65saZY+rRM6x6aJJI8IUa1AmH/qa0= +k8s.io/utils v0.0.0-20241210054802-24370beab758/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 
h1:CPT0ExVicCzcpeN4baWEV2ko2Z/AsiZgEdwgcfwLgMo= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= sigs.k8s.io/controller-runtime v0.20.1 h1:JbGMAG/X94NeM3xvjenVUaBjy6Ui4Ogd/J5ZtjZnHaE= diff --git a/pkg/README.md b/pkg/README.md index ee82aab2d..04ebfde28 100644 --- a/pkg/README.md +++ b/pkg/README.md @@ -4,31 +4,34 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ### Requirements - Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher - - A cluster that has built-in support for `ServiceType=LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running) - - For example, with Kind, you can follow these steps: https://kind.sigs.k8s.io/docs/user/loadbalancer + - A cluster with: + - Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind, + you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer). + - 3 GPUs to run the sample model server. Adjust the number of replicas in `./manifests/vllm/deployment.yaml` as needed. ### Steps -1. **Deploy Sample vLLM Application** +1. **Deploy Sample Model Server** - Create a Hugging Face secret to download the model [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf). Ensure that the token grants access to this model. + Create a Hugging Face secret to download the model [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf). Ensure that the token grants access to this model. Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway. ```bash kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Llama2 - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/vllm/vllm-lora-deployment.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/vllm/deployment.yaml ``` -1. **Install the CRDs into the cluster:** +1. **Install the Inference Extension CRDs:** ```sh kubectl apply -k https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd ``` -1. **Deploy InferenceModel and InferencePool** +1. **Deploy InferenceModel** - Deploy a sample InferenceModel and InferencePool configuration based on the vLLM deployments mentioned above. + Deploy the sample InferenceModel which is configured to load balance traffic between the `tweet-summary-0` and `tweet-summary-1` + [LoRA adapters](https://docs.vllm.ai/en/latest/features/lora.html) of the sample model server. ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/inferencepool-with-model.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/inferencemodel.yaml ``` 1. **Update Envoy Gateway Config to enable Patch Policy** @@ -46,11 +49,15 @@ This quickstart guide is intended for engineers familiar with k8s and model serv kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/gateway.yaml ``` > **_NOTE:_** This file couples together the gateway infra and the HTTPRoute infra for a convenient, quick startup. 
Creating additional/different InferencePools on the same gateway will require an additional set of: `Backend`, `HTTPRoute`, the resources included in the `./manifests/gateway/ext-proc.yaml` file, and an additional `./manifests/gateway/patch_policy.yaml` file. ***Should you choose to experiment, familiarity with xDS and Envoy are very useful.*** - - + Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status: + ```bash + $ kubectl get gateway inference-gateway + NAME CLASS ADDRESS PROGRAMMED AGE + inference-gateway inference-gateway True 22s + ``` -1. **Deploy Ext-Proc** +1. **Deploy the Inference Extension and InferencePool** ```bash kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/ext_proc.yaml diff --git a/pkg/manifests/ext_proc.yaml b/pkg/manifests/ext_proc.yaml index 3263422f7..4eb0415a7 100644 --- a/pkg/manifests/ext_proc.yaml +++ b/pkg/manifests/ext_proc.yaml @@ -40,6 +40,16 @@ roleRef: kind: ClusterRole name: pod-read --- +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: InferencePool +metadata: + labels: + name: vllm-llama2-7b-pool +spec: + targetPortNumber: 8000 + selector: + app: vllm-llama2-7b-pool +--- apiVersion: apps/v1 kind: Deployment metadata: diff --git a/pkg/manifests/inferencepool-with-model.yaml b/pkg/manifests/inferencemodel.yaml similarity index 72% rename from pkg/manifests/inferencepool-with-model.yaml rename to pkg/manifests/inferencemodel.yaml index b5980e2ea..0085a89da 100644 --- a/pkg/manifests/inferencepool-with-model.yaml +++ b/pkg/manifests/inferencemodel.yaml @@ -1,14 +1,4 @@ apiVersion: inference.networking.x-k8s.io/v1alpha1 -kind: InferencePool -metadata: - labels: - name: vllm-llama2-7b-pool -spec: - targetPortNumber: 8000 - selector: - app: vllm-llama2-7b-pool ---- -apiVersion: inference.networking.x-k8s.io/v1alpha1 kind: InferenceModel metadata: labels: diff --git a/pkg/manifests/vllm/vllm-lora-deployment.yaml b/pkg/manifests/vllm/deployment.yaml similarity index 99% rename from pkg/manifests/vllm/vllm-lora-deployment.yaml rename to pkg/manifests/vllm/deployment.yaml index a453eb7ee..30f6f6711 100644 --- a/pkg/manifests/vllm/vllm-lora-deployment.yaml +++ b/pkg/manifests/vllm/deployment.yaml @@ -10,14 +10,11 @@ spec: port: 8000 targetPort: 8000 type: ClusterIP - --- - apiVersion: apps/v1 kind: Deployment metadata: name: vllm-llama2-7b-pool - namespace: default spec: replicas: 3 selector: diff --git a/test/e2e/README.md b/test/e2e/README.md new file mode 100644 index 000000000..584d89143 --- /dev/null +++ b/test/e2e/README.md @@ -0,0 +1,38 @@ +# End-to-End Tests + +This document provides instructions on how to run the end-to-end tests. + +## Overview + +The end-to-end tests are designed to validate end-to-end Gateway API Inference Extension functionality. These tests are executed against a Kubernetes cluster and use the Ginkgo testing framework to ensure the extension behaves as expected. + +## Prerequisites + +- [Go](https://golang.org/doc/install) installed on your machine. +- [Make](https://www.gnu.org/software/make/manual/make.html) installed to run the end-to-end test target. +- A Hugging Face Hub token with access to the [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) model. + +## Running the End-to-End Tests + +Follow these steps to run the end-to-end tests: + +1. 
**Clone the Repository**: Clone the `gateway-api-inference-extension` repository: + + ```sh + git clone https://github.com/kubernetes-sigs/gateway-api-inference-extension.git && cd gateway-api-inference-extension + ``` + +1. **Export Your Hugging Face Hub Token**: The token is required to run the test model server: + + ```sh + export HF_TOKEN= + ``` + +1. **Run the Tests**: Run the `test-e2e` target: + + ```sh + make test-e2e + ``` + + The test suite prints details for each step. Note that the `vllm-llama2-7b-pool` model server deployment + may take several minutes to report an `Available=True` status due to the time required for bootstrapping. diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index 2da01a9c8..7449d5f61 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -17,16 +17,339 @@ limitations under the License. package e2e import ( + "context" "fmt" + "os" + "strings" "testing" + "time" - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" + "github.com/onsi/ginkgo/v2" + "github.com/onsi/gomega" + infextv1a1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + testutils "inference.networking.x-k8s.io/gateway-api-inference-extension/test/utils" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + apiextv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/serializer" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/config" ) -// Run e2e tests using the Ginkgo runner. -func TestE2E(t *testing.T) { - RegisterFailHandler(Fail) - _, _ = fmt.Fprintf(GinkgoWriter, "Starting api suite\n") - RunSpecs(t, "e2e suite") +const ( + // defaultExistsTimeout is the default timeout for a resource to exist in the api server. + defaultExistsTimeout = 30 * time.Second + // defaultReadyTimeout is the default timeout for a resource to report a ready state. + defaultReadyTimeout = 3 * time.Minute + // defaultModelReadyTimeout is the default timeout for the model server deployment to report a ready state. + defaultModelReadyTimeout = 10 * time.Minute + // defaultInterval is the default interval to check if a resource exists or reports ready conditions. + defaultInterval = time.Millisecond * 250 + // nsName is the name of the Namespace used for tests. + // TODO [danehans]: Must be "default" until https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/227 is fixed + nsName = "default" + // modelServerName is the name of the model server test resources. + modelServerName = "vllm-llama2-7b-pool" + // modelName is the test model name. + modelName = "tweet-summary" + // envoyName is the name of the envoy proxy test resources. + envoyName = "envoy" + // envoyPort is the listener port number of the test envoy proxy. + envoyPort = "8081" + // inferExtName is the name of the inference extension test resources. + inferExtName = "inference-gateway-ext-proc" + // clientManifest is the manifest for the client test resources. + clientManifest = "../testdata/client.yaml" + // modelServerManifest is the manifest for the model server test resources. + modelServerManifest = "../../pkg/manifests/vllm/deployment.yaml" + // modelServerSecretManifest is the manifest for the model server secret resource.
+ modelServerSecretManifest = "../testdata/model-secret.yaml" + // inferPoolManifest is the manifest for the inference pool CRD. + inferPoolManifest = "../../config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml" + // inferModelManifest is the manifest for the inference model CRD. + inferModelManifest = "../../config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml" + // inferExtManifest is the manifest for the inference extension test resources. + inferExtManifest = "../../pkg/manifests/ext_proc.yaml" + // envoyManifest is the manifest for the envoy proxy test resources. + envoyManifest = "../testdata/envoy.yaml" +) + +var ( + ctx context.Context + cli client.Client + // Required for exec'ing in curl pod + kubeCli *kubernetes.Clientset + scheme = runtime.NewScheme() + cfg = config.GetConfigOrDie() +) + +func TestAPIs(t *testing.T) { + gomega.RegisterFailHandler(ginkgo.Fail) + ginkgo.RunSpecs(t, + "End To End Test Suite", + ) +} + +var _ = ginkgo.BeforeSuite(func() { + ginkgo.By("Setting up the test suite") + setupSuite() + + ginkgo.By("Creating test infrastructure") + setupInfra() +}) + +func setupInfra() { + crds := map[string]string{ + "inferencepools.inference.networking.x-k8s.io": inferPoolManifest, + "inferencemodels.inference.networking.x-k8s.io": inferModelManifest, + } + createCRDs(cli, crds) + createInferExt(cli, inferExtManifest) + createClient(cli, clientManifest) + createEnvoy(cli, envoyManifest) + // Run this step last, as it requires additional time for the model server to become ready. + createModelServer(cli, modelServerSecretManifest, modelServerManifest) +} + +var _ = ginkgo.AfterSuite(func() { + ginkgo.By("Performing global cleanup") + cleanupResources() +}) + +// setupSuite initializes the test suite by setting up the Kubernetes client, +// loading required API schemes, and validating configuration. +func setupSuite() { + ctx = context.Background() + gomega.ExpectWithOffset(1, cfg).NotTo(gomega.BeNil()) + + err := clientgoscheme.AddToScheme(scheme) + gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred()) + + err = apiextv1.AddToScheme(scheme) + gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred()) + + err = infextv1a1.AddToScheme(scheme) + gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred()) + + cli, err = client.New(cfg, client.Options{Scheme: scheme}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(cli).NotTo(gomega.BeNil()) + + kubeCli, err = kubernetes.NewForConfig(cfg) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) +} + +func cleanupResources() { + gomega.Expect(testutils.DeleteClusterResources(ctx, cli)).To(gomega.Succeed()) + gomega.Expect(testutils.DeleteNamespacedResources(ctx, cli, nsName)).To(gomega.Succeed()) +} + +func cleanupInferModelResources() { + gomega.Expect(testutils.DeleteInferenceModelResources(ctx, cli, nsName)).To(gomega.Succeed()) +} + +func getTimeout(key string, fallback time.Duration) time.Duration { + if value, ok := os.LookupEnv(key); ok { + if parsed, err := time.ParseDuration(value); err == nil { + return parsed + } + } + return fallback +} + +var ( + existsTimeout = getTimeout("EXISTS_TIMEOUT", defaultExistsTimeout) + readyTimeout = getTimeout("READY_TIMEOUT", defaultReadyTimeout) + modelReadyTimeout = getTimeout("MODEL_READY_TIMEOUT", defaultModelReadyTimeout) + interval = defaultInterval +) + +// namespaceExists ensures that a specified namespace exists and is ready for use. 
+func namespaceExists(k8sClient client.Client, ns string) { + ginkgo.By("Ensuring namespace exists: " + ns) + testutils.EventuallyExists(ctx, func() error { + return k8sClient.Get(ctx, types.NamespacedName{Name: ns}, &corev1.Namespace{}) + }, existsTimeout, interval) +} + +// createCRDs creates the Inference Extension CRDs used for testing. +func createCRDs(k8sClient client.Client, crds map[string]string) { + for name, path := range crds { + ginkgo.By("Creating CRD resource from manifest: " + path) + applyYAMLFile(k8sClient, path) + + // Wait for the CRD to exist. + crd := &apiextv1.CustomResourceDefinition{} + testutils.EventuallyExists(ctx, func() error { + return k8sClient.Get(ctx, types.NamespacedName{Name: name}, crd) + }, existsTimeout, interval) + + // Wait for the CRD to be established. + testutils.CRDEstablished(ctx, k8sClient, crd, readyTimeout, interval) + } +} + +// createClient creates the client pod used for testing from the given filePath. +func createClient(k8sClient client.Client, filePath string) { + ginkgo.By("Creating client resources from manifest: " + filePath) + applyYAMLFile(k8sClient, filePath) + + // Wait for the pod to exist. + pod := &corev1.Pod{} + testutils.EventuallyExists(ctx, func() error { + return k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: "curl"}, pod) + }, existsTimeout, interval) + + // Wait for the pod to be ready. + testutils.PodReady(ctx, k8sClient, pod, readyTimeout, interval) +} + +// createModelServer creates the model server resources used for testing from the given filePaths. +func createModelServer(k8sClient client.Client, secretPath, deployPath string) { + ginkgo.By("Ensuring the HF_TOKEN environment variable is set") + token := os.Getenv("HF_TOKEN") + gomega.Expect(token).NotTo(gomega.BeEmpty(), "HF_TOKEN is not set") + + inManifests := readYaml(secretPath) + ginkgo.By("Replacing placeholder secret data with HF_TOKEN environment variable") + outManifests := []string{} + for _, m := range inManifests { + outManifests = append(outManifests, strings.Replace(m, "$HF_TOKEN", token, 1)) + } + + ginkgo.By("Creating model server secret resource from manifest: " + secretPath) + createObjsFromYaml(k8sClient, outManifests) + + // Wait for the secret to exist before proceeding with test. + testutils.EventuallyExists(ctx, func() error { + return k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: "hf-token"}, &corev1.Secret{}) + }, existsTimeout, interval) + + ginkgo.By("Creating model server resources from manifest: " + deployPath) + applyYAMLFile(k8sClient, deployPath) + + // Wait for the deployment to exist. + deploy := &appsv1.Deployment{} + testutils.EventuallyExists(ctx, func() error { + return k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: modelServerName}, deploy) + }, existsTimeout, interval) + + // Wait for the deployment to be available. + testutils.DeploymentAvailable(ctx, k8sClient, deploy, modelReadyTimeout, interval) + + // Wait for the service to exist. + testutils.EventuallyExists(ctx, func() error { + return k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: modelServerName}, &corev1.Service{}) + }, existsTimeout, interval) +} + +// createEnvoy creates the envoy proxy resources used for testing from the given filePath. +func createEnvoy(k8sClient client.Client, filePath string) { + ginkgo.By("Creating envoy proxy resources from manifest: " + filePath) + applyYAMLFile(k8sClient, filePath) + + // Wait for the configmap to exist before proceeding with test.
+ cfgMap := &corev1.ConfigMap{} + testutils.EventuallyExists(ctx, func() error { + return k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: envoyName}, cfgMap) + }, existsTimeout, interval) + + // Wait for the deployment to exist. + deploy := &appsv1.Deployment{} + testutils.EventuallyExists(ctx, func() error { + return k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: envoyName}, deploy) + }, existsTimeout, interval) + + // Wait for the deployment to be available. + testutils.DeploymentAvailable(ctx, k8sClient, deploy, readyTimeout, interval) + + // Wait for the service to exist. + testutils.EventuallyExists(ctx, func() error { + return k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: envoyName}, &corev1.Service{}) + }, existsTimeout, interval) +} + +// createInferExt creates the inference extension resources used for testing from the given filePath. +func createInferExt(k8sClient client.Client, filePath string) { + ginkgo.By("Creating inference extension resources from manifest: " + filePath) + applyYAMLFile(k8sClient, filePath) + + // Wait for the clusterrole to exist. + testutils.EventuallyExists(ctx, func() error { + return k8sClient.Get(ctx, types.NamespacedName{Name: "pod-read"}, &rbacv1.ClusterRole{}) + }, existsTimeout, interval) + + // Wait for the clusterrolebinding to exist. + testutils.EventuallyExists(ctx, func() error { + return k8sClient.Get(ctx, types.NamespacedName{Name: "pod-read-binding"}, &rbacv1.ClusterRoleBinding{}) + }, existsTimeout, interval) + + // Wait for the deployment to exist. + deploy := &appsv1.Deployment{} + testutils.EventuallyExists(ctx, func() error { + return k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: inferExtName}, deploy) + }, existsTimeout, interval) + + // Wait for the deployment to be available. + testutils.DeploymentAvailable(ctx, k8sClient, deploy, modelReadyTimeout, interval) + + // Wait for the service to exist. + testutils.EventuallyExists(ctx, func() error { + return k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: inferExtName}, &corev1.Service{}) + }, existsTimeout, interval) +} + +// applyYAMLFile reads a file containing YAML (possibly multiple docs) +// and applies each object to the cluster. 
+func applyYAMLFile(k8sClient client.Client, filePath string) { + // Create the resources from the manifest file + createObjsFromYaml(k8sClient, readYaml(filePath)) +} + +func readYaml(filePath string) []string { + ginkgo.By("Reading YAML file: " + filePath) + yamlBytes, err := os.ReadFile(filePath) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + // Split multiple docs, if needed + return strings.Split(string(yamlBytes), "\n---") +} + +func createObjsFromYaml(k8sClient client.Client, docs []string) { + // For each doc, decode and create + decoder := serializer.NewCodecFactory(scheme).UniversalDeserializer() + for _, doc := range docs { + trimmed := strings.TrimSpace(doc) + if trimmed == "" { + continue + } + // Decode into a runtime.Object + obj, gvk, decodeErr := decoder.Decode([]byte(trimmed), nil, nil) + gomega.Expect(decodeErr).NotTo(gomega.HaveOccurred(), + "Failed to decode YAML document to a Kubernetes object") + + ginkgo.By(fmt.Sprintf("Decoded GVK: %s", gvk)) + + unstrObj, ok := obj.(*unstructured.Unstructured) + if !ok { + // Fallback if it's a typed object + unstrObj = &unstructured.Unstructured{} + // Convert typed to unstructured + err := scheme.Convert(obj, unstrObj, nil) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + unstrObj.SetNamespace(nsName) + + // Create the object + err := k8sClient.Create(ctx, unstrObj) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), + "Failed to create object from YAML") + } } diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index 5da114779..801368a22 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -18,104 +18,103 @@ package e2e import ( "fmt" - "os/exec" - "time" - - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" - "inference.networking.x-k8s.io/gateway-api-inference-extension/test/utils" + "strings" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/onsi/ginkgo/v2" + "github.com/onsi/gomega" + infextv1a1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + testutils "inference.networking.x-k8s.io/gateway-api-inference-extension/test/utils" + "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/ptr" ) -const namespace = "api-system" - -var _ = Describe("controller", Ordered, func() { - BeforeAll(func() { - By("installing prometheus operator") - Expect(utils.InstallPrometheusOperator()).To(Succeed()) - - By("installing the cert-manager") - Expect(utils.InstallCertManager()).To(Succeed()) - - By("creating manager namespace") - cmd := exec.Command("kubectl", "create", "ns", namespace) - _, _ = utils.Run(cmd) +var _ = ginkgo.Describe("InferencePool", func() { + ginkgo.BeforeEach(func() { + ginkgo.By("Waiting for the namespace to exist.") + namespaceExists(cli, nsName) }) - AfterAll(func() { - By("uninstalling the Prometheus manager bundle") - utils.UninstallPrometheusOperator() - - By("uninstalling the cert-manager bundle") - utils.UninstallCertManager() - - By("removing manager namespace") - cmd := exec.Command("kubectl", "delete", "ns", namespace) - _, _ = utils.Run(cmd) + ginkgo.AfterEach(func() { + ginkgo.By("Deleting the InferenceModel test resource.") + cleanupInferModelResources() }) - Context("Operator", func() { - It("should run successfully", func() { - var controllerPodName string - var err error - - // projectimage stores the name of the image used in the example - var projectimage = "example.com/api:v0.0.1" - - By("building the manager(Operator) image") - cmd := exec.Command("make", "docker-build", "IMG=%s"+projectimage) - _, 
err = utils.Run(cmd) - ExpectWithOffset(1, err).NotTo(HaveOccurred()) - - By("loading the manager(Operator) image on Kind") - err = utils.LoadImageToKindClusterWithName(projectimage) - ExpectWithOffset(1, err).NotTo(HaveOccurred()) - - By("installing CRDs") - cmd = exec.Command("make", "install") - _, err = utils.Run(cmd) - ExpectWithOffset(1, err).NotTo(HaveOccurred()) - - By("deploying the controller-manager") - cmd = exec.Command("make", "deploy", "IMG=%s"+projectimage) - _, err = utils.Run(cmd) - ExpectWithOffset(1, err).NotTo(HaveOccurred()) - - By("validating that the controller-manager pod is running as expected") - verifyControllerUp := func() error { - // Get pod name + ginkgo.When("The Inference Extension is running", func() { + ginkgo.It("Should route traffic to target model servers", func() { + ginkgo.By("Creating an InferenceModel resource") + infModel := newInferenceModel(nsName) + gomega.Expect(cli.Create(ctx, infModel)).To(gomega.Succeed()) + + ginkgo.By("Ensuring the InferenceModel resource exists in the namespace") + gomega.Eventually(func() error { + err := cli.Get(ctx, types.NamespacedName{Namespace: infModel.Namespace, Name: infModel.Name}, infModel) + if err != nil { + return err + } + return nil + }, existsTimeout, interval).Should(gomega.Succeed()) - cmd = exec.Command("kubectl", "get", - "pods", "-l", "control-plane=controller-manager", - "-o", "go-template={{ range .items }}"+ - "{{ if not .metadata.deletionTimestamp }}"+ - "{{ .metadata.name }}"+ - "{{ \"\\n\" }}{{ end }}{{ end }}", - "-n", namespace, - ) + ginkgo.By("Verifying connectivity through the inference extension") + curlCmd := getCurlCommand(envoyName, nsName, envoyPort, modelName) - podOutput, err := utils.Run(cmd) - ExpectWithOffset(2, err).NotTo(HaveOccurred()) - podNames := utils.GetNonEmptyLines(string(podOutput)) - if len(podNames) != 1 { - return fmt.Errorf("expect 1 controller pods running, but got %d", len(podNames)) + // Ensure the expected responses include the inferencemodel target model names. + var expected []string + for _, m := range infModel.Spec.TargetModels { + expected = append(expected, m.Name) + } + gomega.Eventually(func() error { + actual := []string{} + resp, err := testutils.ExecCommandInPod(ctx, cfg, scheme, kubeCli, nsName, "curl", "curl", curlCmd) + if err != nil || !strings.Contains(resp, "200 OK") { + return fmt.Errorf("failed to get a 200 OK response (error: %v): %s", err, resp) } - controllerPodName = podNames[0] - ExpectWithOffset(2, controllerPodName).Should(ContainSubstring("controller-manager")) - - // Validate pod status - cmd = exec.Command("kubectl", "get", - "pods", controllerPodName, "-o", "jsonpath={.status.phase}", - "-n", namespace, - ) - status, err := utils.Run(cmd) - ExpectWithOffset(2, err).NotTo(HaveOccurred()) - if string(status) != "Running" { - return fmt.Errorf("controller pod in %s status", status) + for _, m := range expected { + if strings.Contains(resp, m) { + actual = append(actual, m) + } + } + // Compare expected and actual models in responses, ignoring order. + if !cmp.Equal(actual, expected, cmpopts.SortSlices(func(a, b string) bool { return a < b })) { + return fmt.Errorf("expected responses for target models %v, got %v", expected, actual) } return nil - } - EventuallyWithOffset(1, verifyControllerUp, time.Minute, time.Second).Should(Succeed()) - + }, existsTimeout, interval).Should(gomega.Succeed()) }) }) }) + +// newInferenceModel creates an InferenceModel in the given namespace for testing.
+func newInferenceModel(ns string) *infextv1a1.InferenceModel { + targets := []infextv1a1.TargetModel{ + { + Name: modelName + "-0", + Weight: ptr.To(int32(50)), + }, + { + Name: modelName + "-1", + Weight: ptr.To(int32(50)), + }, + } + return testutils.MakeModelWrapper("inferencemodel-sample", ns). + SetCriticality(infextv1a1.Critical). + SetModelName(modelName). + SetPoolRef(modelServerName). + SetTargetModels(targets). + Obj() +} + +// getCurlCommand returns the command, as a slice of strings, for curl'ing +// the test model server at the given name, namespace, port, and model name. +func getCurlCommand(name, ns, port, model string) []string { + return []string{ + "curl", + "-i", + fmt.Sprintf("%s.%s.svc:%s/v1/completions", name, ns, port), + "-H", + "Content-Type: application/json", + "-d", + fmt.Sprintf(`{"model": "%s", "prompt": "Write as if you were a critic: San Francisco", "max_tokens": 100, "temperature": 0}`, model), + } +} diff --git a/test/testdata/client.yaml b/test/testdata/client.yaml new file mode 100644 index 000000000..f2559189d --- /dev/null +++ b/test/testdata/client.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: Pod +metadata: + annotations: + labels: + app: curl + name: curl + namespace: inf-ext-e2e +spec: + containers: + - command: + - tail + - -f + - /dev/null + image: curlimages/curl:7.83.1 + imagePullPolicy: IfNotPresent + name: curl + restartPolicy: Never + schedulerName: default-scheduler diff --git a/test/testdata/envoy.yaml b/test/testdata/envoy.yaml new file mode 100644 index 000000000..386d25875 --- /dev/null +++ b/test/testdata/envoy.yaml @@ -0,0 +1,288 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: envoy + namespace: inf-ext-e2e + labels: + app: envoy +data: + envoy.yaml: | + admin: + address: + socket_address: + address: 127.0.0.1 + port_value: 19000 + access_log: + - name: envoy.access_loggers.file + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog + path: /dev/null + static_resources: + listeners: + - name: envoy-proxy-ready-0.0.0.0-19001 + address: + socket_address: + address: 0.0.0.0 + port_value: 19001 + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: envoy-ready-http + route_config: + name: local_route + virtual_hosts: + - name: prometheus_stats + domains: ["*"] + routes: + - match: + prefix: "/stats/prometheus" + route: + cluster: "prometheus_stats" + http_filters: + - name: envoy.filters.http.health_check + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.health_check.v3.HealthCheck + pass_through_mode: false + headers: + - name: ":path" + string_match: + exact: "/ready" + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + - name: vllm + address: + socket_address: + address: 0.0.0.0 + port_value: 8081 + per_connection_buffer_limit_bytes: 32768 + access_log: + - name: envoy.access_loggers.file + filter: + response_flag_filter: + flags: ["NR"] + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog + path: /dev/stdout + log_format: + text_format_source: + inline_string: "{\"start_time\":\"%START_TIME%\",\"method\":\"%REQ(:METHOD)%\",...}\n" + filter_chains: + - name: vllm + filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type":
type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: http-8081 + route_config: + name: vllm + virtual_hosts: + - name: vllm-default + domains: ["*"] + routes: + - match: + prefix: "/" + route: + cluster: original_destination_cluster + timeout: 86400s + idle_timeout: 86400s + upgrade_configs: + - upgrade_type: websocket + typed_per_filter_config: + envoy.filters.http.ext_proc: + "@type": type.googleapis.com/envoy.config.route.v3.FilterConfig + config: {} + http_filters: + - name: envoy.filters.http.ext_proc + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.ext_proc.v3.ExternalProcessor + grpc_service: + envoy_grpc: + cluster_name: ext_proc + authority: inference-gateway-ext-proc.inf-ext-e2e:9002 + timeout: 10s + processing_mode: + request_header_mode: SEND + response_header_mode: SKIP + request_body_mode: BUFFERED + request_trailer_mode: SKIP + response_trailer_mode: SKIP + message_timeout: 1000s + # Mark it as disabled if needed for troubleshooting: + # disabled: true + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + suppress_envoy_headers: true + http2_protocol_options: + max_concurrent_streams: 100 + initial_stream_window_size: 65536 + initial_connection_window_size: 1048576 + use_remote_address: true + normalize_path: true + merge_slashes: true + server_header_transformation: PASS_THROUGH + common_http_protocol_options: + headers_with_underscores_action: REJECT_REQUEST + path_with_escaped_slashes_action: UNESCAPE_AND_REDIRECT + access_log: + - name: envoy.access_loggers.file + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog + path: /dev/stdout + log_format: + text_format_source: + inline_string: "{\"start_time\":\"%START_TIME%\",\"method\":\"%REQ(:METHOD)%\",...}\n" + clusters: + - name: prometheus_stats + type: STATIC + connect_timeout: 0.250s + load_assignment: + cluster_name: prometheus_stats + endpoints: + - lb_endpoints: + - endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 19000 + - name: original_destination_cluster + type: ORIGINAL_DST + connect_timeout: 1000s + lb_policy: CLUSTER_PROVIDED + circuit_breakers: + thresholds: + - max_connections: 40000 + max_pending_requests: 40000 + max_requests: 40000 + original_dst_lb_config: + use_http_header: true + http_header_name: target-pod + - name: ext_proc + type: STRICT_DNS + connect_timeout: 86400s + lb_policy: LEAST_REQUEST + circuit_breakers: + thresholds: + - max_connections: 40000 + max_pending_requests: 40000 + max_requests: 40000 + max_retries: 1024 + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http2_protocol_options: + initial_stream_window_size: 65536 + initial_connection_window_size: 1048576 + load_assignment: + cluster_name: ext_proc + endpoints: + - locality: + region: ext_proc/e2e/0 + lb_endpoints: + - endpoint: + address: + socket_address: + address: inference-gateway-ext-proc.inf-ext-e2e + port_value: 9002 + health_status: HEALTHY + load_balancing_weight: 1 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: envoy + namespace: inf-ext-e2e + labels: + app: envoy +spec: + replicas: 1 + selector: + matchLabels: + app: envoy + template: + metadata: + labels: + app: envoy + annotations: + prometheus.io/path: 
/stats/prometheus + prometheus.io/port: "19001" + prometheus.io/scrape: "true" + spec: + containers: + - name: envoy + image: docker.io/envoyproxy/envoy:distroless-v1.32.2 + args: + - "--service-cluster" + - "default/inference-gateway" + - "--service-node" + - "$(ENVOY_POD_NAME)" + - "--log-level" + - "debug" + - "--cpuset-threads" + - "--drain-strategy" + - "immediate" + - "--drain-time-s" + - "60" + - "-c" + - "/etc/envoy/envoy.yaml" + command: + - envoy + env: + - name: ENVOY_NS_NAME + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: ENVOY_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + ports: + - containerPort: 8081 + name: http-8081 + - containerPort: 19001 + name: metrics + readinessProbe: + failureThreshold: 1 + httpGet: + path: /ready + port: 19001 + scheme: HTTP + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + resources: + requests: + cpu: 100m + memory: 512Mi + volumeMounts: + - name: config + mountPath: /etc/envoy + readOnly: true + volumes: + - name: config + configMap: + name: envoy + items: + - key: envoy.yaml + path: envoy.yaml +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: envoy + name: envoy + namespace: inf-ext-e2e +spec: + ports: + - name: http-8081 + port: 8081 + protocol: TCP + targetPort: 8081 + selector: + app: envoy + type: ClusterIP diff --git a/test/testdata/model-secret.yaml b/test/testdata/model-secret.yaml new file mode 100644 index 000000000..01c2556c3 --- /dev/null +++ b/test/testdata/model-secret.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Secret +metadata: + name: hf-token + labels: + app: vllm +stringData: + token: $HF_TOKEN diff --git a/test/utils/utils.go b/test/utils/utils.go index 1f96382e3..337599c34 100644 --- a/test/utils/utils.go +++ b/test/utils/utils.go @@ -17,124 +17,271 @@ limitations under the License. package utils import ( + "bytes" + "context" "fmt" - "os" - "os/exec" - "strings" + "time" - . "github.com/onsi/ginkgo/v2" //nolint:golint,revive + "github.com/onsi/ginkgo/v2" + "github.com/onsi/gomega" + infextv1a1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + apiextv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/remotecommand" + "sigs.k8s.io/controller-runtime/pkg/client" ) -const ( - prometheusOperatorVersion = "v0.72.0" - prometheusOperatorURL = "https://github.com/prometheus-operator/prometheus-operator/" + - "releases/download/%s/bundle.yaml" - - certmanagerVersion = "v1.14.4" - certmanagerURLTmpl = "https://github.com/jetstack/cert-manager/releases/download/%s/cert-manager.yaml" -) - -func warnError(err error) { - _, _ = fmt.Fprintf(GinkgoWriter, "warning: %v\n", err) +// DeleteClusterResources deletes all cluster-scoped objects the tests typically create. 
+func DeleteClusterResources(ctx context.Context, cli client.Client) error { + binding := &rbacv1.ClusterRoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-read-binding", + }, + } + err := cli.Delete(ctx, binding, client.PropagationPolicy(metav1.DeletePropagationForeground)) + if err != nil && !apierrors.IsNotFound(err) { + return err + } + role := &rbacv1.ClusterRole{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-read", + }, + } + err = cli.Delete(ctx, role, client.PropagationPolicy(metav1.DeletePropagationForeground)) + if err != nil && !apierrors.IsNotFound(err) { + return err + } + model := &apiextv1.CustomResourceDefinition{ + ObjectMeta: metav1.ObjectMeta{ + Name: "inferencemodels.inference.networking.x-k8s.io", + }, + } + err = cli.Delete(ctx, model, client.PropagationPolicy(metav1.DeletePropagationForeground)) + if err != nil && !apierrors.IsNotFound(err) { + return err + } + pool := &apiextv1.CustomResourceDefinition{ + ObjectMeta: metav1.ObjectMeta{ + Name: "inferencepools.inference.networking.x-k8s.io", + }, + } + err = cli.Delete(ctx, pool, client.PropagationPolicy(metav1.DeletePropagationForeground)) + if err != nil && !apierrors.IsNotFound(err) { + return err + } + return nil } -// InstallPrometheusOperator installs the prometheus Operator to be used to export the enabled metrics. -func InstallPrometheusOperator() error { - url := fmt.Sprintf(prometheusOperatorURL, prometheusOperatorVersion) - cmd := exec.Command("kubectl", "create", "-f", url) - _, err := Run(cmd) - return err +// DeleteNamespacedResources deletes all namespace-scoped objects the tests typically create. +// The given namespace will also be deleted if it's not "default". +func DeleteNamespacedResources(ctx context.Context, cli client.Client, ns string) error { + if ns == "" { + return nil + } + err := cli.DeleteAllOf(ctx, &appsv1.Deployment{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + if err != nil && !apierrors.IsNotFound(err) { + return err + } + err = cli.DeleteAllOf(ctx, &corev1.Service{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + if err != nil && !apierrors.IsNotFound(err) { + return err + } + err = cli.DeleteAllOf(ctx, &corev1.Pod{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + if err != nil && !apierrors.IsNotFound(err) { + return err + } + err = cli.DeleteAllOf(ctx, &corev1.ConfigMap{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + if err != nil && !apierrors.IsNotFound(err) { + return err + } + err = cli.DeleteAllOf(ctx, &corev1.Secret{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + if err != nil && !apierrors.IsNotFound(err) { + return err + } + err = cli.DeleteAllOf(ctx, &infextv1a1.InferencePool{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + if err != nil && !apierrors.IsNotFound(err) { + return err + } + err = cli.DeleteAllOf(ctx, &infextv1a1.InferenceModel{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + if err != nil && !apierrors.IsNotFound(err) { + return err + } + if ns != "default" { + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: ns, + }, + } + if err := cli.Delete(ctx, ns, client.PropagationPolicy(metav1.DeletePropagationForeground)); err != nil && !apierrors.IsNotFound(err) { + return err + } + } + return nil } -// Run executes the provided command within this 
context -func Run(cmd *exec.Cmd) ([]byte, error) { - dir, _ := GetProjectDir() - cmd.Dir = dir - - if err := os.Chdir(cmd.Dir); err != nil { - _, _ = fmt.Fprintf(GinkgoWriter, "chdir dir: %s\n", err) +// DeleteInferenceModelResources deletes all InferenceModel objects in the given namespace. +func DeleteInferenceModelResources(ctx context.Context, cli client.Client, ns string) error { + if ns == "" { + return nil } - - cmd.Env = append(os.Environ(), "GO111MODULE=on") - command := strings.Join(cmd.Args, " ") - _, _ = fmt.Fprintf(GinkgoWriter, "running: %s\n", command) - output, err := cmd.CombinedOutput() - if err != nil { - return output, fmt.Errorf("%s failed with error: (%v) %s", command, err, string(output)) + err := cli.DeleteAllOf(ctx, &infextv1a1.InferenceModel{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + if err != nil && !apierrors.IsNotFound(err) { + return err } - - return output, nil + return nil } -// UninstallPrometheusOperator uninstalls the prometheus -func UninstallPrometheusOperator() { - url := fmt.Sprintf(prometheusOperatorURL, prometheusOperatorVersion) - cmd := exec.Command("kubectl", "delete", "-f", url) - if _, err := Run(cmd); err != nil { - warnError(err) +// PodReady checks if the given Pod reports the "Ready" status condition before the given timeout. +func PodReady(ctx context.Context, cli client.Client, pod *corev1.Pod, timeout, interval time.Duration) { + ginkgo.By(fmt.Sprintf("Checking pod %s/%s status is: %s", pod.Namespace, pod.Name, corev1.PodReady)) + conditions := []corev1.PodCondition{ + { + Type: corev1.PodReady, + Status: corev1.ConditionTrue, + }, } + gomega.Eventually(checkPodStatus, timeout, interval).WithArguments(ctx, cli, pod, conditions).Should(gomega.BeTrue()) } -// UninstallCertManager uninstalls the cert manager -func UninstallCertManager() { - url := fmt.Sprintf(certmanagerURLTmpl, certmanagerVersion) - cmd := exec.Command("kubectl", "delete", "-f", url) - if _, err := Run(cmd); err != nil { - warnError(err) +// checkPodStatus checks if the given Pod status matches the expected conditions. +func checkPodStatus(ctx context.Context, cli client.Client, pod *corev1.Pod, conditions []corev1.PodCondition) (bool, error) { + var fetchedPod corev1.Pod + if err := cli.Get(ctx, types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name}, &fetchedPod); err != nil { + return false, err + } + found := 0 + for _, want := range conditions { + for _, c := range fetchedPod.Status.Conditions { + if c.Type == want.Type && c.Status == want.Status { + found += 1 + } + } } + return found == len(conditions), nil } -// InstallCertManager installs the cert manager bundle. -func InstallCertManager() error { - url := fmt.Sprintf(certmanagerURLTmpl, certmanagerVersion) - cmd := exec.Command("kubectl", "apply", "-f", url) - if _, err := Run(cmd); err != nil { - return err +// DeploymentAvailable checks if the given Deployment reports the "Available" status condition before the given timeout. +func DeploymentAvailable(ctx context.Context, cli client.Client, deploy *appsv1.Deployment, timeout, interval time.Duration) { + ginkgo.By(fmt.Sprintf("Checking if deployment %s/%s status is: %s", deploy.Namespace, deploy.Name, appsv1.DeploymentAvailable)) + conditions := []appsv1.DeploymentCondition{ + { + Type: appsv1.DeploymentAvailable, + Status: corev1.ConditionTrue, + }, } - // Wait for cert-manager-webhook to be ready, which can take time if cert-manager - // was re-installed after uninstalling on a cluster. 
- cmd = exec.Command("kubectl", "wait", "deployment.apps/cert-manager-webhook", - "--for", "condition=Available", - "--namespace", "cert-manager", - "--timeout", "5m", - ) + gomega.Eventually(checkDeploymentStatus, timeout, interval).WithArguments(ctx, cli, deploy, conditions).Should(gomega.BeTrue()) +} - _, err := Run(cmd) - return err +// checkDeploymentStatus checks if the given Deployment status matches the expected conditions. +func checkDeploymentStatus(ctx context.Context, cli client.Client, deploy *appsv1.Deployment, conditions []appsv1.DeploymentCondition) (bool, error) { + var fetchedDeploy appsv1.Deployment + if err := cli.Get(ctx, types.NamespacedName{Namespace: deploy.Namespace, Name: deploy.Name}, &fetchedDeploy); err != nil { + return false, err + } + found := 0 + for _, want := range conditions { + for _, c := range fetchedDeploy.Status.Conditions { + if c.Type == want.Type && c.Status == want.Status { + found += 1 + } + } + } + return found == len(conditions), nil } -// LoadImageToKindClusterWithName loads a local docker image to the kind cluster -func LoadImageToKindClusterWithName(name string) error { - cluster := "kind" - if v, ok := os.LookupEnv("KIND_CLUSTER"); ok { - cluster = v +// CRDEstablished checks if the given CRD reports the "Established" status condition before the given timeout. +func CRDEstablished(ctx context.Context, cli client.Client, crd *apiextv1.CustomResourceDefinition, timeout, interval time.Duration) { + ginkgo.By(fmt.Sprintf("Checking CRD %s status is: %s", crd.Name, apiextv1.Established)) + conditions := []apiextv1.CustomResourceDefinitionCondition{ + { + Type: apiextv1.Established, + Status: apiextv1.ConditionTrue, + }, } - kindOptions := []string{"load", "docker-image", name, "--name", cluster} - cmd := exec.Command("kind", kindOptions...) - _, err := Run(cmd) - return err + gomega.Eventually(checkCrdStatus, timeout, interval).WithArguments(ctx, cli, crd, conditions).Should(gomega.BeTrue()) } -// GetNonEmptyLines converts given command output string into individual objects -// according to line breakers, and ignores the empty elements in it. -func GetNonEmptyLines(output string) []string { - var res []string - elements := strings.Split(output, "\n") - for _, element := range elements { - if element != "" { - res = append(res, element) +// checkCrdStatus checks if the given CRD status matches the expected conditions. +func checkCrdStatus( + ctx context.Context, + cli client.Client, + crd *apiextv1.CustomResourceDefinition, + conditions []apiextv1.CustomResourceDefinitionCondition, +) (bool, error) { + var fetchedCrd apiextv1.CustomResourceDefinition + if err := cli.Get(ctx, types.NamespacedName{Name: crd.Name}, &fetchedCrd); err != nil { + return false, err + } + found := 0 + for _, want := range conditions { + for _, c := range fetchedCrd.Status.Conditions { + if c.Type == want.Type && c.Status == want.Status { + found += 1 + } } } - - return res + return found == len(conditions), nil } -// GetProjectDir will return the directory where the project is -func GetProjectDir() (string, error) { - wd, err := os.Getwd() +// ExecCommandInPod runs a command in a given container of a given Pod, returning combined stdout+stderr. +func ExecCommandInPod( + ctx context.Context, + cfg *rest.Config, + scheme *runtime.Scheme, + kubeClient *kubernetes.Clientset, + podNamespace, podName, containerName string, + cmd []string, +) (string, error) { + + parameterCodec := runtime.NewParameterCodec(scheme) + + req := kubeClient.CoreV1().RESTClient(). + Post(). 
+ Resource("pods"). + Name(podName). + Namespace(podNamespace). + SubResource("exec"). + VersionedParams(&corev1.PodExecOptions{ + Container: containerName, + Command: cmd, + Stdin: false, + Stdout: true, + Stderr: true, + TTY: false, + }, parameterCodec) + + exec, err := remotecommand.NewSPDYExecutor(cfg, "POST", req.URL()) if err != nil { - return wd, err + return "", fmt.Errorf("could not initialize executor: %w", err) } - wd = strings.ReplaceAll(wd, "/test/e2e", "") - return wd, nil + + var stdout, stderr bytes.Buffer + execErr := exec.StreamWithContext(ctx, remotecommand.StreamOptions{ + Stdout: &stdout, + Stderr: &stderr, + }) + + combinedOutput := stdout.String() + stderr.String() + + if execErr != nil { + return combinedOutput, fmt.Errorf("exec dial command %v failed: %w", cmd, execErr) + } + + return combinedOutput, nil +} + +// EventuallyExists checks if a Kubernetes resource exists and returns nil if successful. +// It takes a function `getResource` which retrieves the resource and returns an error if it doesn't exist. +func EventuallyExists(ctx context.Context, getResource func() error, timeout, interval time.Duration) { + gomega.Eventually(func() error { + return getResource() + }, timeout, interval).Should(gomega.Succeed()) } diff --git a/test/utils/wrappers.go b/test/utils/wrappers.go new file mode 100644 index 000000000..12ff856af --- /dev/null +++ b/test/utils/wrappers.go @@ -0,0 +1,78 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package utils + +import ( + infextv1a1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// InferenceModelWrapper wraps an InferenceModel. +type InferenceModelWrapper struct { + infextv1a1.InferenceModel +} + +// MakeModelWrapper creates a wrapper for an InferenceModel. +func MakeModelWrapper(name, ns string) *InferenceModelWrapper { + return &InferenceModelWrapper{ + infextv1a1.InferenceModel{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: ns, + }, + Spec: infextv1a1.InferenceModelSpec{ + ModelName: "", + PoolRef: infextv1a1.PoolObjectReference{}, + }, + }, + } +} + +// SetModelName sets the value of the inferenceModel.spec.modelName. +func (m *InferenceModelWrapper) SetModelName(name string) *InferenceModelWrapper { + m.Spec.ModelName = name + return m +} + +// SetCriticality sets the value of the inferenceModel.spec.criticality. +func (m *InferenceModelWrapper) SetCriticality(level infextv1a1.Criticality) *InferenceModelWrapper { + m.Spec.Criticality = &level + return m +} + +// SetPoolRef sets the value of the inferenceModel.spec.poolRef using defaults +// for group/kind and name as the PoolObjectReference name.
+func (m *InferenceModelWrapper) SetPoolRef(name string) *InferenceModelWrapper { + ref := infextv1a1.PoolObjectReference{ + Group: infextv1a1.GroupVersion.Group, + Kind: "inferencepools", + Name: name, + } + m.Spec.PoolRef = ref + return m +} + +// SetTargetModels sets the value of the inferenceModel.spec.targetModels. +func (m *InferenceModelWrapper) SetTargetModels(models []infextv1a1.TargetModel) *InferenceModelWrapper { + m.Spec.TargetModels = models + return m +} + +// Obj returns the inner InferenceModel. +func (m *InferenceModelWrapper) Obj() *infextv1a1.InferenceModel { + return &m.InferenceModel +}
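Usage note (not part of the diff above): the suite's `getTimeout` helper in `test/e2e/e2e_suite_test.go` reads optional overrides for the exists/ready/model-ready timeouts from the environment, parsed with Go's `time.ParseDuration`, and `createModelServer` requires `HF_TOKEN`. A minimal sketch of invoking the tests with overrides; the duration values are illustrative and the token value is a placeholder:

```sh
# Required: Hugging Face token with access to meta-llama/Llama-2-7b-hf (placeholder value).
export HF_TOKEN=<your-hf-token>
# Optional: override the suite's default timeouts (Go duration syntax, e.g. 30s, 5m).
export EXISTS_TIMEOUT=1m
export READY_TIMEOUT=5m
export MODEL_READY_TIMEOUT=20m
make test-e2e
```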