diff --git a/tests/assets/neuron/config.yaml b/tests/assets/neuron/config.yaml
new file mode 100644
index 00000000..2fdc719d
--- /dev/null
+++ b/tests/assets/neuron/config.yaml
@@ -0,0 +1,71 @@
+{{$uniformQps := DefaultParam .CL2_UNIFORM_QPS 500}}
+{{$neuronResourcesPerPod := DefaultParam .CL2_NEURON_RESOURCES_PER_POD 64}}
+{{$neuronPods := DefaultParam .CL2_NEURON_PODS .Nodes}}
+
+# ClusterLoader2 test for the Neuron device plugin: creates pods that request
+# aws.amazon.com/neuron, waits for them to run, gathers PodStartupLatency and
+# then scales the pods back down to zero.
+# Overridable inputs: CL2_UNIFORM_QPS (default 500),
+# CL2_NEURON_RESOURCES_PER_POD (default 64), CL2_NEURON_PODS (default .Nodes).
+name: neuron-workers
+namespace:
+  number: 1
+tuningSets:
+- name: UniformQPS
+  qpsLoad:
+    qps: {{$uniformQps}}
+
+steps:
+- name: Start measurements
+  measurements:
+  - Identifier: PodStartupLatency
+    Method: PodStartupLatency
+    Params:
+      action: start
+      labelSelector: group = neuron-worker
+      threshold: 60s
+- name: Create pods
+  phases:
+  - namespaceRange:
+      min: 1
+      max: 1
+    replicasPerNamespace: {{$neuronPods}}
+    tuningSet: UniformQPS
+    objectBundle:
+    - basename: neuron-worker
+      objectTemplatePath: pod.yaml
+      templateFillMap:
+        Group: neuron-worker
+        NeuronResources: {{$neuronResourcesPerPod}}
+
+- name: Wait for pods to be running
+  measurements:
+  - Identifier: WaitForRunningPods
+    Method: WaitForRunningPods
+    Params:
+      action: gather
+      desiredPodCount: {{$neuronPods}}
+      labelSelector: group = neuron-worker
+      timeout: 5m
+
+- name: Measure pod startup latency
+  measurements:
+  - Identifier: PodStartupLatency
+    Method: PodStartupLatency
+    Params:
+      action: gather
+
+# Re-apply the same object bundle with zero replicas to delete the pods.
+- name: Delete pods
+  phases:
+  - namespaceRange:
+      min: 1
+      max: 1
+    replicasPerNamespace: 0
+    tuningSet: UniformQPS
+    objectBundle:
+    - basename: neuron-worker
+      objectTemplatePath: pod.yaml
+      templateFillMap:
+        Group: neuron-worker
+        NeuronResources: {{$neuronResourcesPerPod}}
diff --git a/tests/assets/neuron/pod.yaml b/tests/assets/neuron/pod.yaml
new file mode 100644
index 00000000..a27a9525
--- /dev/null
+++ b/tests/assets/neuron/pod.yaml
@@ -0,0 +1,23 @@
+---
+# Pod template rendered by the ClusterLoader2 config in this directory.
+# .Group and .NeuronResources come from that config's templateFillMap.
+apiVersion: v1
+kind: Pod
+metadata:
+  # NOTE(review): generateName is used instead of a fixed name - confirm the
+  # "Delete pods" phase can still locate these pods.
+  generateName: neuron-worker-
+  labels:
+    group: {{.Group}}
+spec:
+  containers:
+  - name: main
+    image: public.ecr.aws/amazonlinux/amazonlinux:2023
+    command:
+    - "sleep"
+    - "infinity"
+    resources:
+      requests:
+        aws.amazon.com/neuron: "{{.NeuronResources}}"
+      limits:
+        aws.amazon.com/neuron: "{{.NeuronResources}}"
diff --git a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml
index d9e62454..cd361914 100644
--- a/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml
+++ b/tests/tekton-resources/pipelines/eks/awscli-cl2-load-with-addons-slos.yaml
@@ -88,6 +88,18 @@ spec:
     default: "20m"
   - name: timeout-pia-pod-startup
     default: "5m"
+  - name: neuron-test-config-url
+    description: "URL of the ClusterLoader2 config used by the Neuron load test."
+    default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/neuron/config.yaml"
+  - name: neuron-test-pod-spec-url
+    description: "URL of the pod template used by the Neuron load test."
+    default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/neuron/pod.yaml"
+  - name: cl2-neuron-uniform-qps
+    description: "Rate at which pods are created/deleted. Defaults to 500 QPS."
+    default: "500"
+  - name: cl2-neuron-resources-per-pod
+    description: "Neuron device units requested per pod. Defaults to 64 units."
+    default: "64"
   - name: unmanaged-nodegroup-cfn-url
     default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/asg_node_group.yaml"
   - name: launch-template-ami
@@ -261,6 +273,50 @@ spec:
     workspaces:
     - name: config
       workspace: config
+  - name: install-neuron-device-plugin
+    params:
+    - name: cluster-name
+      value: $(params.cluster-name)
+    - name: endpoint
+      value: $(params.endpoint)
+    runAfter:
+    - create-mng-nodes
+    taskRef:
+      kind: Task
+      name: install-neuron-device-plugin
+    workspaces:
+    - name: config
+      workspace: config
+  - name: generate-neuron-load
+    params:
+    - name: cluster-name
+      value: $(params.cluster-name)
+    - name: results-bucket
+      value: $(params.results-bucket)
+    - name: nodes
+      value: $(params.desired-nodes)
+    - name: cl2-neuron-pods
+      value: $(params.desired-nodes)
+    - name: cl2-uniform-qps
+      value: $(params.cl2-neuron-uniform-qps)
+    - name: cl2-neuron-resources-per-pod
+      value: $(params.cl2-neuron-resources-per-pod)
+    - name: neuron-config-url
+      value: $(params.neuron-test-config-url)
+    - name: neuron-pod-url
+      value: $(params.neuron-test-pod-spec-url)
+    runAfter:
+    - install-neuron-device-plugin
+    taskRef:
+      kind: Task
+      name: load-neuron-device-plugin
+    workspaces:
+    - name: source
+      workspace: source
+    - name: results
+      workspace: results
+    - name: config
+      workspace: config
   - name: create-pod-identity-association
     params:
     - name: cluster-name
@@ -274,7 +330,7 @@ spec:
     - name: pia-trust-policy-url
       value: $(params.pia-trust-policy-url)
     runAfter:
-    - create-mng-nodes
+    - generate-neuron-load
     taskRef:
       kind: Task
       name: awscli-eks-pia-create
@@ -377,6 +433,72 @@ spec:
     taskRef:
       kind: Task
       name: cloudwatch
+  - name: cw-metrics-neuron-device-plugin-latency-p50
+    params:
+    - name: dimensions
+      value: $(params.desired-nodes)
+    - name: value
+      value: $(tasks.generate-neuron-load.results.pod_startup_p50)
+    - name: namespace
+      value: neuron-device-plugin-$(params.kubernetes-version)
+    - name: metric-name
+      value: pod_startup_latency_p50
+    - name: unit
+      value: Milliseconds
+    runAfter:
+    - generate-neuron-load
+    taskRef:
+      kind: Task
+      name: cloudwatch
+  - name: cw-metrics-neuron-device-plugin-latency-p90
+    params:
+    - name: dimensions
+      value: $(params.desired-nodes)
+    - name: value
+      value: $(tasks.generate-neuron-load.results.pod_startup_p90)
+    - name: namespace
+      value: neuron-device-plugin-$(params.kubernetes-version)
+    - name: metric-name
+      value: pod_startup_latency_p90
+    - name: unit
+      value: Milliseconds
+    runAfter:
+    - generate-neuron-load
+    taskRef:
+      kind: Task
+      name: cloudwatch
+  - name: cw-metrics-neuron-device-plugin-latency-p99
+    params:
+    - name: dimensions
+      value: $(params.desired-nodes)
+    - name: value
+      value: $(tasks.generate-neuron-load.results.pod_startup_p99)
+    - name: namespace
+      value: neuron-device-plugin-$(params.kubernetes-version)
+    - name: metric-name
+      value: pod_startup_latency_p99
+    - name: unit
+      value: Milliseconds
+    runAfter:
+    - generate-neuron-load
+    taskRef:
+      kind: Task
+      name: cloudwatch
+  - name: cw-metrics-neuron-load-test-outcome
+    params:
+    - name: dimensions
+      value: $(params.desired-nodes)
+    - name: value
+      value: $(tasks.generate-neuron-load.results.datapoint)
+    - name: metric-name
+      value: outcome
+    - name: namespace
+      value: neuron-device-plugin-$(params.kubernetes-version)
+    runAfter:
+    - generate-neuron-load
+    taskRef:
+      kind: Task
+      name: cloudwatch
   workspaces:
   - name: source
   - name: results
diff --git a/tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml b/tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml
new file mode 100644
index 00000000..4da5ba68
--- /dev/null
+++ b/tests/tekton-resources/tasks/generators/clusterloader/load-neuron-device-plugin.yaml
@@ -0,0 +1,207 @@
+apiVersion: tekton.dev/v1beta1
+kind: Task
+metadata:
+  name: load-neuron-device-plugin
+  namespace: scalability
+spec:
+  description: "Run Neuron device plugin load test using clusterloader2"
+  params:
+  - name: giturl
+    description: "git url to clone the package"
+    default: https://github.com/kubernetes/perf-tests.git
+  - name: cl2-branch
+    description: "The branch of clusterloader2 you want to use"
+    default: "master"
+  - name: cl2-neuron-pods
+    description: "Number of pods to create during test"
+  - name: cl2-uniform-qps
+    description: "Rate of pod operations (create/delete) in queries per second. Defaults to 500 QPS."
+    default: "500"
+  - name: cl2-neuron-resources-per-pod
+    description: "Neuron device units requested per pod. Defaults to 64 units."
+    default: "64"
+  - name: neuron-config-url
+    description: "URL for the Neuron test configuration file for loadtest"
+    default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/neuron/config.yaml"
+  - name: neuron-pod-url
+    description: "URL for the Neuron pod specification file for loadtest"
+    default: "https://raw.githubusercontent.com/awslabs/kubernetes-iteration-toolkit/main/tests/assets/neuron/pod.yaml"
+  - name: nodes
+    description: "number of dataplane nodes to run the load test against"
+  - name: results-bucket
+    description: "S3 bucket for results"
+  - name: cluster-name
+    description: "The name of the EKS cluster"
+  - name: region
+    default: "us-west-2"
+  results:
+  - name: datapoint
+    description: Stores the CL2 result that can be consumed by other tasks
+  - name: s3_result
+    description: Stores the S3 result path after compute
+  - name: pod_startup_p50
+    description: 50th percentile pod startup latency
+  - name: pod_startup_p90
+    description: 90th percentile pod startup latency
+  - name: pod_startup_p99
+    description: 99th percentile pod startup latency
+  workspaces:
+  - name: source
+    mountPath: /src/k8s.io/
+  - name: results
+  - name: config
+    mountPath: /config/
+  stepTemplate:
+    env:
+    - name: KUBECONFIG
+      value: /config/kubeconfig
+  steps:
+  # Step 1: fetch a fresh checkout of perf-tests at the requested branch.
+  - name: git-clone
+    image: alpine/git
+    workingDir: $(workspaces.source.path)
+    script: |
+      # Remove existing directory if it exists
+      rm -rf perf-tests
+
+      # Clone fresh copy
+      git clone $(params.giturl)
+      cd $(workspaces.source.path)/perf-tests/
+      git fetch origin --verbose --tags
+      git checkout $(params.cl2-branch)
+      git branch
+
+  # Step 2: write the CL2 overrides file, download the test config and pod
+  # template, and build the clusterloader binary.
+  - name: prepare-test
+    image: golang:1.24
+    workingDir: $(workspaces.source.path)
+    script: |
+      S3_RESULT_PATH=$(params.results-bucket)
+      echo $S3_RESULT_PATH > $(results.s3_result.path)
+      echo "S3 Path: $S3_RESULT_PATH"
+      echo "$(params.neuron-config-url)"
+      echo "$(params.neuron-pod-url)"
+
+      echo "# Override configurations" > "$(workspaces.source.path)/overrides.yaml"
+      if [ -n "$(params.cl2-neuron-pods)" ]; then
+        echo "CL2_NEURON_PODS: $(params.cl2-neuron-pods)" >> "$(workspaces.source.path)/overrides.yaml"
+      fi
+
+      if [ -n "$(params.cl2-uniform-qps)" ]; then
+        echo "CL2_UNIFORM_QPS: $(params.cl2-uniform-qps)" >> "$(workspaces.source.path)/overrides.yaml"
+      fi
+
+      if [ -n "$(params.cl2-neuron-resources-per-pod)" ]; then
+        echo "CL2_NEURON_RESOURCES_PER_POD: $(params.cl2-neuron-resources-per-pod)" >> "$(workspaces.source.path)/overrides.yaml"
+      fi
+
+      echo "Generated overrides.yaml:"
+      cat $(workspaces.source.path)/overrides.yaml
+      cp $(workspaces.source.path)/overrides.yaml $(workspaces.results.path)/overrides.yaml
+
+      # Create test directory
+      mkdir -p $(workspaces.source.path)/perf-tests/clusterloader2/testing/neuron
+
+      # Download test configurations. -f makes curl fail on HTTP errors instead of
+      # saving the error page as the test config; -S still prints the error.
+      curl -fsS $(params.neuron-config-url) \
+        -o $(workspaces.source.path)/perf-tests/clusterloader2/testing/neuron/config.yaml
+      curl -fsS $(params.neuron-pod-url) \
+        -o $(workspaces.source.path)/perf-tests/clusterloader2/testing/neuron/pod.yaml
+
+      # Building clusterloader2 binary
+      cd $(workspaces.source.path)/perf-tests/clusterloader2/
+      GOOS=linux CGO_ENABLED=0 go build -v -o ./clusterloader ./cmd
+
+  # Step 3: run clusterloader2 and record 1 (success) or 0 (failure) in the
+  # datapoint result before exiting with clusterloader's own exit code.
+  - name: run-test
+    image: alpine/k8s:1.30.2
+    script: |
+      echo "Starting run-test step"
+
+      cd $(workspaces.source.path)/perf-tests/clusterloader2/
+
+      echo "Checking for clusterloader binary"
+      if [ ! -f "./clusterloader" ]; then
+        echo "Error: clusterloader binary not found in $(pwd)"
+        echo "Listing workspace root:"
+        ls -la $(workspaces.source.path)
+        echo "Listing perf-tests directory:"
+        ls -la $(workspaces.source.path)/perf-tests
+        exit 1
+      fi
+
+      chmod +x ./clusterloader
+
+      echo "Verifying test configuration files"
+      echo "Content of testing/neuron/config.yaml:"
+      cat testing/neuron/config.yaml
+      echo "Content of testing/neuron/pod.yaml:"
+      cat testing/neuron/pod.yaml
+
+      echo "Starting clusterloader test"
+      # Tekton prepends "set -e" to scripts without a shebang, so a failing
+      # clusterloader would abort the script before the datapoint is written.
+      # Capture the status with "||" so the failure branch below is reachable.
+      exit_code=0
+      ENABLE_EXEC_SERVICE=false ./clusterloader \
+        --testconfig=testing/neuron/config.yaml \
+        --testoverrides=$(workspaces.source.path)/overrides.yaml \
+        --nodes=$(params.nodes) \
+        --provider=eks \
+        --kubeconfig=${KUBECONFIG} \
+        --report-dir=$(workspaces.results.path) \
+        --alsologtostderr \
+        --v=2 || exit_code=$?
+
+      echo "Test completed with exit code: $exit_code"
+
+      if [ $exit_code -eq 0 ]; then
+        echo "Test succeeded"
+        echo "1" | tee $(results.datapoint.path)
+      else
+        echo "Test failed"
+        echo "0" | tee $(results.datapoint.path)
+      fi
+
+      exit $exit_code
+    timeout: 30000s
+
+  # Step 4: extract the pod_startup percentiles from the PodStartupLatency
+  # report written by clusterloader into the results workspace.
+  - name: process-metrics
+    image: alpine
+    workingDir: $(workspaces.results.path)
+    script: |
+      apk add --no-cache jq
+
+      # find the pod startup metrics in the JSON
+      POD_STARTUP_METRICS=$(jq '.dataItems[] | select(.labels.Metric == "pod_startup") | .data' PodStartupLatency_*.json)
+
+      if [ -z "$POD_STARTUP_METRICS" ]; then
+        echo "Error: Could not find pod_startup metrics"
+        exit 1
+      fi
+
+      # get the pod startup p50, p90 and p99
+      echo "$POD_STARTUP_METRICS" | jq -r '.Perc50' > $(results.pod_startup_p50.path)
+      echo "$POD_STARTUP_METRICS" | jq -r '.Perc90' > $(results.pod_startup_p90.path)
+      echo "$POD_STARTUP_METRICS" | jq -r '.Perc99' > $(results.pod_startup_p99.path)
+
+      echo "Extracted metrics:"
+      echo "P50: $(cat $(results.pod_startup_p50.path))"
+      echo "P90: $(cat $(results.pod_startup_p90.path))"
+      echo "P99: $(cat $(results.pod_startup_p99.path))"
+
+  # Step 5: copy everything in the results workspace to the S3 results path.
+  - name: upload-results
+    image: amazon/aws-cli
+    workingDir: $(workspaces.results.path)
+    script: |
+      S3_RESULT_PATH=$(cat $(results.s3_result.path))
+      echo "S3 Path: $S3_RESULT_PATH"
+      aws sts get-caller-identity
+      ls -larth
+      aws s3 cp . s3://$S3_RESULT_PATH/ --recursive
diff --git a/tests/tekton-resources/tasks/setup/eks/awscli-neuron.yaml b/tests/tekton-resources/tasks/setup/eks/awscli-neuron.yaml
new file mode 100644
index 00000000..bf74c201
--- /dev/null
+++ b/tests/tekton-resources/tasks/setup/eks/awscli-neuron.yaml
@@ -0,0 +1,138 @@
+apiVersion: tekton.dev/v1beta1
+kind: Task
+metadata:
+  name: install-neuron-device-plugin
+  namespace: scalability
+spec:
+  description: |
+    Install AWS Neuron device plugin on an existing EKS cluster using Helm.
+  params:
+  - name: cluster-name
+    description: The name of the EKS cluster.
+  - name: region
+    default: us-west-2
+    description: The region where the cluster is in.
+  - name: endpoint
+    description: EKS API endpoint passed to the AWS CLI as --endpoint; skipped when empty.
+  - name: instance-types
+    default: ""
+    description: "Comma-separated list of instance types to enable Neuron emulation for. If empty, will auto-detect from cluster."
+  workspaces:
+  - name: config
+    mountPath: /config/
+  stepTemplate:
+    env:
+    - name: KUBECONFIG
+      value: /config/kubeconfig
+  steps:
+  - name: write-kubeconfig
+    image: alpine/k8s:1.23.7
+    script: |
+      ENDPOINT_FLAG=""
+      if [ -n "$(params.endpoint)" ]; then
+        ENDPOINT_FLAG="--endpoint $(params.endpoint)"
+      fi
+      aws eks $ENDPOINT_FLAG update-kubeconfig --name $(params.cluster-name) --region $(params.region)
+
+      aws --version
+      aws sts get-caller-identity
+
+      echo "Available nodes in the cluster:"
+      # List nodegroups in the cluster
+      aws eks $ENDPOINT_FLAG list-nodegroups --cluster-name $(params.cluster-name) --region $(params.region)
+
+  # NOTE(review): each Tekton step runs in its own container, so a Helm binary
+  # installed here is presumably not visible to install-neuron, which would
+  # then rely on the helm shipped in its image - confirm and drop if unused.
+  - name: install-helm
+    image: alpine/k8s:1.23.7
+    script: |
+      # Install required dependencies
+      apk add --no-cache openssl curl bash
+
+      # Install Helm
+      curl https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash
+  - name: install-neuron
+    image: alpine/k8s:1.23.7
+    script: |
+      # Update kubeconfig with token
+      ENDPOINT_FLAG=""
+      if [ -n "$(params.endpoint)" ]; then
+        ENDPOINT_FLAG="--endpoint $(params.endpoint)"
+      fi
+      aws eks $ENDPOINT_FLAG update-kubeconfig --name $(params.cluster-name) --region $(params.region)
+
+      echo "Verifying access to the cluster..."
+      kubectl get nodes
+
+      # Determine instance types
+      if [ -n "$(params.instance-types)" ]; then
+        INSTANCE_TYPES="$(params.instance-types)"
+        echo "Will install Neuron device plugin in emulation mode for specified instance types: $INSTANCE_TYPES"
+      else
+        INSTANCE_TYPES=$(kubectl get nodes -o jsonpath='{.items[*].metadata.labels.node\.kubernetes\.io/instance-type}' | tr ' ' '\n' | sort -u | tr '\n' ',')
+        echo "Auto-detected instance types from cluster: $INSTANCE_TYPES"
+      fi
+
+      # Create values file for Helm
+      # NOTE(review): confirm these keys and their nesting against the chart's values.yaml.
+      cat << EOF > /tmp/values.yaml
+      devicePlugin:
+        env:
+        - name: KUBECONFIG
+          value: /etc/kubernetes/kubelet.conf
+        - name: NODE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: spec.nodeName
+        - name: NEURON_DEVICE_PLUGIN_EMULATION_MODE
+          value: "64"
+        nodeSelector: null
+        tolerations:
+        - operator: Exists
+          effect: NoSchedule
+        updateStrategy:
+          type: RollingUpdate
+        neuronInstances: [${INSTANCE_TYPES}]
+      npd:
+        enabled: false
+      EOF
+
+      echo "Using this values.yaml:"
+      cat /tmp/values.yaml
+
+      # Install Neuron using values file
+      helm upgrade --install \
+        neuron \
+        oci://public.ecr.aws/neuron/neuron-helm-chart \
+        --namespace kube-system \
+        -f /tmp/values.yaml
+
+      sleep 5
+      echo "Verifying Installation..."
+      # Bound the wait so a daemonset that never becomes ready fails this step
+      # instead of blocking the pipeline forever.
+      WAIT_TIMEOUT_SECONDS=1800
+      WAIT_INTERVAL_SECONDS=5
+      ELAPSED=0
+      while true; do
+        DESIRED=$(kubectl get ds neuron-device-plugin -n kube-system -o jsonpath='{.status.desiredNumberScheduled}')
+        READY=$(kubectl get ds neuron-device-plugin -n kube-system -o jsonpath='{.status.numberReady}')
+
+        echo "Desired: $DESIRED, Ready: $READY"
+
+        # Empty status fields default to 0 so the numeric test never sees "".
+        if [ "${DESIRED:-0}" -gt 0 ] && [ "${DESIRED:-0}" = "${READY:-0}" ]; then
+          echo "Neuron device plugin installation verified successfully"
+          break
+        fi
+
+        if [ "$ELAPSED" -ge "$WAIT_TIMEOUT_SECONDS" ]; then
+          echo "Timed out after ${WAIT_TIMEOUT_SECONDS}s waiting for neuron-device-plugin daemonset to be ready"
+          exit 1
+        fi
+
+        echo "Waiting for neuron-device-plugin daemonset to be ready..."
+        sleep "$WAIT_INTERVAL_SECONDS"
+        ELAPSED=$((ELAPSED + WAIT_INTERVAL_SECONDS))
+      done