# Scale Test Egress Gateway (scale-egw) #18
# NOTE: GitHub flagged this file as containing bidirectional Unicode text that
# may be interpreted or compiled differently than it appears. Review it in an
# editor that reveals hidden Unicode characters.
# Display name of the workflow; it is also the first component of the
# concurrency group (via github.workflow).
name: Scale Test Egress Gateway (scale-egw)
# Triggers: a weekday schedule, plus manual/bot dispatch against a PR.
on:
  schedule:
    # Monday through Friday at 00:27 (GitHub evaluates cron schedules in UTC).
    - cron: "27 0 * * 1-5"

  workflow_dispatch:
    inputs:
      PR-number:
        description: "Pull request number."
        required: true
      context-ref:
        description: "Context in which the workflow runs. If PR is from a fork, will be the PR target branch (general case). If PR is NOT from a fork, will be the PR branch itself (this allows committers to test changes to workflows directly from PRs)."
        required: true
      SHA:
        description: "SHA under test (head of the PR branch)."
        required: true
      extra-args:
        description: "[JSON object] Arbitrary arguments passed from the trigger comment via regex capture group. Parse with 'fromJson(inputs.extra-args).argName' in workflow."
        required: false
        default: "{}"
      # The two inputs below default to the same values that scheduled runs
      # hardcode in the "Set up job variables" step.
      num-clients:
        description: "Number of clients to create to connect to the external target through EGW"
        required: false
        default: 100
        type: number
      client-qps:
        description: "Number of client pods to create per second"
        required: false
        default: 20
        type: number

  # For testing uncomment following lines:
  # push:
  #   branches:
  #     - your_branch_name
# Token permissions granted to every job in this workflow.
permissions:
  # To be able to access the repository with actions/checkout
  contents: read
  # To be able to request the JWT from GitHub's OIDC provider
  id-token: write
  # To allow retrieving information from the PR API
  pull-requests: read
  # To be able to set commit status
  statuses: write
concurrency:
  # Structure:
  # - Workflow name
  # - Event type
  # - A unique identifier depending on event type:
  #   - schedule: SHA
  #   - workflow_dispatch: PR number
  #
  # This structure ensures a unique concurrency group name is generated for each
  # type of testing, such that re-runs will cancel the previous run.
  group: |
    ${{ github.workflow }}
    ${{ github.event_name }}
    ${{
      (github.event_name == 'schedule' && github.sha) ||
      (github.event_name == 'workflow_dispatch' && github.event.inputs.PR-number)
    }}
  cancel-in-progress: true
# Workflow-wide environment: pinned tool versions (kept up to date through the
# renovate annotations) and names shared by the steps below.
env:
  # renovate: datasource=golang-version depName=go
  go_version: 1.23.6
  # renovate: datasource=github-releases depName=eksctl-io/eksctl
  eksctl_version: v0.203.0
  # renovate: datasource=github-releases depName=kubernetes/kubernetes
  kubectl_version: v1.32.1
  # renovate: datasource=docker depName=google/cloud-sdk
  gcloud_version: 509.0.0
  # Hosted under quay.io/cilium/egw-scale-utils and built by
  # a workflow in cilium/scaffolding.
  # renovate: datasource=git-refs depName=https://github.com/cilium/scaffolding branch=main
  egw_utils_ref: 9ee5a03899528341025847fe064bb045de4f888e
  test_name: egw
  # Unique per run attempt; prefixed with the test name to form the EKS cluster name.
  cluster_name: ${{ github.run_id }}-${{ github.run_attempt }}
  # Public/private halves of the keypair created by the "Generate SSH keypair" step.
  AWS_SSH_KEY: ~/.ssh/id_aws.pub
  AWS_SSH_KEY_PRIVATE: ~/.ssh/id_aws
jobs:
  # Prints the workflow_dispatch inputs for debugging; not run for other events.
  echo-inputs:
    if: ${{ github.event_name == 'workflow_dispatch' }}
    name: Echo Workflow Dispatch Inputs
    runs-on: ubuntu-24.04
    steps:
      - name: Echo Workflow Dispatch Inputs
        env:
          # Hand the inputs over through the environment instead of expanding
          # them inside the script, so quotes in an input value cannot
          # terminate the shell string and inject commands.
          INPUTS_JSON: ${{ toJSON(inputs) }}
        run: |
          echo "${INPUTS_JSON}"

  # Sets a commit status on the SHA under test (the dispatch input when
  # provided, github.sha otherwise).
  commit-status-start:
    name: Commit Status Start
    runs-on: ubuntu-24.04
    steps:
      - name: Set initial commit status
        uses: myrotvorets/set-commit-status-action@3730c0a348a2ace3c110851bed53331bc6406e9f # v2.0.1
        with:
          sha: ${{ inputs.SHA || github.sha }}

  # Creates an EKS cluster, installs Cilium at the SHA under test, runs the
  # ClusterLoader2 egress gateway scenario and exports the results. One leg
  # runs per matrix.test_type value.
  install-and-scaletest:
    runs-on: ubuntu-24.04
    name: Install and Scale Test
    timeout-minutes: 150
    strategy:
      fail-fast: false
      matrix:
        test_type:
          - md-bs # Abbreviated "masquerade delay - baseline"
          - md

    steps:
      - name: Checkout context ref (trusted)
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: ${{ inputs.context-ref || github.sha }}
          persist-credentials: false

      - name: Set Environment Variables
        uses: ./.github/actions/set-env-variables

      - name: Set up job variables
        id: vars
        run: |
          if [ "${{ github.event_name }}" = "workflow_dispatch" ] ; then
            SHA="${{ inputs.SHA }}"
          else
            SHA="${{ github.sha }}"
          fi

          # The SHA under test will have its helm chart checked out at the following
          # path right before the step where Cilium is installed.
          CILIUM_INSTALL_DEFAULTS="--chart-directory=untrusted/install/kubernetes/cilium \
            --wait=false \
            --set=hubble.enabled=true \
            --set=pprof.enabled=true \
            --set=prometheus.enabled=true \
            --set=cluster.name=${{ env.cluster_name }} \
            --set=egressGateway.enabled=true \
            --set=bpf.masquerade=true \
            --set=kubeProxyReplacement=true \
            --set=l7Proxy=false \
            --set=egressMasqueradeInterfaces="" \
            --set=eni.enabled=true \
            --set=ipam.mode=eni \
            --set=eni.awsEnablePrefixDelegation=true \
            --set=image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/cilium-ci:${SHA} \
            --set=operator.image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/operator-aws-ci:${SHA} \
            --set=hubble.relay.image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/hubble-relay-ci:${SHA} \
            --nodes-without-cilium"

          # Replace '.' and '/' in the ref name with '-' before using it as a tag value.
          OWNER="${{ github.ref_name }}"
          OWNER="${OWNER//[.\/]/-}"

          if [ "${{ github.event_name }}" == "workflow_dispatch" ] ; then
            NUM_CLIENT_PODS="${{ inputs.num-clients }}"
            CLIENT_QPS="${{ inputs.client-qps }}"
          else
            NUM_CLIENT_PODS="100"
            CLIENT_QPS="20"
          fi

          # The m5.large instance type can support 29 Pods.
          # See https://github.com/awslabs/amazon-eks-ami/blob/main/templates/shared/runtime/eni-max-pods.txt.
          # Changing the instance type also requires changing the hardcoded value below!
          NODE_INSTANCE_TYPE="m5.large"
          NUM_CLIENT_NODES="$(((NUM_CLIENT_PODS / 29) + 2))"

          TEST_NAME="${{ env.test_name }}-${{ matrix.test_type }}-${NUM_CLIENT_PODS}-${CLIENT_QPS}"
          CLUSTER_NAME="${TEST_NAME}-${{ env.cluster_name }}"

          # Pick the entry with the highest version from the EKS version list.
          eks_version_and_region=$(yq '.include | sort_by(.version) | reverse | .[0] | "\(.version),\(.region)"' .github/actions/eks/k8s-versions.yaml)
          EKS_VERSION=$(echo $eks_version_and_region | cut -d',' -f1)
          EKS_REGION=$(echo $eks_version_and_region | cut -d',' -f2)

          echo sha=${SHA} >> $GITHUB_OUTPUT
          echo cilium_install_defaults=${CILIUM_INSTALL_DEFAULTS} >> $GITHUB_OUTPUT
          echo owner=${OWNER} >> $GITHUB_OUTPUT
          echo test_name=${TEST_NAME} >> $GITHUB_OUTPUT
          echo cluster_name=${CLUSTER_NAME} >> $GITHUB_OUTPUT
          echo num_client_pods=${NUM_CLIENT_PODS} >> $GITHUB_OUTPUT
          echo num_client_nodes=${NUM_CLIENT_NODES} >> $GITHUB_OUTPUT
          echo node_instance_type=${NODE_INSTANCE_TYPE} >> $GITHUB_OUTPUT
          echo client_qps=${CLIENT_QPS} >> $GITHUB_OUTPUT
          echo eks_version=${EKS_VERSION} >> $GITHUB_OUTPUT
          echo eks_region=${EKS_REGION} >> $GITHUB_OUTPUT

      - name: Wait for images to be available
        timeout-minutes: 30
        shell: bash
        run: |
          for image in cilium-ci operator-aws-ci hubble-relay-ci ; do
            until docker manifest inspect quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/$image:${{ steps.vars.outputs.sha }} &> /dev/null; do sleep 45s; done
          done

      - name: Ensure EGW scale utils image is available
        shell: bash
        run: |
          # Run this separate from the other "Wait for images to be available" step to help with debugging.
          if ! docker manifest inspect quay.io/cilium/egw-scale-utils:${{ env.egw_utils_ref }} ; then
            echo "FATAL: egw-scale-utils image with ref ${{ env.egw_utils_ref }} is not available, exiting"
            exit 1
          fi

      - name: Install Go
        uses: actions/setup-go@f111f3307d8850f501ac008e886eec1fd1932a34 # v5.3.0
        with:
          go-version: ${{ env.go_version }}

      - name: Setup gcloud credentials
        uses: google-github-actions/auth@71f986410dfbc7added4569d411d040a91dc6935 # v2.1.8
        with:
          workload_identity_provider: ${{ secrets.GCP_PERF_WORKLOAD_IDENTITY_PROVIDER }}
          service_account: ${{ secrets.GCP_PERF_SA }}
          create_credentials_file: true
          export_environment_variables: true

      - name: Setup gcloud CLI
        uses: google-github-actions/setup-gcloud@77e7a554d41e2ee56fc945c52dfd3f33d12def9a # v2.1.4
        with:
          project_id: ${{ secrets.GCP_PERF_PROJECT_ID }}
          version: ${{ env.gcloud_version }}

      - name: Install kubectl
        run: |
          # Binary and checksum are fetched from the same /release/ path.
          curl -sLO "https://dl.k8s.io/release/${{ env.kubectl_version }}/bin/linux/amd64/kubectl"
          curl -sLO "https://dl.k8s.io/release/${{ env.kubectl_version }}/bin/linux/amd64/kubectl.sha256"
          echo "$(cat kubectl.sha256) kubectl" | sha256sum --check
          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
          kubectl version --client

      - name: Install eksctl CLI
        run: |
          curl -LO "https://github.com/eksctl-io/eksctl/releases/download/${{ env.eksctl_version }}/eksctl_$(uname -s)_amd64.tar.gz"
          sudo tar xzvfC eksctl_$(uname -s)_amd64.tar.gz /usr/bin
          rm eksctl_$(uname -s)_amd64.tar.gz

      - name: Set up AWS CLI credentials
        uses: aws-actions/configure-aws-credentials@4fc4975a852c8cd99761e2de1f4ba73402e44dd9 # v4.0.3
        with:
          role-to-assume: ${{ secrets.AWS_PR_ASSUME_ROLE }}
          aws-region: ${{ steps.vars.outputs.eks_region }}

      - name: Run aws configure
        run: |
          aws configure set aws_access_key_id ${{ env.AWS_ACCESS_KEY_ID }}
          aws configure set aws_secret_access_key ${{ env.AWS_SECRET_ACCESS_KEY }}
          aws configure set aws_session_token ${{ env.AWS_SESSION_TOKEN }}
          aws configure set default.region ${{ steps.vars.outputs.eks_region }}

      - name: Display version info of installed tools
        run: |
          echo "--- go ---"
          go version
          echo "--- kubectl ---"
          kubectl version --client
          echo "--- eksctl ---"
          eksctl version
          echo "--- gcloud ---"
          gcloud version

      - name: Clone ClusterLoader2
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: kubernetes/perf-tests
          # Avoid using renovate to update this dependency because: (1)
          # perf-tests does not tag or release, so renovate will pull
          # all updates to the default branch and (2) continually
          # updating CL2 may impact the stability of the scale test
          # results.
          ref: ce821d6232cee6719dd86e7e68eee361e337e92a
          persist-credentials: false
          sparse-checkout: clusterloader2
          path: perf-tests

      - name: Generate SSH keypair
        run: |
          ssh-keygen -f ${{ env.AWS_SSH_KEY_PRIVATE }} -P ""

      - name: Create EKS cluster
        shell: bash
        id: deploy-cluster
        run: |
          # Node groups: clients (sized from the client pod count), one EGW
          # node, one monitoring node and one node that is not managed by
          # Cilium.
          cat <<EOF > eks-config.yaml
          apiVersion: eksctl.io/v1alpha5
          kind: ClusterConfig

          metadata:
            name: ${{ steps.vars.outputs.cluster_name }}
            region: ${{ steps.vars.outputs.eks_region }}
            version: "${{ steps.vars.outputs.eks_version }}"
            tags:
              usage: "${{ github.repository_owner }}-${{ github.event.repository.name }}"
              owner: "${{ steps.vars.outputs.owner }}"

          managedNodeGroups:
          - name: ng-amd64-client
            instanceTypes:
            - ${{ steps.vars.outputs.node_instance_type }}
            desiredCapacity: ${{ steps.vars.outputs.num_client_nodes }}
            spot: false
            privateNetworking: true
            volumeType: "gp3"
            volumeSize: 20
            ssh:
              allow: true
              publicKeyPath: ${{ env.AWS_SSH_KEY }}
            taints:
            - key: "node.cilium.io/agent-not-ready"
              value: "true"
              effect: "NoExecute"
            labels:
              role.scaffolding/egw-client: "true"
          - name: ng-amd64-egw-node
            instanceTypes:
            - ${{ steps.vars.outputs.node_instance_type }}
            desiredCapacity: 1
            spot: false
            privateNetworking: true
            volumeType: "gp3"
            volumeSize: 20
            ssh:
              allow: true
              publicKeyPath: ${{ env.AWS_SSH_KEY }}
            taints:
            - key: "node.cilium.io/agent-not-ready"
              value: "true"
              effect: "NoExecute"
            labels:
              role.scaffolding/egw-node: "true"
          - name: ng-amd64-heapster
            instanceTypes:
            - ${{ steps.vars.outputs.node_instance_type }}
            desiredCapacity: 1
            spot: false
            privateNetworking: true
            volumeType: "gp3"
            volumeSize: 20
            ssh:
              allow: true
              publicKeyPath: ${{ env.AWS_SSH_KEY }}
            taints:
            - key: "node.cilium.io/agent-not-ready"
              value: "true"
              effect: "NoExecute"
            labels:
              role.scaffolding/monitoring: "true"
          - name: ng-amd64-no-cilium
            instanceTypes:
            - ${{ steps.vars.outputs.node_instance_type }}
            desiredCapacity: 1
            spot: false
            privateNetworking: true
            volumeType: "gp3"
            volumeSize: 20
            ssh:
              allow: true
              publicKeyPath: ${{ env.AWS_SSH_KEY }}
            taints:
            - key: "cilium.io/no-schedule"
              value: "true"
              effect: "NoSchedule"
            labels:
              cilium.io/no-schedule: "true"
          EOF

          eksctl create cluster -f ./eks-config.yaml

      - name: Install Cilium CLI
        uses: cilium/cilium-cli@a4936ec2afa58bf755162928456190344a179207 # v0.16.24
        with:
          skip-build: ${{ env.CILIUM_CLI_SKIP_BUILD }}
          image-repo: ${{ env.CILIUM_CLI_IMAGE_REPO }}
          image-tag: ${{ inputs.SHA || github.sha }}

      # Warning: since this is a privileged workflow, subsequent workflow job
      # steps must take care not to execute untrusted code.
      - name: Checkout context ref (NOT TRUSTED)
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          # Same key and casing as written by the "vars" step.
          ref: ${{ steps.vars.outputs.sha }}
          persist-credentials: false
          path: untrusted
          sparse-checkout: |
            install/kubernetes/cilium

      - name: Install Cilium
        # The sysdump steps below test steps.install-cilium.outcome, so this
        # step must carry that id for those conditions to be meaningful.
        id: install-cilium
        run: |
          cilium install --dry-run-helm-values ${{ steps.vars.outputs.cilium_install_defaults }}
          cilium install ${{ steps.vars.outputs.cilium_install_defaults }}

      - name: Delete context ref
        run: |
          rm -rf untrusted/

      - name: Wait for Cilium status to be ready
        run: |
          cilium status --wait --interactive=false

      - name: Run preflight steps
        shell: bash
        working-directory: ./.github/actions/cl2-modules/egw
        env:
          EGW_IMAGE_TAG: ${{ env.egw_utils_ref }}
        run: |
          if [[ "${{ matrix.test_type }}" == "md-bs" ]]; then
            ./preflight.sh baseline
          else
            ./preflight.sh
          fi

          cat ./manifests/*

      - name: Run CL2
        id: run-cl2
        working-directory: ./perf-tests/clusterloader2
        shell: bash
        timeout-minutes: 40
        env:
          CL2_PROMETHEUS_PVC_ENABLED: "false"
          CL2_ENABLE_PVS: "false"
          CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: "true"
          CL2_PROMETHEUS_SCRAPE_CILIUM_AGENT: "true"
          CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: "2.0"
          CL2_PROMETHEUS_NODE_SELECTOR: 'role.scaffolding/monitoring: "true"'
          CL2_NUM_EGW_CLIENTS: "${{ steps.vars.outputs.num_client_pods }}"
          CL2_EGW_CLIENTS_QPS: "${{ steps.vars.outputs.client_qps }}"
          CL2_MEDIAN_BOOTSTRAP_THRESHOLD: "80" # Takes a bit for ENI interfaces to be added
        run: |
          mkdir ./report

          # CL2 hardcodes module paths to live in next to the config, even
          # if the path given is relative.
          cp ../../.github/actions/cl2-modules/cilium-agent-pprofs.yaml ../../.github/actions/cl2-modules/egw
          cp ../../.github/actions/cl2-modules/cilium-metrics.yaml ../../.github/actions/cl2-modules/egw
          echo \
            '{"CL2_ADDITIONAL_MEASUREMENT_MODULES": ["./cilium-agent-pprofs.yaml", "./cilium-metrics.yaml"]}' \
            > modules.yaml

          go run ./cmd/clusterloader.go \
            -v=2 \
            --testconfig=../../.github/actions/cl2-modules/egw/config.yaml \
            --prometheus-additional-monitors-path=../../.github/actions/cl2-modules/egw/prom-extra-podmons \
            --provider=aws \
            --enable-prometheus-server \
            --tear-down-prometheus-server=false \
            --report-dir=./report \
            --experimental-prometheus-snapshot-to-report-dir=true \
            --kubeconfig=$HOME/.kube/config \
            --testoverrides=./testing/prometheus/not-scrape-kube-proxy.yaml \
            --testoverrides=./modules.yaml \
            2>&1 | tee cl2-output.txt

      - name: Features tested
        uses: ./.github/actions/feature-status
        with:
          title: "Summary of all features tested"
          json-filename: "features-${{ matrix.test_type }}"

      - name: Get sysdump
        if: ${{ always() && steps.install-cilium.outcome != 'skipped' && steps.install-cilium.outcome != 'cancelled' }}
        run: |
          cilium status
          cilium sysdump \
            --output-filename cilium-sysdump-final \
            --extra-label-selectors=app.kubernetes.io/name=egw-client \
            --extra-label-selectors=app.kubernetes.io/name=egw-external-target

          sudo chmod +r cilium-sysdump-final.zip

      - name: Upload sysdump
        if: ${{ !success() && steps.install-cilium.outcome != 'skipped' && steps.install-cilium.outcome != 'cancelled' }}
        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
        with:
          # Artifact names must be unique within a run with upload-artifact
          # v4, so include the matrix leg (as the features artifact does).
          name: cilium-sysdump-${{ matrix.test_type }}
          path: cilium-sysdump-final.zip
          retention-days: 5

      - name: Upload features tested
        if: ${{ always() }}
        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
        with:
          name: features-tested-${{ matrix.test_type }}
          path: features-*.json

      # Runs whenever cluster creation was attempted, even if later steps failed.
      - name: Cleanup cluster
        if: ${{ always() && steps.deploy-cluster.outcome != 'skipped' }}
        run: |
          eksctl delete cluster --name ${{ steps.vars.outputs.cluster_name }} --region ${{ steps.vars.outputs.eks_region }}

      - name: Export results and sysdump to GS bucket
        if: ${{ always() && steps.run-cl2.outcome != 'skipped' && steps.run-cl2.outcome != 'cancelled' }}
        uses: cilium/scale-tests-action/export-results@7d8cbd9a2f9d00697f5d2efd5cd064478aa327e0 # main
        with:
          test_name: ${{ steps.vars.outputs.test_name }}
          results_bucket: ${{ env.GCP_PERF_RESULTS_BUCKET }}
          artifacts: ./perf-tests/clusterloader2/report/*
          other_files: cilium-sysdump-final.zip ./perf-tests/clusterloader2/cl2-output.txt

  # Always runs after the scale test job and reports that job's result as the
  # commit status.
  commit-status-final:
    if: ${{ always() }}
    name: Commit Status Final
    needs: install-and-scaletest
    runs-on: ubuntu-24.04
    steps:
      - name: Set final commit status
        uses: myrotvorets/set-commit-status-action@3730c0a348a2ace3c110851bed53331bc6406e9f # v2.0.1
        with:
          sha: ${{ inputs.SHA || github.sha }}
          status: ${{ needs.install-and-scaletest.result }}