# load-slos.yaml (forked from awslabs/kubernetes-iteration-toolkit)
---
apiVersion: tekton.dev/v1beta1
kind: Task
metadata:
  name: load-slos
  namespace: scalability
spec:
description: "clusterloader2 task to run various types of cl2 tests on a given cluster."
params:
- name: giturl
description: "git url to clone the package"
default: https://github.com/kubernetes/perf-tests.git
- name: cl2-branch
description: "The branch of clusterloader2 you want to use"
default: "master"
- name: nodes-per-namespace
description: "nodes per namespace to get created for load test "
default: "100"
- name: cl2-load-test-throughput
description: " throughput used for mutate operations"
default: "15"
- name: pods-per-node
description: "pod density"
default: "10"
- name: nodes
description: "number of dataplane nodes to run the load test against"
default: "1000"
- name: results-bucket
description: "Results bucket with path of s3 to upload results"
- name: region
default: "us-west-2"
description: The region where the cluster is in.
- name: cluster-name
description: The name of the EKS cluster you want to spin.
- name: amp-workspace-id
description: The AMP workspace ID where remote write needs to happen.
default: ""
  results:
  - name: datapoint
    description: Stores the CL2 result that can be consumed by other tasks (e.g. cloudwatch)
  - name: s3_result
    description: Stores the S3 result path after compute
  workspaces:
  - name: source
    mountPath: /src/k8s.io/
  - name: results
  - name: config
    mountPath: /config/
  stepTemplate:
    env:
    - name: KUBECONFIG
      value: /config/kubeconfig
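  # Every step talks to the target cluster through the kubeconfig that the
  # "config" workspace mounts at /config/kubeconfig.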
  steps:
  - name: git-clone
    image: alpine/git
    workingDir: $(workspaces.source.path)
    script: |
      git clone $(params.giturl)
      cd $(workspaces.source.path)/perf-tests/
      git fetch origin --verbose --tags
      git checkout $(params.cl2-branch)
      git branch
  - name: prepare-loadtest
    image: golang:1.22
    workingDir: $(workspaces.source.path)
    script: |
      S3_RESULT_PATH=$(params.results-bucket)
      echo $S3_RESULT_PATH > $(results.s3_result.path)
      echo "S3 Path: $S3_RESULT_PATH"
      cat > "$(workspaces.source.path)/overrides.yaml" <<EOL
      NODES_PER_NAMESPACE: $(params.nodes-per-namespace)
      CL2_LOAD_TEST_THROUGHPUT: $(params.cl2-load-test-throughput)
      CL2_DELETE_TEST_THROUGHPUT: $(params.cl2-load-test-throughput)
      CL2_SCHEDULER_THROUGHPUT_THRESHOLD: 90
      PODS_PER_NODE: $(params.pods-per-node)
      CL2_USE_HOST_NETWORK_PODS: false
      # we are not testing PVs at this point
      CL2_ENABLE_PVS: false
      ENABLE_SYSTEM_POD_METRICS: false
      NODE_MODE: master
      CL2_DISABLE_DAEMONSETS: true
      CL2_RATE_LIMIT_POD_CREATION: false
      # enable SLO-related CL2 vars
      CL2_NETWORK_LATENCY_THRESHOLD: 20s
      CL2_ENABLE_VIOLATIONS_FOR_NETWORK_PROGRAMMING_LATENCIES: true
      CL2_NETWORK_PROGRAMMING_LATENCY_THRESHOLD: 300s
      CL2_ENABLE_DNSTESTS: false
      CL2_USE_ADVANCED_DNSTEST: false
      CL2_ENABLE_API_AVAILABILITY_MEASUREMENT: true
      CL2_API_AVAILABILITY_PERCENTAGE_THRESHOLD: 99.5
      CL2_ALLOWED_SLOW_API_CALLS: 1
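      # pin prometheus to the dedicated monitoring nodegroup and scale up
      # its memory for large-cluster scrapes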
      CL2_PROMETHEUS_NODE_SELECTOR: "eks.amazonaws.com/nodegroup: monitoring-$(params.cluster-name)-nodes-1"
      CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: 4
      CL2_ENABLE_CLUSTER_OOMS_TRACKER: false
      EOL
      cat $(workspaces.source.path)/overrides.yaml
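      # Keep a copy in the results workspace so the overrides get uploaded
      # to S3 alongside the test results.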
      cp $(workspaces.source.path)/overrides.yaml $(workspaces.results.path)/overrides.yaml
      # Enable Prometheus if the remote workspace id is provided
      if [ -n "$(params.amp-workspace-id)" ]; then
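      # Append a SigV4 proxy sidecar to the Prometheus spec; it signs the
      # remote-write requests so that AMP accepts them.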
      cat << EOF >> $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/prometheus-prometheus.yaml
        containers:
        - name: aws-sigv4-proxy-sidecar
          image: public.ecr.aws/aws-observability/aws-sigv4-proxy:1.0
          args:
          - --name
          - aps
          - --region
          - $(params.region)
          - --host
          - aps-workspaces.$(params.region).amazonaws.com
          - --port
          - :8005
          ports:
          - name: aws-sigv4-proxy
            containerPort: 8005
        remoteWrite:
        - url: http://localhost:8005/workspaces/$(params.amp-workspace-id)/api/v1/remote_write
          queueConfig:
            capacity: 2500
            maxSamplesPerSend: 1000
            maxShards: 200
        externalLabels:
          cluster_name: $(params.cluster-name)
          s3_path: $S3_RESULT_PATH
      EOF
      cat $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/prometheus-prometheus.yaml
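      # Let the operator pod schedule onto the tainted monitoring nodes.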
      cat << EOF >> $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/0prometheus-operator-deployment.yaml
            tolerations:
            - key: monitoring
              operator: Exists
              effect: NoSchedule
      EOF
      cat $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/0prometheus-operator-deployment.yaml
      # schedule the kube-state-metrics pod onto the same nodes as prometheus
      cat $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/exporters/kube-state-metrics/deployment.yaml
      cat << EOF >> $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/exporters/kube-state-metrics/deployment.yaml
            tolerations:
            - key: monitoring
              operator: Exists
              effect: NoSchedule
      EOF
      cat $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/exporters/kube-state-metrics/deployment.yaml
      fi
      # Build the clusterloader2 binary
      cd $(workspaces.source.path)/perf-tests/clusterloader2/
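      # CGO_ENABLED=0 produces a statically linked binary, so it can run in
      # the alpine-based image used by the run-loadtest step below.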
      GOOS=linux CGO_ENABLED=0 go build -v -o ./clusterloader ./cmd
  - name: run-loadtest
    image: alpine/k8s:1.30.2
    onError: continue
    script: |
      #!/bin/bash
      if [ -n "$(params.amp-workspace-id)" ]; then
        # Enable prometheus flags
        export ENABLE_PROMETHEUS_SERVER=true
        export PROMETHEUS_PVC_STORAGE_CLASS=gp2
        export PROMETHEUS_SCRAPE_KUBE_PROXY=true
        export PROMETHEUS_SCRAPE_APISERVER_ONLY=true
        export PROMETHEUS_SCRAPE_KUBE_STATE_METRICS=false
        export PROMETHEUS_KUBE_PROXY_SELECTOR_KEY=k8s-app
        export PROMETHEUS_MEMORY_REQUEST=16Gi
      fi
      cat $(workspaces.source.path)/perf-tests/clusterloader2/testing/load/config.yaml
      cd $(workspaces.source.path)/perf-tests/clusterloader2/
      # [ToDo] To temporarily stop the bleeding, we delete these.
      # Related issue - https://github.com/kubernetes/kubernetes/issues/126578
      kubectl --kubeconfig=$KUBECONFIG delete servicemonitor kube-dns -n monitoring --ignore-not-found
      kubectl --kubeconfig=$KUBECONFIG delete servicemonitor coredns -n monitoring --ignore-not-found
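      # ENABLE_EXEC_SERVICE=false skips deploying clusterloader2's exec
      # service pods, which this test does not use.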
      ENABLE_EXEC_SERVICE=false ./clusterloader --kubeconfig=$KUBECONFIG --testconfig=$(workspaces.source.path)/perf-tests/clusterloader2/testing/load/config.yaml --testoverrides=$(workspaces.source.path)/overrides.yaml --nodes=$(params.nodes) --provider=eks --report-dir=$(workspaces.results.path) --alsologtostderr --v=2
      exit_code=$?
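      # Record pass/fail in the datapoint result (1 = pass, 0 = fail) so
      # downstream tasks (e.g. cloudwatch) can consume it.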
      if [ $exit_code -eq 0 ]; then
        echo "1" | tee $(results.datapoint.path)
      else
        echo "0" | tee $(results.datapoint.path)
      fi
      exit $exit_code
    timeout: 30000s
  - name: upload-results
    image: amazon/aws-cli
    workingDir: $(workspaces.results.path)
    script: |
      S3_RESULT_PATH=$(cat $(results.s3_result.path))
      echo "S3 Path: $S3_RESULT_PATH"
      aws sts get-caller-identity
      # we expect all files that clusterloader2 outputs for the load test to be in this dir
      ls -larth
      aws s3 cp . s3://$S3_RESULT_PATH/ --recursive
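# Example TaskRun for this task, shown as a commented-out sketch. The cluster
# name, bucket path, and workspace bindings below are illustrative assumptions,
# not part of the task; the "config" workspace must supply a "kubeconfig" key
# so it mounts at /config/kubeconfig as the stepTemplate expects.
#
# apiVersion: tekton.dev/v1beta1
# kind: TaskRun
# metadata:
#   name: load-slos-run
#   namespace: scalability
# spec:
#   taskRef:
#     name: load-slos
#   params:
#   - name: cluster-name
#     value: my-eks-cluster
#   - name: results-bucket
#     value: my-bucket/results/load-slos
#   workspaces:
#   - name: source
#     emptyDir: {}
#   - name: results
#     emptyDir: {}
#   - name: config
#     secret:
#       secretName: my-eks-cluster-kubeconfig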