# load-slos.yaml (forked from awslabs/kubernetes-iteration-toolkit)
---
apiVersion: tekton.dev/v1beta1
kind: Task
metadata:
  name: load-slos
  namespace: scalability
spec:
description: "clusterloader2 task to run various types of cl2 tests on a given cluster."
params:
- name: giturl
description: "git url to clone the package"
default: https://github.com/kubernetes/perf-tests.git
- name: cl2-branch
description: "The branch of clusterloader2 you want to use"
default: "master"
- name: nodes-per-namespace
description: "nodes per namespace to get created for load test "
default: "100"
- name: cl2-load-test-throughput
description: " throughput used for mutate operations"
default: "15"
- name: pods-per-node
description: "pod density"
default: "10"
- name: nodes
description: "number of dataplane nodes to run the load test against"
default: "1000"
- name: results-bucket
description: "Results bucket with path of s3 to upload results"
- name: region
default: "us-west-2"
description: The region where the cluster is in.
- name: cluster-name
description: The name of the EKS cluster you want to spin.
- name: amp-workspace-id
description: The AMP workspace ID where remote write needs to happen.
default: ""
  results:
  - name: datapoint
    description: Stores the CL2 result that can be consumed by other tasks (e.g. cloudwatch)
  - name: s3_result
    description: Stores the S3 result path after compute
  workspaces:
  - name: source
    mountPath: /src/k8s.io/
  - name: results
  - name: config
    mountPath: /config/
  stepTemplate:
    env:
    - name: KUBECONFIG
      value: /config/kubeconfig
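  # Every step talks to the target cluster through the kubeconfig that the
  # "config" workspace mounts at /config/kubeconfig.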
  steps:
  - name: git-clone
    image: alpine/git
    workingDir: $(workspaces.source.path)
    script: |
      git clone $(params.giturl)
      cd $(workspaces.source.path)/perf-tests/
      git fetch origin --verbose --tags
      git checkout $(params.cl2-branch)
      git branch
  - name: prepare-loadtest
    image: golang:1.22
    workingDir: $(workspaces.source.path)
    script: |
      S3_RESULT_PATH=$(params.results-bucket)
      echo $S3_RESULT_PATH > $(results.s3_result.path)
      echo "S3 Path: $S3_RESULT_PATH"
      cat > "$(workspaces.source.path)/overrides.yaml" <<EOL
      NODES_PER_NAMESPACE: $(params.nodes-per-namespace)
      CL2_LOAD_TEST_THROUGHPUT: $(params.cl2-load-test-throughput)
      CL2_DELETE_TEST_THROUGHPUT: $(params.cl2-load-test-throughput)
      CL2_SCHEDULER_THROUGHPUT_THRESHOLD: 90
      PODS_PER_NODE: $(params.pods-per-node)
      CL2_USE_HOST_NETWORK_PODS: false
      # we are not testing PVs at this point
      CL2_ENABLE_PVS: false
      ENABLE_SYSTEM_POD_METRICS: false
      NODE_MODE: master
      CL2_DISABLE_DAEMONSETS: true
      CL2_RATE_LIMIT_POD_CREATION: false
      # enable SLO-related CL2 vars
      CL2_NETWORK_LATENCY_THRESHOLD: 20s
      CL2_ENABLE_VIOLATIONS_FOR_NETWORK_PROGRAMMING_LATENCIES: true
      CL2_NETWORK_PROGRAMMING_LATENCY_THRESHOLD: 300s
      CL2_ENABLE_DNSTESTS: false
      CL2_USE_ADVANCED_DNSTEST: false
      CL2_ENABLE_API_AVAILABILITY_MEASUREMENT: true
      CL2_API_AVAILABILITY_PERCENTAGE_THRESHOLD: 99.5
      CL2_ALLOWED_SLOW_API_CALLS: 1
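      # pin prometheus to the dedicated monitoring nodegroup and scale up
      # its memory for large-cluster scrapes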
      CL2_PROMETHEUS_NODE_SELECTOR: "eks.amazonaws.com/nodegroup: monitoring-$(params.cluster-name)-nodes-1"
      CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: 4
      CL2_ENABLE_CLUSTER_OOMS_TRACKER: false
      EOL
      cat $(workspaces.source.path)/overrides.yaml
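      # Keep a copy in the results workspace so the overrides get uploaded
      # to S3 alongside the test results.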
      cp $(workspaces.source.path)/overrides.yaml $(workspaces.results.path)/overrides.yaml
      # Enable Prometheus if the remote workspace id is provided
      if [ -n "$(params.amp-workspace-id)" ]; then
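      # Append a SigV4 proxy sidecar to the Prometheus spec; it signs the
      # remote-write requests so that AMP accepts them.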
      cat << EOF >> $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/prometheus-prometheus.yaml
        containers:
        - name: aws-sigv4-proxy-sidecar
          image: public.ecr.aws/aws-observability/aws-sigv4-proxy:1.0
          args:
          - --name
          - aps
          - --region
          - $(params.region)
          - --host
          - aps-workspaces.$(params.region).amazonaws.com
          - --port
          - :8005
          ports:
          - name: aws-sigv4-proxy
            containerPort: 8005
        remoteWrite:
        - url: http://localhost:8005/workspaces/$(params.amp-workspace-id)/api/v1/remote_write
          queueConfig:
            capacity: 2500
            maxSamplesPerSend: 1000
            maxShards: 200
        externalLabels:
          cluster_name: $(params.cluster-name)
          s3_path: $S3_RESULT_PATH
      EOF
      cat $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/prometheus-prometheus.yaml
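      # Let the operator pod schedule onto the tainted monitoring nodes.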
      cat << EOF >> $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/0prometheus-operator-deployment.yaml
            tolerations:
            - key: monitoring
              operator: Exists
              effect: NoSchedule
      EOF
      cat $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/0prometheus-operator-deployment.yaml
      # schedule the kube-state-metrics pod onto the same nodes as prometheus
      cat $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/exporters/kube-state-metrics/deployment.yaml
      cat << EOF >> $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/exporters/kube-state-metrics/deployment.yaml
            tolerations:
            - key: monitoring
              operator: Exists
              effect: NoSchedule
      EOF
      cat $(workspaces.source.path)/perf-tests/clusterloader2/pkg/prometheus/manifests/exporters/kube-state-metrics/deployment.yaml
      fi
      # Build the clusterloader2 binary
      cd $(workspaces.source.path)/perf-tests/clusterloader2/
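      # CGO_ENABLED=0 produces a statically linked binary, so it can run in
      # the alpine-based image used by the run-loadtest step below.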
      GOOS=linux CGO_ENABLED=0 go build -v -o ./clusterloader ./cmd
  - name: run-loadtest
    image: alpine/k8s:1.30.2
    onError: continue
    script: |
      #!/bin/bash
      if [ -n "$(params.amp-workspace-id)" ]; then
        # Enable prometheus flags
        export ENABLE_PROMETHEUS_SERVER=true
        export PROMETHEUS_PVC_STORAGE_CLASS=gp2
        export PROMETHEUS_SCRAPE_KUBE_PROXY=true
        export PROMETHEUS_SCRAPE_APISERVER_ONLY=true
        export PROMETHEUS_SCRAPE_KUBE_STATE_METRICS=false
        export PROMETHEUS_KUBE_PROXY_SELECTOR_KEY=k8s-app
        export PROMETHEUS_MEMORY_REQUEST=16Gi
      fi
      cat $(workspaces.source.path)/perf-tests/clusterloader2/testing/load/config.yaml
      cd $(workspaces.source.path)/perf-tests/clusterloader2/
      # [ToDo] To temporarily stop the bleeding, we delete these.
      # Related issue - https://github.com/kubernetes/kubernetes/issues/126578
      kubectl --kubeconfig=$KUBECONFIG delete servicemonitor kube-dns -n monitoring --ignore-not-found
      kubectl --kubeconfig=$KUBECONFIG delete servicemonitor coredns -n monitoring --ignore-not-found
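      # ENABLE_EXEC_SERVICE=false skips deploying clusterloader2's exec
      # service pods, which this test does not use.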
      ENABLE_EXEC_SERVICE=false ./clusterloader --kubeconfig=$KUBECONFIG --testconfig=$(workspaces.source.path)/perf-tests/clusterloader2/testing/load/config.yaml --testoverrides=$(workspaces.source.path)/overrides.yaml --nodes=$(params.nodes) --provider=eks --report-dir=$(workspaces.results.path) --alsologtostderr --v=2
      exit_code=$?
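      # Record pass/fail in the datapoint result (1 = pass, 0 = fail) so
      # downstream tasks (e.g. cloudwatch) can consume it.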
      if [ $exit_code -eq 0 ]; then
        echo "1" | tee $(results.datapoint.path)
      else
        echo "0" | tee $(results.datapoint.path)
      fi
      exit $exit_code
    timeout: 30000s
  - name: upload-results
    image: amazon/aws-cli
    workingDir: $(workspaces.results.path)
    script: |
      S3_RESULT_PATH=$(cat $(results.s3_result.path))
      echo "S3 Path: $S3_RESULT_PATH"
      aws sts get-caller-identity
      # we expect all files that clusterloader2 outputs for the load test to be in this dir
      ls -larth
      aws s3 cp . s3://$S3_RESULT_PATH/ --recursive
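# Example TaskRun for this task, shown as a commented-out sketch. The cluster
# name, bucket path, and workspace bindings below are illustrative assumptions,
# not part of the task; the "config" workspace must supply a "kubeconfig" key
# so it mounts at /config/kubeconfig as the stepTemplate expects.
#
# apiVersion: tekton.dev/v1beta1
# kind: TaskRun
# metadata:
#   name: load-slos-run
#   namespace: scalability
# spec:
#   taskRef:
#     name: load-slos
#   params:
#   - name: cluster-name
#     value: my-eks-cluster
#   - name: results-bucket
#     value: my-bucket/results/load-slos
#   workspaces:
#   - name: source
#     emptyDir: {}
#   - name: results
#     emptyDir: {}
#   - name: config
#     secret:
#       secretName: my-eks-cluster-kubeconfig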