Skip to content

Commit

Permalink
add initial perf tests for 100 RayCluster and 100 RayJob (#2102)
Browse files Browse the repository at this point in the history
Signed-off-by: Andrew Sy Kim <[email protected]>
  • Loading branch information
andrewsykim authored May 1, 2024
1 parent f27e4ac commit 33a9b24
Show file tree
Hide file tree
Showing 9 changed files with 341 additions and 0 deletions.
58 changes: 58 additions & 0 deletions benchmark/perf-tests/100-raycluster/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# clusterloader2 test config: creates 100 RayClusters
# (10 namespaces x 10 RayClusters per namespace) and gathers pod startup
# latency for pods labelled app.kubernetes.io/created-by=kuberay-operator.
name: kuberay
namespace:
  number: 10
tuningSets:
  # Referenced by the "Creating Ray clusters" phase below.
  - name: Uniform100qps
    qpsLoad:
      qps: 100
steps:
  # Measurements are started before any object is created so that they
  # observe every pod from its creation.
  - name: Start measurements
    measurements:
      - Identifier: PodStartupLatency
        Method: PodStartupLatency
        Params:
          action: start
          labelSelector: app.kubernetes.io/created-by = kuberay-operator
          threshold: 5m
      - Identifier: WaitForControlledPodsRunning
        Method: WaitForControlledPodsRunning
        Params:
          action: start
          apiVersion: ray.io/v1
          kind: RayCluster
          labelSelector: app.kubernetes.io/created-by = kuberay-operator
          operationTimeout: 120s
  # Namespaces 1..10, 10 replicas each => 100 RayClusters in total.
  - name: Creating Ray clusters
    phases:
      - namespaceRange:
          min: 1
          max: 10
        replicasPerNamespace: 10
        tuningSet: Uniform100qps
        objectBundle:
          - basename: raycluster
            objectTemplatePath: raycluster.yaml
            templateFillMap:
              # Fills {{.Replicas}} in raycluster.yaml.
              Replicas: 1
  # The script path is relative to the directory clusterloader2 is run from.
  - name: Wait for RayClusters ready
    measurements:
      - Identifier: WaitForRayCluster
        Method: Exec
        Params:
          timeout: 10m
          command:
            - "bash"
            - "100-raycluster/wait-for-rayclusters.sh"
  - name: Wait for pods to be running
    measurements:
      - Identifier: WaitForControlledPodsRunning
        Method: WaitForControlledPodsRunning
        Params:
          action: gather
          operationTimeout: 10m
  - name: Measure pod startup latency
    measurements:
      - Identifier: PodStartupLatency
        Method: PodStartupLatency
        Params:
          action: gather
64 changes: 64 additions & 0 deletions benchmark/perf-tests/100-raycluster/raycluster.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# RayCluster object template rendered by clusterloader2.
# {{.Name}} is filled in per generated object; {{.Replicas}} comes from the
# templateFillMap in config.yaml.
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: {{.Name}}
spec:
  rayVersion: '2.9.3'
  headGroupSpec:
    serviceType: ClusterIP
    rayStartParams:
      dashboard-host: '0.0.0.0'
    template:
      spec:
        containers:
          - name: ray-head
            image: rayproject/ray:2.9.3
            ports:
              - containerPort: 6379
                name: gcs
              - containerPort: 8265
                name: dashboard
              - containerPort: 10001
                name: client
            lifecycle:
              preStop:
                exec:
                  command: ["/bin/sh", "-c", "ray stop"]
            volumeMounts:
              - mountPath: /tmp/ray
                name: ray-logs
            resources:
              limits:
                cpu: "1"
              requests:
                cpu: "100m"
        volumes:
          - name: ray-logs
            emptyDir: {}
  workerGroupSpecs:
    - replicas: {{.Replicas}}
      minReplicas: 1
      maxReplicas: 10
      # Logical name of this worker group ("small-group" here); it may also
      # describe the group's function.
      groupName: small-group
      rayStartParams: {}
      template:
        spec:
          containers:
            - name: ray-worker
              image: rayproject/ray:2.9.3
              lifecycle:
                preStop:
                  exec:
                    command: ["/bin/sh", "-c", "ray stop"]
              volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
              resources:
                limits:
                  cpu: "1"
                requests:
                  cpu: "100m"
          volumes:
            - name: ray-logs
              emptyDir: {}
9 changes: 9 additions & 0 deletions benchmark/perf-tests/100-raycluster/results/junit.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<testsuite name="ClusterLoaderV2" tests="0" failures="0" errors="0" time="58.451">
<testcase name="kuberay overall (github.com/ray-project/kuberary/benchmark/perf-tests/100-raycluster/config.yaml)" classname="ClusterLoaderV2" time="58.449897441"></testcase>
<testcase name="kuberay: [step: 01] Start measurements [00] - PodStartupLatency" classname="ClusterLoaderV2" time="0.271670737"></testcase>
<testcase name="kuberay: [step: 01] Start measurements [01] - WaitForControlledPodsRunning" classname="ClusterLoaderV2" time="0.673679139"></testcase>
<testcase name="kuberay: [step: 02] Creating Ray clusters" classname="ClusterLoaderV2" time="1.112338422"></testcase>
<testcase name="kuberay: [step: 03] Wait for pods to be running [00] - WaitForControlledPodsRunning" classname="ClusterLoaderV2" time="5.199491235"></testcase>
<testcase name="kuberay: [step: 04] Measure pod startup latency [00] - PodStartupLatency" classname="ClusterLoaderV2" time="0.477699884"></testcase>
</testsuite>
17 changes: 17 additions & 0 deletions benchmark/perf-tests/100-raycluster/wait-for-rayclusters.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
#
# Polls every 5 seconds until at least $expect_succeeded RayClusters, across
# all namespaces, report `.status.state` == "ready".
#
# The script has no timeout of its own; it is run from the clusterloader2
# `Exec` measurement in config.yaml, which sets `timeout: 10m`.

expect_succeeded=100
echo "waiting for $expect_succeeded RayClusters to become ready"

while true; do
    # Emit only the state, one per line, and count whole-line matches (-x) so
    # that a RayCluster whose *name* contains "ready" is not miscounted.
    # `grep -c` prints 0 on empty input, so the variable is always numeric.
    num_succeeded=$(kubectl get raycluster -A -o jsonpath='{range .items[*]}{.status.state}{"\n"}{end}' | grep -cx ready)
    echo "$num_succeeded RayClusters ready..."

    # Use >= rather than ==: if more RayClusters than expected are ready
    # (e.g. leftovers from a previous run) an equality test would never match.
    if [[ "$num_succeeded" -ge "$expect_succeeded" ]]; then
        break
    fi

    sleep 5
done

echo "$num_succeeded RayClusters ready!"
68 changes: 68 additions & 0 deletions benchmark/perf-tests/100-rayjob/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# clusterloader2 test config: creates 100 RayJobs
# (10 namespaces x 10 RayJobs per namespace), waits for them to complete and
# gathers pod startup and job lifecycle latency for objects labelled
# app.kubernetes.io/created-by=kuberay-operator.
name: kuberay
namespace:
  number: 10
tuningSets:
  # Referenced by the "Creating RayJobs" phase below.
  - name: Uniform100qps
    qpsLoad:
      qps: 100
steps:
  # Measurements are started before any object is created so that they
  # observe every pod/job from its creation.
  - name: Start measurements
    measurements:
      - Identifier: PodStartupLatency
        Method: PodStartupLatency
        Params:
          action: start
          labelSelector: app.kubernetes.io/created-by = kuberay-operator
          threshold: 5m
      - Identifier: WaitForControlledPodsRunning
        Method: WaitForControlledPodsRunning
        Params:
          action: start
          apiVersion: ray.io/v1
          kind: RayCluster
          labelSelector: app.kubernetes.io/created-by = kuberay-operator
          operationTimeout: 120s
      - Identifier: JobLifecycleLatency
        Method: JobLifecycleLatency
        Params:
          action: start
          labelSelector: app.kubernetes.io/created-by = kuberay-operator
          threshold: 5m
  # Namespaces 1..10, 10 replicas each => 100 RayJobs in total.
  - name: Creating RayJobs
    phases:
      - namespaceRange:
          min: 1
          max: 10
        replicasPerNamespace: 10
        tuningSet: Uniform100qps
        objectBundle:
          - basename: rayjob
            objectTemplatePath: rayjob.yaml
  # The script path is relative to the directory clusterloader2 is run from.
  - name: Wait for RayJobs complete
    measurements:
      - Identifier: WaitForRayJob
        Method: Exec
        Params:
          timeout: 10m
          command:
            - "bash"
            - "100-rayjob/wait-for-rayjobs.sh"
  - name: Wait for pods to be running
    measurements:
      - Identifier: WaitForControlledPodsRunning
        Method: WaitForControlledPodsRunning
        Params:
          action: gather
          operationTimeout: 10m
  - name: Measure pod startup latency
    measurements:
      - Identifier: PodStartupLatency
        Method: PodStartupLatency
        Params:
          action: gather
  - name: Measure job finished
    measurements:
      - Identifier: JobLifecycleLatency
        Method: JobLifecycleLatency
        Params:
          action: gather
49 changes: 49 additions & 0 deletions benchmark/perf-tests/100-rayjob/rayjob.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# RayJob object template rendered by clusterloader2; {{.Name}} is filled in
# per generated object. Each RayJob defines its own RayCluster
# (rayClusterSpec) and runs a one-line entrypoint that prints the cluster
# resources.
apiVersion: ray.io/v1
kind: RayJob
metadata:
  name: {{.Name}}
spec:
  entrypoint: python -c "import ray; ray.init(); print(ray.cluster_resources())"
  shutdownAfterJobFinishes: true
  rayClusterSpec:
    rayVersion: '2.9.3'
    headGroupSpec:
      rayStartParams:
        dashboard-host: '0.0.0.0'
      template:
        spec:
          containers:
            - name: ray-head
              image: rayproject/ray:2.9.3
              ports:
                - containerPort: 6379
                  name: gcs-server
                - containerPort: 8265
                  name: dashboard
                - containerPort: 10001
                  name: client
              resources:
                limits:
                  cpu: "1"
                requests:
                  cpu: "100m"
    workerGroupSpecs:
      - replicas: 1
        minReplicas: 1
        maxReplicas: 5
        groupName: small-group
        rayStartParams: {}
        template:
          spec:
            containers:
              - name: ray-worker
                image: rayproject/ray:2.9.3
                lifecycle:
                  preStop:
                    exec:
                      command: ["/bin/sh", "-c", "ray stop"]
                resources:
                  limits:
                    cpu: "1"
                  requests:
                    cpu: "100m"
13 changes: 13 additions & 0 deletions benchmark/perf-tests/100-rayjob/results/junit.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<testsuite name="ClusterLoaderV2" tests="0" failures="0" errors="0" time="284.804">
<testcase name="kuberay overall (100-rayjob/config.yaml)" classname="ClusterLoaderV2" time="284.80261448"></testcase>
<testcase name="kuberay: [step: 01] Start measurements [00] - PodStartupLatency" classname="ClusterLoaderV2" time="0.106826602"></testcase>
<testcase name="kuberay: [step: 01] Start measurements [01] - WaitForControlledPodsRunning" classname="ClusterLoaderV2" time="0.207335286"></testcase>
<testcase name="kuberay: [step: 01] Start measurements [02] - JobLifecycleLatency" classname="ClusterLoaderV2" time="0.106730692"></testcase>
<testcase name="kuberay: [step: 02] Creating RayJobs" classname="ClusterLoaderV2" time="1.059487968"></testcase>
<testcase name="kuberay: [step: 03] Wait for RayJobs complete [00] - WaitForRayJob" classname="ClusterLoaderV2" time="217.399873864"></testcase>
<testcase name="kuberay: [step: 04] Wait for pods to be running [00] - WaitForControlledPodsRunning" classname="ClusterLoaderV2" time="5.011879337"></testcase>
<testcase name="kuberay: [step: 05] Measure pod startup latency [00] - PodStartupLatency" classname="ClusterLoaderV2" time="0.04856601"></testcase>
<testcase name="kuberay: [step: 06] Measure job finished [00] - JobLifecycleLatency" classname="ClusterLoaderV2" time="1.001760404"></testcase>
</testsuite>

17 changes: 17 additions & 0 deletions benchmark/perf-tests/100-rayjob/wait-for-rayjobs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
#
# Polls every 5 seconds until at least $expect_succeeded RayJobs, across all
# namespaces, report `.status.jobStatus` == "SUCCEEDED".
#
# The script has no timeout of its own; it is run from the clusterloader2
# `Exec` measurement in config.yaml, which sets `timeout: 10m`.
# NOTE(review): a RayJob that ends in a non-SUCCEEDED state is never counted,
# so the loop keeps polling; presumably the Exec timeout is what ends the
# test in that case -- confirm.

expect_succeeded=100
echo "waiting for $expect_succeeded RayJobs to be completed successfully"

while true; do
    # Emit only the job status, one per line, and count whole-line matches
    # (-x) so that a RayJob whose *name* contains "SUCCEEDED" is not
    # miscounted. `grep -c` prints 0 on empty input, so the variable is
    # always numeric.
    num_succeeded=$(kubectl get rayjob -A -o jsonpath='{range .items[*]}{.status.jobStatus}{"\n"}{end}' | grep -cx SUCCEEDED)
    echo "$num_succeeded RayJobs completed..."

    # Use >= rather than ==: if more RayJobs than expected have succeeded
    # (e.g. leftovers from a previous run) an equality test would never match.
    if [[ "$num_succeeded" -ge "$expect_succeeded" ]]; then
        break
    fi

    sleep 5
done

echo "$num_succeeded RayJobs completed!"
46 changes: 46 additions & 0 deletions benchmark/perf-tests/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# KubeRay Performance Tests

This directory contains a collection of large scale KubeRay tests using [clusterloader2](https://github.com/kubernetes/perf-tests/tree/master/clusterloader2).
clusterloader2 is a Kubernetes load testing tool by [SIG Scalability](https://github.com/kubernetes/community/blob/master/sig-scalability) used for Kubernetes scalability and performance testing.

## Running clusterloader2 tests

First, clone the perf-tests repository and compile the clusterloader2 binary:
```
git clone git@github.com:kubernetes/perf-tests.git
cd perf-tests/clusterloader2
go build -o clusterloader2 ./cmd
```

Run the following command to run clusterloader2 against one of the test folders. In this example we'll run the test configured in the [100-raycluster](./100-raycluster/) folder.
```
clusterloader2 --provider=<provider-name> --kubeconfig=<path to kubeconfig> --testconfig=100-raycluster/config.yaml
```

## Tests & Results

Each directory contains a test scenario and its clusterloader2 configuration. Each directory also contains a `results` subdirectory with the junit.xml files generated by clusterloader2
for previously executed runs of the tests.

The current tests are:
* [100 RayCluster test](./100-raycluster/)
* [100 RayJob test](./100-rayjob/)


## Run a performance test with Kind

You can test clusterloader2 configs using Kind.

First create a kind cluster:
```
kind create cluster --image=kindest/node:v1.27.3
```

Install KubeRay:
```
helm install kuberay-operator kuberay/kuberay-operator --version 1.1.0
```

Run a clusterloader2 test:
```
clusterloader2 --provider kind --kubeconfig ~/.kube/config --testconfig ./100-raycluster/config.yaml
```

0 comments on commit 33a9b24

Please sign in to comment.