Skip to content

Commit 3c44ba0

Browse files
authored
feat: record last state transition times (#2053)
## Problem Statement My ML platform team runs the kuberay ray-operator. We want to measure the time it takes for RayCluster's to transition from their initial "unhealthy" state to some other state. This metric is important for us because our users want their RayClusters to start in a timely manner. It seems like neither the ray-operator nor RayClusters provide this info currently. ## Design Add a new `.status.stateTransitionTimes` field to the `RayCluster` custom resource. This field is a `map[ClusterState]*metav1.Time` that indicates the time of the last state transition for each state. This field is updated whenever the `.status.state` changes. * [original discussion doc](https://docs.google.com/document/d/14yPSZ9iLk7a0qEg14rNWr60Btz0HEeQ3oWKP-GN9QTM) * [related Slack thread](https://ray-distributed.slack.com/archives/C01CKH05XBN/p1709321264762029) * [example input and output RayClusters](https://gist.github.com/davidxia/205d2b23202356a2d3172c51e0912f35)
1 parent 9662bd9 commit 3c44ba0

File tree

13 files changed

+157
-14
lines changed

13 files changed

+157
-14
lines changed

helm-chart/kuberay-operator/crds/ray.io_rayclusters.yaml

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml

Lines changed: 10 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ray-operator/apis/ray/v1/raycluster_types.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,8 @@ type RayClusterStatus struct {
140140
// LastUpdateTime indicates last update timestamp for this cluster status.
141141
// +nullable
142142
LastUpdateTime *metav1.Time `json:"lastUpdateTime,omitempty"`
143+
// StateTransitionTimes indicates the time of the last state transition for each state.
144+
StateTransitionTimes map[ClusterState]*metav1.Time `json:"stateTransitionTimes,omitempty"`
143145
// Service Endpoints
144146
Endpoints map[string]string `json:"endpoints,omitempty"`
145147
// Head info

ray-operator/apis/ray/v1/zz_generated.deepcopy.go

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ray-operator/config/crd/bases/ray.io_rayclusters.yaml

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ray-operator/config/crd/bases/ray.io_rayjobs.yaml

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ray-operator/config/crd/bases/ray.io_rayservices.yaml

Lines changed: 10 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ray-operator/controllers/ray/raycluster_controller.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1252,6 +1252,13 @@ func (r *RayClusterReconciler) calculateStatus(ctx context.Context, instance *ra
12521252
timeNow := metav1.Now()
12531253
newInstance.Status.LastUpdateTime = &timeNow
12541254

1255+
if instance.Status.State != newInstance.Status.State {
1256+
if newInstance.Status.StateTransitionTimes == nil {
1257+
newInstance.Status.StateTransitionTimes = make(map[rayv1.ClusterState]*metav1.Time)
1258+
}
1259+
newInstance.Status.StateTransitionTimes[newInstance.Status.State] = &timeNow
1260+
}
1261+
12551262
return newInstance, nil
12561263
}
12571264

ray-operator/controllers/ray/raycluster_controller_test.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,15 @@ var _ = Context("Inside the default namespace", func() {
470470
getClusterState(ctx, namespace, rayCluster.Name),
471471
time.Second*3, time.Millisecond*500).Should(Equal(rayv1.Ready))
472472
})
473+
474+
It("RayCluster's .status.stateTransitionTimes should include a time for ready state", func() {
475+
Eventually(
476+
func() *metav1.Time {
477+
status := getClusterStatus(ctx, namespace, rayCluster.Name)()
478+
return status.StateTransitionTimes[rayv1.Ready]
479+
},
480+
time.Second*3, time.Millisecond*500).Should(Not(BeNil()))
481+
})
473482
})
474483

475484
Describe("RayCluster with a multi-host worker group", func() {

ray-operator/controllers/ray/raycluster_controller_unit_test.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1629,6 +1629,7 @@ func TestCalculateStatus(t *testing.T) {
16291629
},
16301630
Status: corev1.PodStatus{
16311631
PodIP: headNodeIP,
1632+
Phase: corev1.PodRunning,
16321633
},
16331634
}
16341635
runtimeObjects := []runtime.Object{headPod, headService}
@@ -1650,6 +1651,57 @@ func TestCalculateStatus(t *testing.T) {
16501651
assert.Equal(t, headNodeIP, newInstance.Status.Head.PodIP)
16511652
assert.Equal(t, headServiceIP, newInstance.Status.Head.ServiceIP)
16521653
assert.Equal(t, headService.Name, newInstance.Status.Head.ServiceName)
1654+
assert.NotNil(t, newInstance.Status.StateTransitionTimes, "Cluster state transition timestamp should be created")
1655+
assert.Equal(t, newInstance.Status.LastUpdateTime, newInstance.Status.StateTransitionTimes[rayv1.Ready])
1656+
}
1657+
1658+
func TestStateTransitionTimes_NoStateChange(t *testing.T) {
1659+
setupTest(t)
1660+
1661+
// Create a new scheme with CRDs, Pod, Service schemes.
1662+
newScheme := runtime.NewScheme()
1663+
_ = rayv1.AddToScheme(newScheme)
1664+
_ = corev1.AddToScheme(newScheme)
1665+
1666+
// Mock data
1667+
headServiceIP := "aaa.bbb.ccc.ddd"
1668+
headService, err := common.BuildServiceForHeadPod(context.Background(), *testRayCluster, nil, nil)
1669+
assert.Nil(t, err, "Failed to build head service.")
1670+
headService.Spec.ClusterIP = headServiceIP
1671+
// headService.Spec.cont
1672+
headPod := &corev1.Pod{
1673+
ObjectMeta: metav1.ObjectMeta{
1674+
Name: "headNode",
1675+
Namespace: namespaceStr,
1676+
Labels: map[string]string{
1677+
utils.RayClusterLabelKey: instanceName,
1678+
utils.RayNodeTypeLabelKey: string(rayv1.HeadNode),
1679+
},
1680+
},
1681+
Status: corev1.PodStatus{
1682+
PodIP: headNodeIP,
1683+
Phase: corev1.PodRunning,
1684+
},
1685+
}
1686+
runtimeObjects := []runtime.Object{headPod, headService}
1687+
1688+
// Initialize a fake client with newScheme and runtimeObjects.
1689+
fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(runtimeObjects...).Build()
1690+
ctx := context.Background()
1691+
1692+
// Initialize a RayCluster reconciler.
1693+
r := &RayClusterReconciler{
1694+
Client: fakeClient,
1695+
Recorder: &record.FakeRecorder{},
1696+
Scheme: scheme.Scheme,
1697+
}
1698+
1699+
preUpdateTime := metav1.Now()
1700+
testRayCluster.Status.State = rayv1.Ready
1701+
testRayCluster.Status.StateTransitionTimes = map[rayv1.ClusterState]*metav1.Time{rayv1.Ready: &preUpdateTime}
1702+
newInstance, err := r.calculateStatus(ctx, testRayCluster)
1703+
assert.Nil(t, err)
1704+
assert.Equal(t, preUpdateTime, *newInstance.Status.StateTransitionTimes[rayv1.Ready], "Cluster state transition timestamp should not be updated")
16531705
}
16541706

16551707
func Test_TerminatedWorkers_NoAutoscaler(t *testing.T) {

ray-operator/controllers/ray/suite_helpers_test.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,16 @@ func getClusterState(ctx context.Context, namespace string, clusterName string)
4949
}
5050
}
5151

52+
func getClusterStatus(ctx context.Context, namespace string, clusterName string) func() rayv1.RayClusterStatus {
53+
return func() rayv1.RayClusterStatus {
54+
var cluster rayv1.RayCluster
55+
if err := k8sClient.Get(ctx, client.ObjectKey{Namespace: namespace, Name: clusterName}, &cluster); err != nil {
56+
log.Fatal(err)
57+
}
58+
return cluster.Status
59+
}
60+
}
61+
5262
func isAllPodsRunningByFilters(ctx context.Context, podlist corev1.PodList, opt ...client.ListOption) bool {
5363
err := k8sClient.List(ctx, &podlist, opt...)
5464
gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "failed to list Pods")

ray-operator/pkg/client/applyconfiguration/ray/v1/rayclusterstatus.go

Lines changed: 29 additions & 14 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)