Commit 1788318
fix: change RayCluster .status.state to unhealthy if not all Pods running
Currently, once a RayCluster's `.status.state` is `ready`, it stays `ready` even if some of its Pods later stop running. This change makes the RayCluster controller set `.status.state` to `unhealthy` in that case.
1 parent 0288281 commit 1788318

2 files changed: +124 -25 lines

ray-operator/controllers/ray/raycluster_controller.go (2 additions, 0 deletions)

@@ -1214,6 +1214,8 @@ func (r *RayClusterReconciler) calculateStatus(ctx context.Context, instance *ra
 	} else {
 		if utils.CheckAllPodsRunning(ctx, runtimePods) {
 			newInstance.Status.State = rayv1.Ready
+		} else {
+			newInstance.Status.State = rayv1.Unhealthy
 		}
 	}
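
For context, the branch above hinges on `utils.CheckAllPodsRunning`, whose implementation is not part of this diff. A minimal sketch of the behavior the tests below imply — the signature and exact rules are assumptions inferred from the test cases, not the real utils package:

// Hypothetical sketch of utils.CheckAllPodsRunning; inferred from the test
// expectations below, not copied from the actual KubeRay implementation.
package utils

import (
	"context"

	corev1 "k8s.io/api/core/v1"
)

func CheckAllPodsRunning(ctx context.Context, pods corev1.PodList) bool {
	// An empty Pod list counts as "not all running": a cluster with no Pods
	// must not be reported as ready (see the "there aren't any Pods" case).
	if len(pods.Items) == 0 {
		return false
	}
	for _, pod := range pods.Items {
		// Every Pod must have reached the Running phase ...
		if pod.Status.Phase != corev1.PodRunning {
			return false
		}
		// ... and must not carry a Ready condition whose status is not True.
		for _, cond := range pod.Status.Conditions {
			if cond.Type == corev1.PodReady && cond.Status != corev1.ConditionTrue {
				return false
			}
		}
	}
	return true
}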

ray-operator/controllers/ray/raycluster_controller_fake_test.go (122 additions, 25 deletions)

@@ -314,6 +314,11 @@ func setupTest(t *testing.T) {
 							},
 						},
 					},
+					Resources: corev1.ResourceRequirements{
+						Requests: corev1.ResourceList{
+							corev1.ResourceMemory: resource.MustParse("100000000"),
+						},
+					},
 				},
 			},
 		},
@@ -1570,38 +1575,130 @@ func TestCalculateStatus(t *testing.T) {
 	headService, err := common.BuildServiceForHeadPod(context.Background(), *testRayCluster, nil, nil)
 	assert.Nil(t, err, "Failed to build head service.")
 	headService.Spec.ClusterIP = headServiceIP
-	headPod := &corev1.Pod{
-		ObjectMeta: metav1.ObjectMeta{
-			Name:      "headNode",
-			Namespace: namespaceStr,
-			Labels: map[string]string{
-				utils.RayClusterLabelKey:  instanceName,
-				utils.RayNodeTypeLabelKey: string(rayv1.HeadNode),
+
+	tests := map[string]struct {
+		pods                  []*corev1.Pod
+		expectedState         rayv1.ClusterState
+		expectedHeadNodeIP    string
+		expectedHeadServiceIP string
+	}{
+		".status.state should be ready if all Pods have .status.phase Running": {
+			pods: []*corev1.Pod{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "headNode",
+						Namespace: namespaceStr,
+						Labels: map[string]string{
+							utils.RayClusterLabelKey:  instanceName,
+							utils.RayNodeTypeLabelKey: string(rayv1.HeadNode),
+						},
+					},
+					Status: corev1.PodStatus{
+						PodIP: headNodeIP,
+						Phase: corev1.PodRunning,
+					},
+				},
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "workerNode",
+						Namespace: namespaceStr,
+						Labels: map[string]string{
+							utils.RayClusterLabelKey:  instanceName,
+							utils.RayNodeTypeLabelKey: string(rayv1.WorkerNode),
+						},
+					},
+					Status: corev1.PodStatus{
+						Phase: corev1.PodRunning,
+					},
+				},
 			},
+			expectedState:         rayv1.Ready,
+			expectedHeadNodeIP:    headNodeIP,
+			expectedHeadServiceIP: headServiceIP,
 		},
-		Status: corev1.PodStatus{
-			PodIP: headNodeIP,
+		".status.state should be unhealthy if there aren't any Pods": {
+			pods:                  []*corev1.Pod{},
+			expectedState:         rayv1.Unhealthy,
+			expectedHeadNodeIP:    "",
+			expectedHeadServiceIP: headServiceIP,
+		},
+		".status.state should be unhealthy if any Pods don't have .status.phase Running": {
+			pods: []*corev1.Pod{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "headNode",
+						Namespace: namespaceStr,
+						Labels: map[string]string{
+							utils.RayClusterLabelKey:  instanceName,
+							utils.RayNodeTypeLabelKey: string(rayv1.HeadNode),
+						},
+					},
+					Status: corev1.PodStatus{
+						PodIP: headNodeIP,
+						Phase: corev1.PodPending,
+					},
+				},
+			},
+			expectedState:         rayv1.Unhealthy,
+			expectedHeadNodeIP:    headNodeIP,
+			expectedHeadServiceIP: headServiceIP,
+		},
+		".status.state should be unhealthy if any Pods have a .status.condition of type: Ready that's not status: True": {
+			pods: []*corev1.Pod{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "headNode",
+						Namespace: namespaceStr,
+						Labels: map[string]string{
+							utils.RayClusterLabelKey:  instanceName,
+							utils.RayNodeTypeLabelKey: string(rayv1.HeadNode),
+						},
+					},
+					Status: corev1.PodStatus{
+						PodIP: headNodeIP,
+						Phase: corev1.PodPending,
+						Conditions: []corev1.PodCondition{
+							{
+								Type:   corev1.PodReady,
+								Status: corev1.ConditionFalse,
+							},
+						},
+					},
+				},
+			},
+			expectedState:         rayv1.Unhealthy,
+			expectedHeadNodeIP:    headNodeIP,
+			expectedHeadServiceIP: headServiceIP,
 		},
 	}
-	runtimeObjects := []runtime.Object{headPod, headService}
 
-	// Initialize a fake client with newScheme and runtimeObjects.
-	fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(runtimeObjects...).Build()
-	ctx := context.Background()
+	for name, tc := range tests {
+		t.Run(name, func(t *testing.T) {
+			runtimeObjects := []runtime.Object{headService}
+			for _, pod := range tc.pods {
+				runtimeObjects = append(runtimeObjects, pod)
+			}
 
-	// Initialize a RayCluster reconciler.
-	r := &RayClusterReconciler{
-		Client:   fakeClient,
-		Recorder: &record.FakeRecorder{},
-		Scheme:   scheme.Scheme,
-		Log:      ctrl.Log.WithName("controllers").WithName("RayCluster"),
-	}
+			// Initialize a fake client with newScheme and runtimeObjects.
+			fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(runtimeObjects...).Build()
+			ctx := context.Background()
 
-	// Test head information
-	newInstance, err := r.calculateStatus(ctx, testRayCluster)
-	assert.Nil(t, err)
-	assert.Equal(t, headNodeIP, newInstance.Status.Head.PodIP)
-	assert.Equal(t, headServiceIP, newInstance.Status.Head.ServiceIP)
+			// Initialize a RayCluster reconciler.
+			r := &RayClusterReconciler{
+				Client:   fakeClient,
+				Recorder: &record.FakeRecorder{},
+				Scheme:   scheme.Scheme,
+				Log:      ctrl.Log.WithName("controllers").WithName("RayCluster"),
+			}
+
+			// Test head information
+			newInstance, err := r.calculateStatus(ctx, testRayCluster)
+			assert.Nil(t, err)
+			assert.Equal(t, tc.expectedHeadNodeIP, newInstance.Status.Head.PodIP)
+			assert.Equal(t, tc.expectedHeadServiceIP, newInstance.Status.Head.ServiceIP)
+			assert.Equal(t, tc.expectedState, newInstance.Status.State)
+		})
+	}
 }
 
 func Test_TerminatedWorkers_NoAutoscaler(t *testing.T) {
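
For reference, `rayv1.Unhealthy` above is a `ClusterState` value. A minimal sketch of how these constants are presumably declared in the ray-operator API types — the string values are assumptions, not shown in this diff, and the real package may define additional states:

// Hypothetical sketch of the rayv1 cluster states referenced in this commit;
// the actual declarations live in the ray-operator API package.
type ClusterState string

const (
	Ready     ClusterState = "ready"     // all Pods are running and ready
	Unhealthy ClusterState = "unhealthy" // some Pod is missing, pending, or not ready
)

The new table-driven cases can be run in isolation with `go test -run TestCalculateStatus ./controllers/ray/` from the `ray-operator` module root.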
