Skip to content

Commit b9d8b1a

Browse files
authored
[RayCluster] Make headpod name deterministic (#3028)
1 parent a860884 commit b9d8b1a

File tree

7 files changed

+37
-36
lines changed

7 files changed

+37
-36
lines changed

kubectl-plugin/test/e2e/kubectl_ray_log_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ var _ = Describe("Calling ray plugin `log` command on Ray Cluster", func() {
2929

3030
It("succeed in retrieving all ray cluster logs", func() {
3131
expectedDirPath := "./raycluster-kuberay"
32-
expectedOutputStringFormat := `No output directory specified, creating dir under current directory using resource name\.\nCommand set to retrieve both head and worker node logs\.\nDownloading log for Ray Node raycluster-kuberay-head-\w+\nDownloading log for Ray Node raycluster-kuberay-workergroup-worker-\w+`
32+
expectedOutputStringFormat := `No output directory specified, creating dir under current directory using resource name\.\nCommand set to retrieve both head and worker node logs\.\nDownloading log for Ray Node raycluster-kuberay-head\nDownloading log for Ray Node raycluster-kuberay-workergroup-worker-\w+`
3333

3434
cmd := exec.Command("kubectl", "ray", "log", "--namespace", namespace, "raycluster-kuberay", "--node-type", "all")
3535
output, err := cmd.CombinedOutput()
@@ -84,7 +84,7 @@ var _ = Describe("Calling ray plugin `log` command on Ray Cluster", func() {
8484

8585
It("succeed in retrieving ray cluster head logs", func() {
8686
expectedDirPath := "./raycluster-kuberay"
87-
expectedOutputStringFormat := `No output directory specified, creating dir under current directory using resource name\.\nCommand set to retrieve only head node logs\.\nDownloading log for Ray Node raycluster-kuberay-head-\w+`
87+
expectedOutputStringFormat := `No output directory specified, creating dir under current directory using resource name\.\nCommand set to retrieve only head node logs\.\nDownloading log for Ray Node raycluster-kuberay-head`
8888

8989
cmd := exec.Command("kubectl", "ray", "log", "--namespace", namespace, "raycluster-kuberay", "--node-type", "head")
9090
output, err := cmd.CombinedOutput()
@@ -191,7 +191,7 @@ var _ = Describe("Calling ray plugin `log` command on Ray Cluster", func() {
191191

192192
It("succeed in retrieving ray cluster logs within designated directory", func() {
193193
expectedDirPath := "./temporary-directory"
194-
expectedOutputStringFormat := `Command set to retrieve both head and worker node logs\.\nDownloading log for Ray Node raycluster-kuberay-head-\w+\nDownloading log for Ray Node raycluster-kuberay-workergroup-worker-\w+`
194+
expectedOutputStringFormat := `Command set to retrieve both head and worker node logs\.\nDownloading log for Ray Node raycluster-kuberay-head\nDownloading log for Ray Node raycluster-kuberay-workergroup-worker-\w+`
195195

196196
err := os.MkdirAll(expectedDirPath, 0o755)
197197
Expect(err).NotTo(HaveOccurred())

ray-operator/controllers/ray/common/pod.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ func DefaultHeadPodTemplate(ctx context.Context, instance rayv1.RayCluster, head
165165
// headPort is passed into setMissingRayStartParams but unused there for the head pod.
166166
// To mitigate this awkwardness and reduce code redundancy, unify head and worker pod configuration logic.
167167
podTemplate := headSpec.Template
168-
podTemplate.GenerateName = podName
168+
podTemplate.Name = podName
169169
// Pods created by RayCluster should be restricted to the namespace of the RayCluster.
170170
// This ensures privilege of KubeRay users are contained within the namespace of the RayCluster.
171171
podTemplate.ObjectMeta.Namespace = instance.Namespace

ray-operator/controllers/ray/raycluster_controller.go

Lines changed: 13 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -733,24 +733,18 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
733733
return errstd.Join(utils.ErrFailedCreateHeadPod, err)
734734
}
735735
common.SuccessfulClustersCounterInc(instance.Namespace)
736-
} else if len(headPods.Items) > 1 {
737-
logger.Info("reconcilePods: Found more than one head Pods; deleting extra head Pods.", "nHeadPods", len(headPods.Items))
738-
// TODO (kevin85421): In-place update may not be a good idea.
739-
itemLength := len(headPods.Items)
740-
for index := 0; index < itemLength; index++ {
741-
if headPods.Items[index].Status.Phase == corev1.PodRunning || headPods.Items[index].Status.Phase == corev1.PodPending {
742-
headPods.Items[index] = headPods.Items[len(headPods.Items)-1] // Replace healthy pod at index i with the last element from the list of pods to delete.
743-
headPods.Items = headPods.Items[:len(headPods.Items)-1] // Truncate slice.
744-
itemLength--
745-
}
746-
}
747-
// delete all the extra head pod pods
748-
for _, extraHeadPodToDelete := range headPods.Items {
749-
if err := r.Delete(ctx, &extraHeadPodToDelete); err != nil {
750-
return errstd.Join(utils.ErrFailedDeleteHeadPod, err)
751-
}
752-
r.rayClusterScaleExpectation.ExpectScalePod(extraHeadPodToDelete.Namespace, instance.Name, expectations.HeadGroup, extraHeadPodToDelete.Name, expectations.Delete)
736+
} else if len(headPods.Items) > 1 { // This should never happen. This protects against the case that users manually create headpod.
737+
correctHeadPodName := instance.Name + "-head"
738+
headPodNames := make([]string, len(headPods.Items))
739+
for i, pod := range headPods.Items {
740+
headPodNames[i] = pod.Name
753741
}
742+
743+
logger.Info("Multiple head pods found, it should only exist one head pod. Please delete extra head pods.",
744+
"found pods", headPodNames,
745+
"should only leave", correctHeadPodName,
746+
)
747+
return fmt.Errorf("%d head pods found %v. Please delete extra head pods and leave only the head pod with name %s", len(headPods.Items), headPodNames, correctHeadPodName)
754748
}
755749

756750
// Reconcile worker pods now
@@ -1092,7 +1086,7 @@ func (r *RayClusterReconciler) createWorkerPod(ctx context.Context, instance ray
10921086
// buildHeadPod builds the head Pod for the given RayCluster instance.
10931087
func (r *RayClusterReconciler) buildHeadPod(ctx context.Context, instance rayv1.RayCluster) corev1.Pod {
10941088
logger := ctrl.LoggerFrom(ctx)
1095-
podName := utils.PodGenerateName(instance.Name, rayv1.HeadNode)
1089+
podName := utils.PodName(instance.Name, rayv1.HeadNode, false)
10961090
fqdnRayIP := utils.GenerateFQDNServiceName(ctx, instance, instance.Namespace) // Fully Qualified Domain Name
10971091
// The Ray head port used by workers to connect to the cluster (GCS server port for Ray >= 1.11.0, Redis port for older Ray.)
10981092
headPort := common.GetHeadPort(instance.Spec.HeadGroupSpec.RayStartParams)
@@ -1119,7 +1113,7 @@ func getCreatorCRDType(instance rayv1.RayCluster) utils.CRDType {
11191113
// buildWorkerPod builds a worker Pod for the given worker group of the RayCluster instance.
11201114
func (r *RayClusterReconciler) buildWorkerPod(ctx context.Context, instance rayv1.RayCluster, worker rayv1.WorkerGroupSpec) corev1.Pod {
11211115
logger := ctrl.LoggerFrom(ctx)
1122-
podName := utils.PodGenerateName(fmt.Sprintf("%s-%s", instance.Name, worker.GroupName), rayv1.WorkerNode)
1116+
podName := utils.PodName(fmt.Sprintf("%s-%s", instance.Name, worker.GroupName), rayv1.WorkerNode, true)
11231117
fqdnRayIP := utils.GenerateFQDNServiceName(ctx, instance, instance.Namespace) // Fully Qualified Domain Name
11241118

11251119
// The Ray head port used by workers to connect to the cluster (GCS server port for Ray >= 1.11.0, Redis port for older Ray.)

ray-operator/controllers/ray/utils/util.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -165,10 +165,10 @@ func CheckRouteName(ctx context.Context, s string, n string) string {
165165
return CheckName(s)
166166
}
167167

168-
// PodGenerateName returns the value that should be used for a Pod's generateName
168+
// PodName returns the value that should be used for a Pod's Name or, when isGenerateName is true, for its GenerateName (with a trailing dash appended),
169169
// based on the RayCluster name and node type (head or worker).
170-
func PodGenerateName(prefix string, nodeType rayv1.RayNodeType) string {
171-
maxPrefixLength := 50 // 63 - (max(8,6) + 5 ) // 6 to 8 char are consumed at the end with "-head-" or -worker- + 5 generated.
170+
func PodName(prefix string, nodeType rayv1.RayNodeType, isGenerateName bool) string {
171+
maxPrefixLength := 50 // 63 - ( 8 + 5 ) // 8 char are consumed at the end with "-worker-" + 5 generated.
172172

173173
var podPrefix string
174174
if len(prefix) <= maxPrefixLength {
@@ -177,7 +177,11 @@ func PodGenerateName(prefix string, nodeType rayv1.RayNodeType) string {
177177
podPrefix = prefix[:maxPrefixLength]
178178
}
179179

180-
return strings.ToLower(podPrefix + DashSymbol + string(nodeType) + DashSymbol)
180+
result := strings.ToLower(podPrefix + DashSymbol + string(nodeType))
181+
if isGenerateName {
182+
result += DashSymbol
183+
}
184+
return result
181185
}
182186

183187
// CheckName makes sure the name does not start with a numeric value and the total length is < 63 char

ray-operator/controllers/ray/utils/util_test.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ func TestCheckAllPodsRunning(t *testing.T) {
103103
}
104104
}
105105

106-
func TestPodGenerateName(t *testing.T) {
106+
func TestPodName(t *testing.T) {
107107
tests := []struct {
108108
name string
109109
prefix string
@@ -114,7 +114,7 @@ func TestPodGenerateName(t *testing.T) {
114114
name: "short cluster name, head pod",
115115
prefix: "ray-cluster-01",
116116
nodeType: rayv1.HeadNode,
117-
expected: "ray-cluster-01-head-",
117+
expected: "ray-cluster-01-head",
118118
},
119119
{
120120
name: "short cluster name, worker pod",
@@ -126,7 +126,7 @@ func TestPodGenerateName(t *testing.T) {
126126
name: "long cluster name, head pod",
127127
prefix: "ray-cluster-0000000000000000000000011111111122222233333333333333",
128128
nodeType: rayv1.HeadNode,
129-
expected: "ray-cluster-00000000000000000000000111111111222222-head-",
129+
expected: "ray-cluster-00000000000000000000000111111111222222-head",
130130
},
131131
{
132132
name: "long cluster name, worker pod",
@@ -138,11 +138,12 @@ func TestPodGenerateName(t *testing.T) {
138138

139139
for _, test := range tests {
140140
t.Run(test.name, func(t *testing.T) {
141-
str := PodGenerateName(test.prefix, test.nodeType)
141+
isPodNameGenerated := test.nodeType == rayv1.WorkerNode // HeadPod name is now fixed
142+
str := PodName(test.prefix, test.nodeType, isPodNameGenerated)
142143
if str != test.expected {
143144
t.Logf("expected: %q", test.expected)
144145
t.Logf("actual: %q", str)
145-
t.Error("PodGenerateName returned an unexpected string")
146+
t.Error("PodName returned an unexpected string")
146147
}
147148

148149
// 63 (max pod name length) - 5 random hexadecimal characters from generateName

ray-operator/test/e2e/raycluster_gcs_ft_test.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,14 +109,16 @@ func TestRayClusterGCSFaultTolerence(t *testing.T) {
109109
err = test.Client().Core().CoreV1().Pods(namespace.Name).Delete(test.Ctx(), headPod.Name, metav1.DeleteOptions{})
110110
g.Expect(err).NotTo(HaveOccurred())
111111

112-
testPodNameChanged := func(p *corev1.Pod) bool { return p.Name != headPod.Name }
112+
PodUID := func(p *corev1.Pod) string { return string(p.UID) }
113113
g.Eventually(HeadPod(test, rayCluster), TestTimeoutMedium).
114-
Should(WithTransform(testPodNameChanged, Equal(true)))
114+
ShouldNot(WithTransform(PodUID, Equal(string(headPod.UID)))) // Use UID to check if the new head pod is created.
115115

116116
g.Eventually(HeadPod(test, rayCluster), TestTimeoutMedium).
117117
Should(WithTransform(PodState, Equal("Running")))
118118

119-
headPod, _ = GetHeadPod(test, rayCluster)
119+
headPod, err = GetHeadPod(test, rayCluster) // Replace the old head pod
120+
g.Expect(err).NotTo(HaveOccurred())
121+
120122
expectedOutput = "4"
121123

122124
ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "samples/test_detached_actor_2.py", rayNamespace, expectedOutput})

ray-operator/test/e2erayservice/rayservice_redeploy_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ func TestRedeployRayServe(t *testing.T) {
7171
g.Expect(err).NotTo(HaveOccurred())
7272
g.Expect(endpoints.Subsets).To(HaveLen(1))
7373
g.Expect(endpoints.Subsets[0].Addresses).To(HaveLen(1))
74-
g.Expect(endpoints.Subsets[0].Addresses[0].TargetRef.Name).NotTo(Equal(oldHeadPod.Name))
74+
g.Expect(endpoints.Subsets[0].Addresses[0].TargetRef.UID).NotTo(Equal(oldHeadPod.UID))
7575
}, TestTimeoutMedium).Should(Succeed())
7676

7777
LogWithTimestamp(test.T(), "Waiting for RayService %s/%s to running", rayService.Namespace, rayService.Name)

0 commit comments

Comments
 (0)