Skip to content

Commit ffac341

Browse files
authored
[RayCluster] Add serviceName to status.headInfo (#2089)
1 parent b8212e5 commit ffac341

16 files changed

+73
-34
lines changed

apiserver/pkg/server/ray_job_submission_service_server.go

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,7 @@ func (s *RayJobSubmissionServiceServer) SubmitRayJob(ctx context.Context, req *a
5151
}
5252
rayDashboardClient := s.dashboardClientFunc()
5353
// TODO: support proxy subresources in kuberay-apiserver
54-
if err := rayDashboardClient.InitClient(*url, nil); err != nil {
54+
if err := rayDashboardClient.InitClient(ctx, *url, nil); err != nil {
5555
return nil, err
5656
}
5757
request := &utils.RayJobRequest{Entrypoint: req.Jobsubmission.Entrypoint}
@@ -106,7 +106,7 @@ func (s *RayJobSubmissionServiceServer) GetJobDetails(ctx context.Context, req *
106106
}
107107
rayDashboardClient := s.dashboardClientFunc()
108108
// TODO: support proxy subresources in kuberay-apiserver
109-
if err := rayDashboardClient.InitClient(*url, nil); err != nil {
109+
if err := rayDashboardClient.InitClient(ctx, *url, nil); err != nil {
110110
return nil, err
111111
}
112112
nodeInfo, err := rayDashboardClient.GetJobInfo(ctx, req.Submissionid)
@@ -129,7 +129,7 @@ func (s *RayJobSubmissionServiceServer) GetJobLog(ctx context.Context, req *api.
129129
}
130130
rayDashboardClient := s.dashboardClientFunc()
131131
// TODO: support proxy subresources in kuberay-apiserver
132-
if err := rayDashboardClient.InitClient(*url, nil); err != nil {
132+
if err := rayDashboardClient.InitClient(ctx, *url, nil); err != nil {
133133
return nil, err
134134
}
135135
jlog, err := rayDashboardClient.GetJobLog(ctx, req.Submissionid)
@@ -152,7 +152,7 @@ func (s *RayJobSubmissionServiceServer) ListJobDetails(ctx context.Context, req
152152
}
153153
rayDashboardClient := s.dashboardClientFunc()
154154
// TODO: support proxy subresources in kuberay-apiserver
155-
if err := rayDashboardClient.InitClient(*url, nil); err != nil {
155+
if err := rayDashboardClient.InitClient(ctx, *url, nil); err != nil {
156156
return nil, err
157157
}
158158
nodesInfo, err := rayDashboardClient.ListJobs(ctx)
@@ -176,7 +176,7 @@ func (s *RayJobSubmissionServiceServer) StopRayJob(ctx context.Context, req *api
176176
}
177177
rayDashboardClient := s.dashboardClientFunc()
178178
// TODO: support proxy subresources in kuberay-apiserver
179-
if err := rayDashboardClient.InitClient(*url, nil); err != nil {
179+
if err := rayDashboardClient.InitClient(ctx, *url, nil); err != nil {
180180
return nil, err
181181
}
182182
err = rayDashboardClient.StopJob(ctx, req.Submissionid)
@@ -196,7 +196,7 @@ func (s *RayJobSubmissionServiceServer) DeleteRayJob(ctx context.Context, req *a
196196
}
197197
rayDashboardClient := s.dashboardClientFunc()
198198
// TODO: support proxy subresources in kuberay-apiserver
199-
if err := rayDashboardClient.InitClient(*url, nil); err != nil {
199+
if err := rayDashboardClient.InitClient(ctx, *url, nil); err != nil {
200200
return nil, err
201201
}
202202
err = rayDashboardClient.DeleteJob(ctx, req.Submissionid)

helm-chart/kuberay-operator/crds/ray.io_rayclusters.yaml

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ray-operator/apis/ray/v1/raycluster_types.go

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -153,8 +153,9 @@ type RayClusterStatus struct {
153153

154154
// HeadInfo gives info about head
155155
type HeadInfo struct {
156-
PodIP string `json:"podIP,omitempty"`
157-
ServiceIP string `json:"serviceIP,omitempty"`
156+
PodIP string `json:"podIP,omitempty"`
157+
ServiceIP string `json:"serviceIP,omitempty"`
158+
ServiceName string `json:"serviceName,omitempty"`
158159
}
159160

160161
// RayNodeType the type of a ray node: head/worker

ray-operator/config/crd/bases/ray.io_rayclusters.yaml

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ray-operator/config/crd/bases/ray.io_rayjobs.yaml

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ray-operator/config/crd/bases/ray.io_rayservices.yaml

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ray-operator/controllers/ray/raycluster_controller.go

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1309,21 +1309,21 @@ func (r *RayClusterReconciler) getHeadPodIP(ctx context.Context, instance *rayv1
13091309
return runtimePods.Items[0].Status.PodIP, nil
13101310
}
13111311

1312-
func (r *RayClusterReconciler) getHeadServiceIP(ctx context.Context, instance *rayv1.RayCluster) (string, error) {
1312+
func (r *RayClusterReconciler) getHeadServiceIPAndName(ctx context.Context, instance *rayv1.RayCluster) (string, string, error) {
13131313
runtimeServices := corev1.ServiceList{}
13141314
filterLabels := client.MatchingLabels(common.HeadServiceLabels(*instance))
13151315
if err := r.List(ctx, &runtimeServices, client.InNamespace(instance.Namespace), filterLabels); err != nil {
1316-
return "", err
1316+
return "", "", err
13171317
}
13181318
if len(runtimeServices.Items) < 1 {
1319-
return "", fmt.Errorf("unable to find head service. cluster name %s, filter labels %v", instance.Name, filterLabels)
1319+
return "", "", fmt.Errorf("unable to find head service. cluster name %s, filter labels %v", instance.Name, filterLabels)
13201320
} else if len(runtimeServices.Items) > 1 {
1321-
return "", fmt.Errorf("found multiple head services. cluster name %s, filter labels %v", instance.Name, filterLabels)
1321+
return "", "", fmt.Errorf("found multiple head services. cluster name %s, filter labels %v", instance.Name, filterLabels)
13221322
} else if runtimeServices.Items[0].Spec.ClusterIP == "" {
1323-
return "", fmt.Errorf("head service IP is empty. cluster name %s, filter labels %v", instance.Name, filterLabels)
1323+
return "", "", fmt.Errorf("head service IP is empty. cluster name %s, filter labels %v", instance.Name, filterLabels)
13241324
}
13251325

1326-
return runtimeServices.Items[0].Spec.ClusterIP, nil
1326+
return runtimeServices.Items[0].Spec.ClusterIP, runtimeServices.Items[0].Name, nil
13271327
}
13281328

13291329
func (r *RayClusterReconciler) updateEndpoints(ctx context.Context, instance *rayv1.RayCluster) error {
@@ -1374,10 +1374,11 @@ func (r *RayClusterReconciler) updateHeadInfo(ctx context.Context, instance *ray
13741374
instance.Status.Head.PodIP = ip
13751375
}
13761376

1377-
if ip, err := r.getHeadServiceIP(ctx, instance); err != nil {
1377+
if ip, name, err := r.getHeadServiceIPAndName(ctx, instance); err != nil {
13781378
return err
13791379
} else {
13801380
instance.Status.Head.ServiceIP = ip
1381+
instance.Status.Head.ServiceName = name
13811382
}
13821383

13831384
return nil

ray-operator/controllers/ray/raycluster_controller_unit_test.go

Lines changed: 11 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1355,7 +1355,7 @@ func TestGetHeadPodIP(t *testing.T) {
13551355
}
13561356
}
13571357

1358-
func TestGetHeadServiceIP(t *testing.T) {
1358+
func TestGetHeadServiceIPAndName(t *testing.T) {
13591359
setupTest(t)
13601360

13611361
headServiceIP := "1.2.3.4"
@@ -1379,21 +1379,25 @@ func TestGetHeadServiceIP(t *testing.T) {
13791379
tests := map[string]struct {
13801380
services []runtime.Object
13811381
expectedIP string
1382+
expectedName string
13821383
returnsError bool
13831384
}{
13841385
"get expected Service IP if there's one head Service": {
13851386
services: testServices,
13861387
expectedIP: headServiceIP,
1388+
expectedName: headService.Name,
13871389
returnsError: false,
13881390
},
13891391
"get error if there's no head Service": {
13901392
services: []runtime.Object{},
13911393
expectedIP: "",
1394+
expectedName: "",
13921395
returnsError: true,
13931396
},
13941397
"get error if there's more than one head Service": {
13951398
services: append(testServices, extraHeadService),
13961399
expectedIP: "",
1400+
expectedName: "",
13971401
returnsError: true,
13981402
},
13991403
}
@@ -1408,15 +1412,15 @@ func TestGetHeadServiceIP(t *testing.T) {
14081412
Scheme: scheme.Scheme,
14091413
}
14101414

1411-
ip, err := testRayClusterReconciler.getHeadServiceIP(context.TODO(), testRayCluster)
1412-
1415+
ip, name, err := testRayClusterReconciler.getHeadServiceIPAndName(context.TODO(), testRayCluster)
14131416
if tc.returnsError {
1414-
assert.NotNil(t, err, "getHeadServiceIP should return error")
1417+
assert.NotNil(t, err, "getHeadServiceIPAndName should return error")
14151418
} else {
1416-
assert.Nil(t, err, "getHeadServiceIP should not return error")
1419+
assert.Nil(t, err, "getHeadServiceIPAndName should not return error")
14171420
}
14181421

1419-
assert.Equal(t, tc.expectedIP, ip, "getHeadServiceIP returned unexpected IP")
1422+
assert.Equal(t, tc.expectedIP, ip, "getHeadServiceIPAndName returned unexpected IP")
1423+
assert.Equal(t, tc.expectedName, name, "getHeadServiceIPAndName returned unexpected name")
14201424
})
14211425
}
14221426
}
@@ -1645,6 +1649,7 @@ func TestCalculateStatus(t *testing.T) {
16451649
assert.Nil(t, err)
16461650
assert.Equal(t, headNodeIP, newInstance.Status.Head.PodIP)
16471651
assert.Equal(t, headServiceIP, newInstance.Status.Head.ServiceIP)
1652+
assert.Equal(t, headService.Name, newInstance.Status.Head.ServiceName)
16481653
}
16491654

16501655
func Test_TerminatedWorkers_NoAutoscaler(t *testing.T) {

ray-operator/controllers/ray/rayjob_controller.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -101,7 +101,7 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
101101
}
102102

103103
rayDashboardClient := r.dashboardClientFunc()
104-
err = rayDashboardClient.InitClient(rayJobInstance.Status.DashboardURL, rayClusterInstance)
104+
err = rayDashboardClient.InitClient(ctx, rayJobInstance.Status.DashboardURL, rayClusterInstance)
105105
if err != nil {
106106
logger.Error(err, "Failed to initialize dashboard client")
107107
}
@@ -218,7 +218,7 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
218218

219219
// Check the current status of ray jobs
220220
rayDashboardClient := r.dashboardClientFunc()
221-
if err := rayDashboardClient.InitClient(rayJobInstance.Status.DashboardURL, rayClusterInstance); err != nil {
221+
if err := rayDashboardClient.InitClient(ctx, rayJobInstance.Status.DashboardURL, rayClusterInstance); err != nil {
222222
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
223223
}
224224

ray-operator/controllers/ray/rayservice_controller.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1028,7 +1028,7 @@ func (r *RayServiceReconciler) updateStatusForActiveCluster(ctx context.Context,
10281028
}
10291029

10301030
rayDashboardClient := r.dashboardClientFunc()
1031-
if err := rayDashboardClient.InitClient(clientURL, rayClusterInstance); err != nil {
1031+
if err := rayDashboardClient.InitClient(ctx, clientURL, rayClusterInstance); err != nil {
10321032
return err
10331033
}
10341034

@@ -1081,7 +1081,7 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns
10811081
}
10821082

10831083
rayDashboardClient := r.dashboardClientFunc()
1084-
if err := rayDashboardClient.InitClient(clientURL, rayClusterInstance); err != nil {
1084+
if err := rayDashboardClient.InitClient(ctx, clientURL, rayClusterInstance); err != nil {
10851085
return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, false, err
10861086
}
10871087

ray-operator/controllers/ray/utils/dashboard_httpclient.go

Lines changed: 12 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ var (
3131
)
3232

3333
type RayDashboardClientInterface interface {
34-
InitClient(url string, rayCluster *rayv1.RayCluster) error
34+
InitClient(ctx context.Context, url string, rayCluster *rayv1.RayCluster) error
3535
UpdateDeployments(ctx context.Context, configJson []byte) error
3636
// V2/multi-app Rest API
3737
GetServeDetails(ctx context.Context) (*ServeDetails, error)
@@ -109,11 +109,18 @@ func FetchHeadServiceURL(ctx context.Context, cli client.Client, rayCluster *ray
109109
return headServiceURL, nil
110110
}
111111

112-
func (r *RayDashboardClient) InitClient(url string, rayCluster *rayv1.RayCluster) error {
112+
func (r *RayDashboardClient) InitClient(ctx context.Context, url string, rayCluster *rayv1.RayCluster) error {
113+
log := ctrl.LoggerFrom(ctx)
114+
113115
if r.useProxy {
114-
headSvcName, err := GenerateHeadServiceName(RayClusterCRD, rayCluster.Spec, rayCluster.Name)
115-
if err != nil {
116-
return err
116+
var err error
117+
headSvcName := rayCluster.Status.Head.ServiceName
118+
if headSvcName == "" {
119+
log.Info("RayCluster is missing .status.head.serviceName, calling GenerateHeadServiceName instead...", "RayCluster name", rayCluster.Name, "namespace", rayCluster.Namespace)
120+
headSvcName, err = GenerateHeadServiceName(RayClusterCRD, rayCluster.Spec, rayCluster.Name)
121+
if err != nil {
122+
return err
123+
}
117124
}
118125

119126
r.client = r.mgr.GetHTTPClient()

ray-operator/controllers/ray/utils/dashboard_httpclient_test.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,7 @@ var _ = Describe("RayFrameworkGenerator", func() {
5353
}
5454

5555
rayDashboardClient = &RayDashboardClient{}
56-
err := rayDashboardClient.InitClient("127.0.0.1:8090", nil)
56+
err := rayDashboardClient.InitClient(context.Background(), "127.0.0.1:8090", nil)
5757
Expect(err).To(BeNil())
5858
})
5959

ray-operator/controllers/ray/utils/fake_serve_httpclient.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@ type FakeRayDashboardClient struct {
1919

2020
var _ RayDashboardClientInterface = (*FakeRayDashboardClient)(nil)
2121

22-
func (r *FakeRayDashboardClient) InitClient(url string, rayCluster *rayv1.RayCluster) error {
22+
func (r *FakeRayDashboardClient) InitClient(ctx context.Context, url string, rayCluster *rayv1.RayCluster) error {
2323
r.client = &http.Client{}
2424
r.dashboardURL = "http://" + url
2525
return nil

ray-operator/pkg/client/applyconfiguration/ray/v1/headinfo.go

Lines changed: 11 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)