From 9662bd9221c26b9e004048475585be3d150817a6 Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Tue, 30 Apr 2024 22:29:24 -0700 Subject: [PATCH] Make k8s job backoff limit configurable for RayJob (#2091) --- docs/reference/api.md | 15 +++++++++++ .../kuberay-operator/crds/ray.io_rayjobs.yaml | 6 +++++ ray-operator/apis/ray/v1/rayjob_types.go | 7 ++++++ .../apis/ray/v1/zz_generated.deepcopy.go | 25 +++++++++++++++++++ .../config/crd/bases/ray.io_rayjobs.yaml | 6 +++++ .../controllers/ray/rayjob_controller.go | 6 ++++- .../applyconfiguration/ray/v1/rayjobspec.go | 9 +++++++ .../ray/v1/submitterconfig.go | 23 +++++++++++++++++ .../pkg/client/applyconfiguration/utils.go | 2 ++ 9 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 ray-operator/pkg/client/applyconfiguration/ray/v1/submitterconfig.go diff --git a/docs/reference/api.md b/docs/reference/api.md index a985e7c8a5c..dd29431ca00 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -160,6 +160,7 @@ _Appears in:_ | `entrypointNumCpus` _float_ | EntrypointNumCpus specifies the number of cpus to reserve for the entrypoint command. | | `entrypointNumGpus` _float_ | EntrypointNumGpus specifies the number of gpus to reserve for the entrypoint command. | | `entrypointResources` _string_ | EntrypointResources specifies the custom resources and quantities to reserve for the entrypoint command. | +| `submitterConfig` _[SubmitterConfig](#submitterconfig)_ | Configurations of submitter k8s job. | @@ -214,6 +215,20 @@ _Appears in:_ | `workersToDelete` _string array_ | WorkersToDelete workers to be deleted | +#### SubmitterConfig + + + + + +_Appears in:_ +- [RayJobSpec](#rayjobspec) + +| Field | Description | +| --- | --- | +| `backoffLimit` _integer_ | BackoffLimit of the submitter k8s job. | + + #### UpscalingMode _Underlying type:_ _string_ diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml index f4703dd49d0..37b37b334d6 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml @@ -7075,6 +7075,12 @@ spec: submissionMode: default: K8sJobMode type: string + submitterConfig: + properties: + backoffLimit: + format: int32 + type: integer + type: object submitterPodTemplate: properties: metadata: diff --git a/ray-operator/apis/ray/v1/rayjob_types.go b/ray-operator/apis/ray/v1/rayjob_types.go index 4d7a0dec057..9fa0e194e55 100644 --- a/ray-operator/apis/ray/v1/rayjob_types.go +++ b/ray-operator/apis/ray/v1/rayjob_types.go @@ -59,6 +59,11 @@ const ( HTTPMode JobSubmissionMode = "HTTPMode" // Submit job via HTTP request ) +type SubmitterConfig struct { + // BackoffLimit of the submitter k8s job. + BackoffLimit *int32 `json:"backoffLimit,omitempty"` +} + // RayJobSpec defines the desired state of RayJob type RayJobSpec struct { // INSERT ADDITIONAL SPEC FIELDS - desired state of cluster @@ -104,6 +109,8 @@ type RayJobSpec struct { // EntrypointResources specifies the custom resources and quantities to reserve for the // entrypoint command. EntrypointResources string `json:"entrypointResources,omitempty"` + // Configurations of submitter k8s job. + SubmitterConfig *SubmitterConfig `json:"submitterConfig,omitempty"` } // RayJobStatus defines the observed state of RayJob diff --git a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go index 75cd1cfeb5c..71567aa3164 100644 --- a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go +++ b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go @@ -375,6 +375,11 @@ func (in *RayJobSpec) DeepCopyInto(out *RayJobSpec) { *out = new(corev1.PodTemplateSpec) (*in).DeepCopyInto(*out) } + if in.SubmitterConfig != nil { + in, out := &in.SubmitterConfig, &out.SubmitterConfig + *out = new(SubmitterConfig) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RayJobSpec. @@ -584,6 +589,26 @@ func (in *ServeDeploymentStatus) DeepCopy() *ServeDeploymentStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *SubmitterConfig) DeepCopyInto(out *SubmitterConfig) { + *out = *in + if in.BackoffLimit != nil { + in, out := &in.BackoffLimit, &out.BackoffLimit + *out = new(int32) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SubmitterConfig. +func (in *SubmitterConfig) DeepCopy() *SubmitterConfig { + if in == nil { + return nil + } + out := new(SubmitterConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *WorkerGroupSpec) DeepCopyInto(out *WorkerGroupSpec) { *out = *in diff --git a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml index f4703dd49d0..37b37b334d6 100644 --- a/ray-operator/config/crd/bases/ray.io_rayjobs.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayjobs.yaml @@ -7075,6 +7075,12 @@ spec: submissionMode: default: K8sJobMode type: string + submitterConfig: + properties: + backoffLimit: + format: int32 + type: integer + type: object submitterPodTemplate: properties: metadata: diff --git a/ray-operator/controllers/ray/rayjob_controller.go b/ray-operator/controllers/ray/rayjob_controller.go index a0ad6c4dd37..6a9312fb5c8 100644 --- a/ray-operator/controllers/ray/rayjob_controller.go +++ b/ray-operator/controllers/ray/rayjob_controller.go @@ -422,6 +422,10 @@ func (r *RayJobReconciler) getSubmitterTemplate(ctx context.Context, rayJobInsta // createNewK8sJob creates a new Kubernetes Job. It returns an error. func (r *RayJobReconciler) createNewK8sJob(ctx context.Context, rayJobInstance *rayv1.RayJob, submitterTemplate corev1.PodTemplateSpec) error { logger := ctrl.LoggerFrom(ctx) + submitterBackoffLimit := pointer.Int32(2) + if rayJobInstance.Spec.SubmitterConfig != nil && rayJobInstance.Spec.SubmitterConfig.BackoffLimit != nil { + submitterBackoffLimit = rayJobInstance.Spec.SubmitterConfig.BackoffLimit + } job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ Name: rayJobInstance.Name, @@ -437,7 +441,7 @@ func (r *RayJobReconciler) createNewK8sJob(ctx context.Context, rayJobInstance * // is attempted 3 times at the maximum, but still mitigates the case of unrecoverable // application-level errors, where the maximum number of retries is reached, and the job // completion time increases with no benefits, but wasted resource cycles. - BackoffLimit: pointer.Int32(2), + BackoffLimit: submitterBackoffLimit, Template: submitterTemplate, }, } diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayjobspec.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayjobspec.go index 489c68ad363..868736c7379 100644 --- a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayjobspec.go +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayjobspec.go @@ -25,6 +25,7 @@ type RayJobSpecApplyConfiguration struct { EntrypointNumCpus *float32 `json:"entrypointNumCpus,omitempty"` EntrypointNumGpus *float32 `json:"entrypointNumGpus,omitempty"` EntrypointResources *string `json:"entrypointResources,omitempty"` + SubmitterConfig *SubmitterConfigApplyConfiguration `json:"submitterConfig,omitempty"` } // RayJobSpecApplyConfiguration constructs an declarative configuration of the RayJobSpec type for use with @@ -164,3 +165,11 @@ func (b *RayJobSpecApplyConfiguration) WithEntrypointResources(value string) *Ra b.EntrypointResources = &value return b } + +// WithSubmitterConfig sets the SubmitterConfig field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the SubmitterConfig field is set to the value of the last call. +func (b *RayJobSpecApplyConfiguration) WithSubmitterConfig(value *SubmitterConfigApplyConfiguration) *RayJobSpecApplyConfiguration { + b.SubmitterConfig = value + return b +} diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/submitterconfig.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/submitterconfig.go new file mode 100644 index 00000000000..ab7a53f355a --- /dev/null +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/submitterconfig.go @@ -0,0 +1,23 @@ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1 + +// SubmitterConfigApplyConfiguration represents an declarative configuration of the SubmitterConfig type for use +// with apply. +type SubmitterConfigApplyConfiguration struct { + BackoffLimit *int32 `json:"backoffLimit,omitempty"` +} + +// SubmitterConfigApplyConfiguration constructs an declarative configuration of the SubmitterConfig type for use with +// apply. +func SubmitterConfig() *SubmitterConfigApplyConfiguration { + return &SubmitterConfigApplyConfiguration{} +} + +// WithBackoffLimit sets the BackoffLimit field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the BackoffLimit field is set to the value of the last call. +func (b *SubmitterConfigApplyConfiguration) WithBackoffLimit(value int32) *SubmitterConfigApplyConfiguration { + b.BackoffLimit = &value + return b +} diff --git a/ray-operator/pkg/client/applyconfiguration/utils.go b/ray-operator/pkg/client/applyconfiguration/utils.go index 35cc7873662..79e0a8cc1fb 100644 --- a/ray-operator/pkg/client/applyconfiguration/utils.go +++ b/ray-operator/pkg/client/applyconfiguration/utils.go @@ -45,6 +45,8 @@ func ForKind(kind schema.GroupVersionKind) interface{} { return &rayv1.ScaleStrategyApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("ServeDeploymentStatus"): return &rayv1.ServeDeploymentStatusApplyConfiguration{} + case v1.SchemeGroupVersion.WithKind("SubmitterConfig"): + return &rayv1.SubmitterConfigApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("WorkerGroupSpec"): return &rayv1.WorkerGroupSpecApplyConfiguration{}