From e0e59ba808c99e9b56bd6b39b3937da22345bf54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=80=9A=E5=BB=B7?= Date: Wed, 22 Feb 2023 16:38:12 +0800 Subject: [PATCH 1/2] optimize et-operator for spot instance --- VERSION | 2 +- arena-artifacts/Chart.yaml | 4 +- .../kai.alibabacloud.com_scaleins.yaml | 10 +-- .../kai.alibabacloud.com_scaleouts.yaml | 10 +-- .../kai.alibabacloud.com_trainingjobs.yaml | 19 ++++++ arena-artifacts/charts/et-operator/Chart.yaml | 4 +- arena-artifacts/values.yaml | 10 +-- charts/etjob/templates/etjob.yaml | 20 ++++-- pkg/apis/types/submit.go | 5 ++ pkg/apis/types/submit_etjob.go | 11 +++- pkg/argsbuilder/const.go | 4 ++ pkg/argsbuilder/submit_etjob.go | 61 ++++++++++++++++++- pkg/model/model.go | 2 +- pkg/serving/update.go | 54 ++++++++++------ pkg/training/const.go | 2 + pkg/training/trainer_et.go | 17 +++++- .../arena/model/training/ETJobBuilder.java | 11 ++++ .../arenasdk/training/et_job_builder.py | 44 +++++++------ 18 files changed, 227 insertions(+), 63 deletions(-) diff --git a/VERSION b/VERSION index 2003b639c..965065db5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.9.2 +0.9.3 diff --git a/arena-artifacts/Chart.yaml b/arena-artifacts/Chart.yaml index 42f027f37..95fc2da15 100644 --- a/arena-artifacts/Chart.yaml +++ b/arena-artifacts/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: arena-artifacts -description: A Helm chart for installing arena dependencies +description: A Helm chart for installing arena dependencies # A chart can be either an 'application' or a 'library' chart. # @@ -41,7 +41,7 @@ dependencies: condition: cron.enabled,global.cron.enabled - name: et-operator alias: et - version: 0.1.0 + version: 0.1.1 repository: "@et-operator" condition: et.enabled,global.et.enabled - name: mpi-operator diff --git a/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_scaleins.yaml b/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_scaleins.yaml index 7a86d254c..9ddd58274 100644 --- a/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_scaleins.yaml +++ b/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_scaleins.yaml @@ -155,11 +155,12 @@ spec: description: ReplicaStatuses is map of ReplicaType and ReplicaStatus, specifies the status of each replica. type: object + restartCount: + description: The number of times the Job has been restarted + format: int32 + type: integer startTime: - description: Represents time when the job was acknowledged by the - job controller. It is not guaranteed to be set in happens-before - order across separate operations. It is represented in RFC3339 form - and is in UTC. + description: Represents time when the job was acknowledged by the job controller. It is not guaranteed to be set in happens-before order across separate operations. It is represented in RFC3339 form and is in UTC. format: date-time type: string toDeletePods: @@ -170,6 +171,7 @@ spec: required: - conditions - replicaStatuses + - restartCount type: object type: object served: true diff --git a/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_scaleouts.yaml b/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_scaleouts.yaml index 96c54c9e3..c77afe4e1 100644 --- a/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_scaleouts.yaml +++ b/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_scaleouts.yaml @@ -156,16 +156,18 @@ spec: description: ReplicaStatuses is map of ReplicaType and ReplicaStatus, specifies the status of each replica. type: object + restartCount: + description: The number of times the Job has been restarted + format: int32 + type: integer startTime: - description: Represents time when the job was acknowledged by the - job controller. It is not guaranteed to be set in happens-before - order across separate operations. It is represented in RFC3339 form - and is in UTC. + description: Represents time when the job was acknowledged by the job controller. It is not guaranteed to be set in happens-before order across separate operations. It is represented in RFC3339 form and is in UTC. format: date-time type: string required: - conditions - replicaStatuses + - restartCount type: object type: object served: true diff --git a/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_trainingjobs.yaml b/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_trainingjobs.yaml index 1d17a4957..5f23ef29d 100644 --- a/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_trainingjobs.yaml +++ b/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_trainingjobs.yaml @@ -45,6 +45,12 @@ spec: spec: description: TrainingJobSpec defines the desired state of TrainingJob properties: + backoffLimit: + default: 6 + description: Optional number of retries to execute script. + format: int32 + minimum: 0 + type: integer cleanPodPolicy: description: CleanPodPolicy defines the policy that whether to kill pods after the job completes. Defaults to None. @@ -13170,6 +13176,14 @@ spec: description: Specifies the mode when launcher attach to workers. available option is ssh / kubexec Defaults is kubexec. type: string + restartPolicy: + default: Never + description: Restart policy for training job One of OnFailure, Never. + Default to Never. + enum: + - Never + - OnFailure + type: string slotsPerWorker: description: Specifies the number of slots per worker used in hostfile. Defaults to 1. @@ -13258,6 +13272,10 @@ spec: description: ReplicaStatuses is map of ReplicaType and ReplicaStatus, specifies the status of each replica. type: object + restartCount: + description: The number of times the Job has been restarted + format: int32 + type: integer startTime: description: Represents time when the job was acknowledged by the job controller. It is not guaranteed to be set in happens-before @@ -13272,6 +13290,7 @@ spec: required: - conditions - replicaStatuses + - restartCount type: object type: object served: true diff --git a/arena-artifacts/charts/et-operator/Chart.yaml b/arena-artifacts/charts/et-operator/Chart.yaml index 07c8e2c03..cd7afa614 100644 --- a/arena-artifacts/charts/et-operator/Chart.yaml +++ b/arena-artifacts/charts/et-operator/Chart.yaml @@ -15,10 +15,10 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.1.0 +version: 0.1.1 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "v0.1.0" +appVersion: "v0.1.1" diff --git a/arena-artifacts/values.yaml b/arena-artifacts/values.yaml index fb97785b8..ebea3e6a1 100644 --- a/arena-artifacts/values.yaml +++ b/arena-artifacts/values.yaml @@ -8,7 +8,7 @@ global: pullImageByVPCNetwork: false # the prefix of all image imagePrefix: registry.cn-zhangjiakou.aliyuncs.com - # the cluster type + # the cluster type clusterProfile: "Default" # specfiy the nodeSelector for all operator pods nodeSelector: {} @@ -31,7 +31,7 @@ tf: image: acs/tf_operator tag: v1.0-aliyun-986e8d5 imagePullPolicy: IfNotPresent - resources: + resources: limits: cpu: 200m memory: 2Gi @@ -78,7 +78,7 @@ pytorch: image: acs/pytorch-operator tag: v1.0-aliyun-6ebeb29 imagePullPolicy: IfNotPresent - resources: + resources: limits: cpu: 200m memory: 2Gi @@ -91,9 +91,9 @@ pytorch: et: enabled: true image: acs/et-operator - tag: v0.1.0-aliyun-1499985 + tag: v0.1.1-aliyun-0d6b825 imagePullPolicy: IfNotPresent - resources: + resources: limits: cpu: 200m memory: 2Gi diff --git a/charts/etjob/templates/etjob.yaml b/charts/etjob/templates/etjob.yaml index 51f65044b..c870b6565 100644 --- a/charts/etjob/templates/etjob.yaml +++ b/charts/etjob/templates/etjob.yaml @@ -18,8 +18,14 @@ metadata: createdBy: "ETJob" {{- range $key, $value := .Values.labels }} {{ $key }}: {{ $value | quote }} - {{- end }} + {{- end }} spec: + {{- if .Values.jobBackoffLimit }} + backoffLimit: {{ .Values.jobBackoffLimit }} + {{- end }} + {{- if .Values.jobRestartPolicy }} + restartPolicy: {{ .Values.jobRestartPolicy }} + {{- end }} {{- if .Values.cleanPodPolicy }} cleanPodPolicy: {{ .Values.cleanPodPolicy }} {{- end }} @@ -50,9 +56,9 @@ spec: {{- end }} {{- end }} spec: - {{- if ne (len .Values.nodeSelectors) 0 }} + {{- if ne (len .Values.launcherSelectors) 0 }} nodeSelector: - {{- range $nodeKey,$nodeVal := .Values.nodeSelectors }} + {{- range $nodeKey,$nodeVal := .Values.launcherSelectors }} {{ $nodeKey }}: "{{ $nodeVal }}" {{- end }} {{- end }} @@ -300,6 +306,9 @@ spec: {{- if .Values.workers }} worker: + {{- if .Values.workerRestartPolicy }} + restartPolicy: {{ .Values.workerRestartPolicy }} + {{- end }} replicas: {{ .Values.workers }} {{- if .Values.maxWorkers }} maxReplicas: {{ .Values.maxWorkers }} @@ -316,6 +325,9 @@ spec: release: {{ .Release.Name }} heritage: {{ .Release.Service }} createdBy: "ETJob" + {{- range $key, $value := .Values.labels }} + {{ $key }}: {{ $value | quote }} + {{- end }} {{- if .Values.annotations }} annotations: {{- range $key, $value := .Values.annotations }} @@ -326,7 +338,7 @@ spec: {{- if ne (len .Values.nodeSelectors) 0 }} nodeSelector: {{- range $nodeKey,$nodeVal := .Values.nodeSelectors }} - {{ $nodeKey }}: "{{ $nodeVal }}" + {{ $nodeKey }}: "{{ $nodeVal }}" {{- end }} {{- end }} {{- if ne (len .Values.tolerations) 0 }} diff --git a/pkg/apis/types/submit.go b/pkg/apis/types/submit.go index 6dbb35e10..1986d1904 100644 --- a/pkg/apis/types/submit.go +++ b/pkg/apis/types/submit.go @@ -109,6 +109,11 @@ type CommonSubmitArgs struct { // HelmOptions stores the helm options HelmOptions []string `yaml:"-"` + // EnableSpotInstance enables the feature of SuperVisor manage spot instance training. + EnableSpotInstance bool `yaml:"enableSpotInstance"` + + // MaxWaitTime stores the maximum length of time a job waits for resources + MaxWaitTime int `yaml:"maxWaitTime"` // SchedulerName stores the scheduler name,match option --scheduler SchedulerName string `yaml:"schedulerName"` diff --git a/pkg/apis/types/submit_etjob.go b/pkg/apis/types/submit_etjob.go index 4cfaf6706..d578fcfec 100644 --- a/pkg/apis/types/submit_etjob.go +++ b/pkg/apis/types/submit_etjob.go @@ -8,9 +8,14 @@ type SubmitETJobArgs struct { // SubmitTensorboardArgs stores tensorboard information SubmitTensorboardArgs `yaml:",inline"` // SubmitSyncCodeArgs stores syncing code information - SubmitSyncCodeArgs `yaml:",inline"` - MaxWorkers int `yaml:"maxWorkers"` - MinWorkers int `yaml:"minWorkers"` + SubmitSyncCodeArgs `yaml:",inline"` + MaxWorkers int `yaml:"maxWorkers"` + MinWorkers int `yaml:"minWorkers"` + LauncherSelectors map[string]string `yaml:"launcherSelectors"` // --launcher-selector + JobRestartPolicy string `yaml:"jobRestartPolicy"` // --job-restart-policy + WorkerRestartPolicy string `yaml:"workerRestartPolicy"` // --worker-restart-policy + JobBackoffLimit int `yaml:"jobBackoffLimit"` // --job-backoff-limit + } type ScaleETJobArgs struct { diff --git a/pkg/argsbuilder/const.go b/pkg/argsbuilder/const.go index 7c7e5a3c7..b52ee8b61 100644 --- a/pkg/argsbuilder/const.go +++ b/pkg/argsbuilder/const.go @@ -9,4 +9,8 @@ const ( aliyunENIAnnotation = "k8s.aliyun.com/eni" jobSuspend = "scheduling.x-k8s.io/suspend" + + spotInstanceAnnotation = "job-supervisor.kube-ai.io/spot-instance" + + maxWaitTimeAnnotation = "job-supervisor.kube-ai.io/max-wait-time" ) diff --git a/pkg/argsbuilder/submit_etjob.go b/pkg/argsbuilder/submit_etjob.go index bdaca2c14..85f77c310 100644 --- a/pkg/argsbuilder/submit_etjob.go +++ b/pkg/argsbuilder/submit_etjob.go @@ -15,10 +15,14 @@ package argsbuilder import ( "fmt" - "k8s.io/apimachinery/pkg/api/resource" "reflect" + "strconv" "strings" + log "github.com/sirupsen/logrus" + + "k8s.io/apimachinery/pkg/api/resource" + "github.com/kubeflow/arena/pkg/apis/types" "github.com/spf13/cobra" ) @@ -68,10 +72,19 @@ func (s *SubmitETJobArgsBuilder) AddCommandFlags(command *cobra.Command) { for name := range s.subBuilders { s.subBuilders[name].AddCommandFlags(command) } + var launcherSelectors []string command.Flags().StringVar(&s.args.Cpu, "cpu", "", "the cpu resource to use for the training, like 1 for 1 core.") command.Flags().StringVar(&s.args.Memory, "memory", "", "the memory resource to use for the training, like 1Gi.") command.Flags().IntVar(&s.args.MaxWorkers, "max-workers", 1000, "the max worker number to run the distributed training.") command.Flags().IntVar(&s.args.MinWorkers, "min-workers", 1, "the min worker number to run the distributed training.") + command.Flags().BoolVar(&s.args.EnableSpotInstance, "spot-instance", false, "EnableSpotInstance enables the feature of SuperVisor manager spot instance training") + command.Flags().IntVar(&s.args.MaxWaitTime, "max-wait-time", 0, "MaxWaitTime stores the maximum length of time a job waits for resources") + command.Flags().StringArrayVarP(&launcherSelectors, "launcher-selector", "", []string{}, `assigning launcher pod to some k8s particular nodes, usage: "--launcher-selector=key=value" or "--launcher-selector key=value" `) + command.Flags().StringVar(&s.args.JobRestartPolicy, "job-restart-policy", "", "training job restart policy, support: Never and OnFailure") + command.Flags().StringVar(&s.args.WorkerRestartPolicy, "worker-restart-policy", "", "training job worker restart policy, support: Never/OnFailure/Always/ExitCode") + command.Flags().IntVar(&s.args.JobBackoffLimit, "job-backoff-limit", 6, "the max restart count of trainingjob, default is six") + + s.argValues["launcher-selector"] = &launcherSelectors } func (s *SubmitETJobArgsBuilder) PreBuild() error { @@ -96,6 +109,15 @@ func (s *SubmitETJobArgsBuilder) Build() error { if err := s.addWorkerToEnv(); err != nil { return nil } + if err := s.setSpotInstance(); err != nil { + return nil + } + if err := s.setMaxWaitTime(); err != nil { + return nil + } + if err := s.setLauncherSelectors(); err != nil { + return nil + } return nil } @@ -126,3 +148,40 @@ func (s *SubmitETJobArgsBuilder) addWorkerToEnv() error { s.args.Envs["minWorkers"] = fmt.Sprintf("%v", s.args.MinWorkers) return nil } + +// setSpotInstance is used to add annotation for spot instance training +func (s *SubmitETJobArgsBuilder) setSpotInstance() error { + if s.args.EnableSpotInstance { + if s.args.Annotations == nil { + s.args.Annotations = map[string]string{} + } + s.args.Annotations[spotInstanceAnnotation] = "true" + } + return nil +} + +func (s *SubmitETJobArgsBuilder) setMaxWaitTime() error { + if s.args.MaxWaitTime > 0 { + if s.args.Annotations == nil { + s.args.Annotations = map[string]string{} + } + s.args.Annotations[maxWaitTimeAnnotation] = strconv.Itoa(s.args.MaxWaitTime) + } + return nil +} + +func (m *SubmitETJobArgsBuilder) setLauncherSelectors() error { + log.Debug("begin setLauncherSelector") + m.args.LauncherSelectors = map[string]string{} + argKey := "launcher-selector" + var LauncherSelectors *[]string + value, ok := m.argValues[argKey] + if !ok { + log.Warnf("Fail to get key: %s", argKey) + return nil + } + LauncherSelectors = value.(*[]string) + m.args.LauncherSelectors = transformSliceToMap(*LauncherSelectors, "=") + log.Debugf("success to transform launcher selector: %v", m.args.LauncherSelectors) + return nil +} diff --git a/pkg/model/model.go b/pkg/model/model.go index 92d9e41d7..c86840c60 100644 --- a/pkg/model/model.go +++ b/pkg/model/model.go @@ -263,7 +263,7 @@ func (m *modelJob) Params() map[string]string { for _, value := range arr { if strings.HasPrefix(value, "--") { kv := strings.Split(value, "=") - params[kv[0]] = kv[1] + params[fmt.Sprintf("--%s", kv[0])] = value[len(kv[0])+1:] } } return params diff --git a/pkg/serving/update.go b/pkg/serving/update.go index 666045f22..451e1751e 100644 --- a/pkg/serving/update.go +++ b/pkg/serving/update.go @@ -32,17 +32,18 @@ func UpdateTensorflowServing(args *types.UpdateTensorFlowServingArgs) error { if strings.HasSuffix(servingArgs, "\n") { servingArgs = servingArgs[:len(servingArgs)-2] } - arr := strings.Split(servingArgs, " ") - + arr := strings.Split(servingArgs, "--") params := make(map[string]string) - for i := 1; i < len(arr); i++ { - pair := strings.Split(arr[i], "=") - if len(pair) == 0 { + for index, argItem := range arr { + if index == 0 { + continue + } + pair := strings.Split(argItem, "=") + if len(pair) <= 1 { continue } - params[pair[0]] = pair[1] + params[fmt.Sprintf("--%s", pair[0])] = argItem[len(pair[0])+1:] } - if args.ModelName != "" { params["--model_name"] = args.ModelName } @@ -93,13 +94,26 @@ func UpdateTensorflowServing(args *types.UpdateTensorFlowServingArgs) error { if deploy.Spec.Template.Spec.Tolerations == nil { deploy.Spec.Template.Spec.Tolerations = []v1.Toleration{} } + mapSet := make(map[string]interface{}) + for _, toleration := range deploy.Spec.Template.Spec.Tolerations { + mapSet[fmt.Sprintf("%s=%s:%s,%s", toleration.Key, + toleration.Value, + toleration.Effect, + toleration.Operator)] = nil + } for _, toleration := range args.Tolerations { - deploy.Spec.Template.Spec.Tolerations = append(deploy.Spec.Template.Spec.Tolerations, v1.Toleration{ - Key: toleration.Key, - Value: toleration.Value, - Effect: v1.TaintEffect(toleration.Effect), - Operator: v1.TolerationOperator(toleration.Operator), - }) + if _, ok := mapSet[fmt.Sprintf("%s=%s:%s,%s", toleration.Key, + toleration.Value, + toleration.Effect, + toleration.Operator)]; !ok { + deploy.Spec.Template.Spec.Tolerations = append(deploy.Spec.Template.Spec.Tolerations, v1.Toleration{ + Key: toleration.Key, + Value: toleration.Value, + Effect: v1.TaintEffect(toleration.Effect), + Operator: v1.TolerationOperator(toleration.Operator), + }) + } + } } @@ -119,12 +133,18 @@ func UpdateTritonServing(args *types.UpdateTritonServingArgs) error { if strings.HasSuffix(servingArgs, "\n") { servingArgs = servingArgs[:len(servingArgs)-2] } - arr := strings.Split(servingArgs, " ") + arr := strings.Split(servingArgs, "--") params := make(map[string]string) - for i := 1; i < len(arr); i++ { - pair := strings.Split(arr[i], "=") - params[pair[0]] = pair[1] + for index, argItem := range arr { + if index == 0 { + continue + } + pair := strings.Split(argItem, "=") + if len(pair) <= 1 { + continue + } + params[fmt.Sprintf("--%s", pair[0])] = argItem[len(pair[0])+1:] } if args.ModelRepository != "" { diff --git a/pkg/training/const.go b/pkg/training/const.go index 899a6bd68..1fc60a55e 100644 --- a/pkg/training/const.go +++ b/pkg/training/const.go @@ -40,6 +40,8 @@ const ( aliyunENIAnnotation = "k8s.aliyun.com/eni" requestGPUsOfJobAnnoKey = "requestGPUsOfJobOwner" + + spotInstanceJobStatusAnnotation = "job-supervisor.kube-ai.io/job-status" ) var ( diff --git a/pkg/training/trainer_et.go b/pkg/training/trainer_et.go index 17b5cb4af..40b9591b3 100644 --- a/pkg/training/trainer_et.go +++ b/pkg/training/trainer_et.go @@ -97,6 +97,8 @@ func (ej *ETJob) GetStatus() (status string) { status = "PENDING" } else if ej.isScaling() { status = "SCALING" + } else if ej.isMaxWaitTimeExceeded() { + status = "MAX_WAIT_TIME_EXCEEDED" } else { status = "RUNNING" } @@ -214,7 +216,7 @@ func (ej *ETJob) isSucceeded() bool { } func (ej *ETJob) isFailed() bool { - return ej.trainingjob.Status.Phase == "Failed" + return ej.trainingjob.Status.Phase == "Failed" && !ej.hasMaxWaitTimeExceededAnnotation() } func (ej *ETJob) isScaling() bool { @@ -231,6 +233,19 @@ func (ej *ETJob) isPending() bool { return false } +func (ej *ETJob) hasMaxWaitTimeExceededAnnotation() bool { + if val, ok := ej.trainingjob.Annotations[spotInstanceJobStatusAnnotation]; ok { + if val == "timeout" { + return true + } + } + return false +} + +func (ej *ETJob) isMaxWaitTimeExceeded() bool { + return ej.trainingjob.Status.Phase == "Failed" && ej.hasMaxWaitTimeExceededAnnotation() +} + // ET Job trainer type ETJobTrainer struct { client *kubernetes.Clientset diff --git a/sdk/arena-java-sdk/src/main/java/com/github/kubeflow/arena/model/training/ETJobBuilder.java b/sdk/arena-java-sdk/src/main/java/com/github/kubeflow/arena/model/training/ETJobBuilder.java index dbc020081..4f2b79f40 100644 --- a/sdk/arena-java-sdk/src/main/java/com/github/kubeflow/arena/model/training/ETJobBuilder.java +++ b/sdk/arena-java-sdk/src/main/java/com/github/kubeflow/arena/model/training/ETJobBuilder.java @@ -1,6 +1,7 @@ package com.github.kubeflow.arena.model.training; import com.github.kubeflow.arena.enums.TrainingJobType; +import com.github.kubeflow.arena.model.fields.BoolField; import com.github.kubeflow.arena.model.fields.StringField; import java.util.ArrayList; @@ -31,6 +32,16 @@ public ETJobBuilder memory(String m) { return this; } + public ETJobBuilder enableSpotInstance() { + this.options.add(new BoolField("--spot-instance")); + return this; + } + + public ETJobBuilder maxWaitTime(int time) { + this.options.add(new StringField("--max-wait-time", String.valueOf(time))); + return this; + } + /** * following functions invoke JobBuilder functions diff --git a/sdk/arena-python-sdk/arenasdk/training/et_job_builder.py b/sdk/arena-python-sdk/arenasdk/training/et_job_builder.py index 9f3bc675d..1262217f5 100644 --- a/sdk/arena-python-sdk/arenasdk/training/et_job_builder.py +++ b/sdk/arena-python-sdk/arenasdk/training/et_job_builder.py @@ -7,30 +7,38 @@ class ETJobBuilder(JobBuilder): def __init__(self): super().__init__(TrainingJobType.ETTrainingJob) - + def with_min_workers(self,workers: int) -> ETJobBuilder: self._options.append(StringField("--min-workers",workers)) - return self + return self def with_max_workers(self,workers: int) -> ETJobBuilder: self._options.append(StringField("--max-workers",workers)) - return self + return self def with_cpu(self,cpu: str) -> ETJobBuilder: self._options.append(StringField("--cpu",cpu)) - return self + return self def with_memory(self,memory: str) -> ETJobBuilder: self._options.append(StringField("--memory",memory)) return self - + + def enable_spot_instance(self) -> ETJobBuilder: + self._options.append(BoolField("--spot-instance")) + return self + + def with_max_wait_time(self,time: int) -> ETJobBuilder: + self._options.append(StringField("--max-wait-time",time)) + return self + def with_name(self,name: str) -> ETJobBuilder: super().with_name(name) - return self + return self def with_image(self,image: str) -> ETJobBuilder: super().with_image(image) - return self + return self def with_workers(self,count: int) -> ETJobBuilder: super().with_workers(count) @@ -39,7 +47,7 @@ def with_workers(self,count: int) -> ETJobBuilder: def with_image_pull_secrets(self,secrets: List[str]) -> ETJobBuilder: super().with_image_pull_secrets(secrets) return self - + def with_gpus(self,count: int) -> ETJobBuilder: super().with_gpus(count) return self @@ -55,10 +63,10 @@ def with_node_selectors(self,selectors: Dict[str, str]) -> ETJobBuilder: def with_tolerations(self,tolerations: List[str]) -> ETJobBuilder: super().with_tolerations(tolerations) return self - + def with_config_files(self,files: Dict[str, str]) -> ETJobBuilder: super().with_config_files(files) - return self + return self def with_annotations(self,annotions: Dict[str, str]) -> ETJobBuilder: super().with_annotations(annotions) @@ -66,8 +74,8 @@ def with_annotations(self,annotions: Dict[str, str]) -> ETJobBuilder: def with_datas(self,datas: Dict[str,str]) -> ETJobBuilder: super().with_datas(datas) - return self - + return self + def with_data_dirs(self,data_dirs: Dict[str, str]) -> ETJobBuilder: super().with_data_dirs(data_dirs) return self @@ -79,26 +87,26 @@ def with_log_dir(self,dir: str) -> ETJobBuilder: def with_priority(self,priority: str) -> ETJobBuilder: super().with_priority(priority) return self - + def enable_rdma(self) -> ETJobBuilder: super().enable_rdma() return self - + def with_sync_image(self,image: str) -> ETJobBuilder: super().with_sync_image(image) return self def with_sync_mode(self,mode: str) -> ETJobBuilder: super().with_sync_mode(mode) - return self + return self def with_sync_source(self,source: str) -> ETJobBuilder: super().with_sync_source(source) return self - + def enable_tensorboard(self) -> ETJobBuilder: super().enable_tensorboard() - return self + return self def with_tensorboard_image(self,image: str) -> ETJobBuilder: super().with_tensorboard_image(image) @@ -110,7 +118,7 @@ def with_working_dir(self,dir: str) -> ETJobBuilder: def with_retry_count(self,count: int) -> ETJobBuilder: super().with_retry_count(count) - return self + return self def enable_coscheduling(self) -> ETJobBuilder: super().enable_coscheduling() From 02cc5ef067d9228897f12aeeaaa4cdb5ca1cd236 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=80=9A=E5=BB=B7?= Date: Wed, 22 Feb 2023 20:27:34 +0800 Subject: [PATCH 2/2] add jobsupervisor --- arena-artifacts/Chart.yaml | 5 + .../charts/job-supervisor/.helmignore | 23 ++++ .../charts/job-supervisor/Chart.yaml | 24 ++++ .../job-supervisor/templates/deployment.yaml | 49 ++++++++ .../charts/job-supervisor/templates/rbac.yaml | 112 ++++++++++++++++++ .../charts/job-supervisor/values.yaml | 3 + arena-artifacts/values.yaml | 8 ++ 7 files changed, 224 insertions(+) create mode 100644 arena-artifacts/charts/job-supervisor/.helmignore create mode 100644 arena-artifacts/charts/job-supervisor/Chart.yaml create mode 100644 arena-artifacts/charts/job-supervisor/templates/deployment.yaml create mode 100644 arena-artifacts/charts/job-supervisor/templates/rbac.yaml create mode 100644 arena-artifacts/charts/job-supervisor/values.yaml diff --git a/arena-artifacts/Chart.yaml b/arena-artifacts/Chart.yaml index 95fc2da15..f2a0ebcca 100644 --- a/arena-artifacts/Chart.yaml +++ b/arena-artifacts/Chart.yaml @@ -59,3 +59,8 @@ dependencies: version: 0.1.0 repository: "@gpu-exporter" condition: exporter.enabled,global.exporter.enabled + - name: job-supervisor + alias: job-supervisor + version: 0.1.0 + repository: "@job-supervisor" + condition: job-supervisor.enabled,global.job-supervisor.enabled \ No newline at end of file diff --git a/arena-artifacts/charts/job-supervisor/.helmignore b/arena-artifacts/charts/job-supervisor/.helmignore new file mode 100644 index 000000000..0e8a0eb36 --- /dev/null +++ b/arena-artifacts/charts/job-supervisor/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/arena-artifacts/charts/job-supervisor/Chart.yaml b/arena-artifacts/charts/job-supervisor/Chart.yaml new file mode 100644 index 000000000..8c06625b5 --- /dev/null +++ b/arena-artifacts/charts/job-supervisor/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: job-supervisor +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "v1.0.0-aliyun" \ No newline at end of file diff --git a/arena-artifacts/charts/job-supervisor/templates/deployment.yaml b/arena-artifacts/charts/job-supervisor/templates/deployment.yaml new file mode 100644 index 000000000..f226e5920 --- /dev/null +++ b/arena-artifacts/charts/job-supervisor/templates/deployment.yaml @@ -0,0 +1,49 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: job-supervisor + {{- include "arena.labels" . | nindent 4 }} + name: job-supervisor + namespace: {{ .Release.Namespace }} +spec: + replicas: 1 + selector: + matchLabels: + app: job-supervisor + {{- include "arena.labels" . | nindent 6 }} + strategy: + rollingUpdate: + maxSurge: 25% + maxUnavailable: 25% + type: RollingUpdate + template: + metadata: + labels: + {{- include "arena.labels" . | nindent 8 }} + app: job-supervisor + spec: + nodeSelector: + {{- include "arena.nodeSelector" . | nindent 8 }} + {{- include "arena.nonEdgeNodeSelector" . | nindent 8 }} + tolerations: + {{- include "arena.tolerateNonEdgeNodeSelector" . | nindent 6 }} + containers: + - command: + - /job-supervisor + image: {{ include "arena.imagePrefix" . }}/{{ .Values.image }}:{{ .Values.tag }} + imagePullPolicy: {{ .Values.imagePullPolicy }} + name: job-supervisor + resources: + limits: + cpu: 300m + memory: 500Mi + requests: + cpu: 100m + memory: 300Mi + dnsPolicy: ClusterFirst + restartPolicy: Always + schedulerName: default-scheduler + serviceAccount: job-supervisor + serviceAccountName: job-supervisor + terminationGracePeriodSeconds: 30 \ No newline at end of file diff --git a/arena-artifacts/charts/job-supervisor/templates/rbac.yaml b/arena-artifacts/charts/job-supervisor/templates/rbac.yaml new file mode 100644 index 000000000..4435f96b2 --- /dev/null +++ b/arena-artifacts/charts/job-supervisor/templates/rbac.yaml @@ -0,0 +1,112 @@ + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: job-supervisor + namespace: {{ .Release.Namespace }} + labels: + {{- include "arena.labels" . | nindent 4 }} + +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: job-supervisor + labels: + {{- include "arena.labels" . | nindent 4 }} +rules: +- apiGroups: + - "" + resources: + - configmaps + - endpoints + - events + - namespaces + - serviceaccounts + - secrets + - persistentvolumeclaims + - pods + - pods/log + - pods/exec + - services + - nodes + verbs: + - '*' +- apiGroups: + - "" + - apps + - extensions + resources: + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - '*' +- apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + - rolebindings + - clusterroles + - clusterrolebindings + verbs: + - '*' +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' +- apiGroups: + - apps.kubedl.io + resources: + - '*' + verbs: + - '*' +- apiGroups: + - kubeflow.org + resources: + - '*' + verbs: + - '*' +- apiGroups: + - kai.alibabacloud.com + resources: + - '*' + verbs: + - '*' +- apiGroups: + - batch + resources: + - jobs + verbs: + - '*' +- apiGroups: + - storage.k8s.io + resources: + - storageclasses + verbs: + - '*' +- apiGroups: + - tensorflow.org + resources: + - '*' + verbs: + - '*' +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: job-supervisor + namespace: {{ .Release.Namespace }} + labels: + {{- include "arena.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: job-supervisor +subjects: +- kind: ServiceAccount + name: job-supervisor + namespace: {{ .Release.Namespace }} \ No newline at end of file diff --git a/arena-artifacts/charts/job-supervisor/values.yaml b/arena-artifacts/charts/job-supervisor/values.yaml new file mode 100644 index 000000000..407905dda --- /dev/null +++ b/arena-artifacts/charts/job-supervisor/values.yaml @@ -0,0 +1,3 @@ +# Default values for job-supervisor +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. \ No newline at end of file diff --git a/arena-artifacts/values.yaml b/arena-artifacts/values.yaml index ebea3e6a1..49b2fe824 100644 --- a/arena-artifacts/values.yaml +++ b/arena-artifacts/values.yaml @@ -124,3 +124,11 @@ exporter: tag: v1.0.1-b2c2f9b imagePullPolicy: IfNotPresent resources: {} + +# job-supervisor +job-supervisor: + enabled: true + image: acs/job-supervisor + tag: v0.1.1-3e98633-aliyun + imagePullPolicy: IfNotPresent + nodeSelector: {} \ No newline at end of file