From e0e59ba808c99e9b56bd6b39b3937da22345bf54 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=80=9A=E5=BB=B7?= <tingshuai.yts@hotmail.com>
Date: Wed, 22 Feb 2023 16:38:12 +0800
Subject: [PATCH 1/2] optimize et-operator for spot instance

---
 VERSION                                       |  2 +-
 arena-artifacts/Chart.yaml                    |  4 +-
 .../kai.alibabacloud.com_scaleins.yaml        | 10 +--
 .../kai.alibabacloud.com_scaleouts.yaml       | 10 +--
 .../kai.alibabacloud.com_trainingjobs.yaml    | 19 ++++++
 arena-artifacts/charts/et-operator/Chart.yaml |  4 +-
 arena-artifacts/values.yaml                   | 10 +--
 charts/etjob/templates/etjob.yaml             | 20 ++++--
 pkg/apis/types/submit.go                      |  5 ++
 pkg/apis/types/submit_etjob.go                | 11 +++-
 pkg/argsbuilder/const.go                      |  4 ++
 pkg/argsbuilder/submit_etjob.go               | 61 ++++++++++++++++++-
 pkg/model/model.go                            |  2 +-
 pkg/serving/update.go                         | 54 ++++++++++------
 pkg/training/const.go                         |  2 +
 pkg/training/trainer_et.go                    | 17 +++++-
 .../arena/model/training/ETJobBuilder.java    | 11 ++++
 .../arenasdk/training/et_job_builder.py       | 44 +++++++------
 18 files changed, 227 insertions(+), 63 deletions(-)

diff --git a/VERSION b/VERSION
index 2003b639c..965065db5 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.9.2
+0.9.3
diff --git a/arena-artifacts/Chart.yaml b/arena-artifacts/Chart.yaml
index 42f027f37..95fc2da15 100644
--- a/arena-artifacts/Chart.yaml
+++ b/arena-artifacts/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
 name: arena-artifacts
-description: A Helm chart for installing arena dependencies 
+description: A Helm chart for installing arena dependencies
 
 # A chart can be either an 'application' or a 'library' chart.
 #
@@ -41,7 +41,7 @@ dependencies:
     condition: cron.enabled,global.cron.enabled
   - name: et-operator
     alias: et
-    version: 0.1.0
+    version: 0.1.1
     repository: "@et-operator"
     condition: et.enabled,global.et.enabled
   - name: mpi-operator
diff --git a/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_scaleins.yaml b/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_scaleins.yaml
index 7a86d254c..9ddd58274 100644
--- a/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_scaleins.yaml
+++ b/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_scaleins.yaml
@@ -155,11 +155,12 @@ spec:
                 description: ReplicaStatuses is map of ReplicaType and ReplicaStatus,
                   specifies the status of each replica.
                 type: object
+              restartCount:
+                description: The number of times the Job has been restarted
+                format: int32
+                type: integer
               startTime:
-                description: Represents time when the job was acknowledged by the
-                  job controller. It is not guaranteed to be set in happens-before
-                  order across separate operations. It is represented in RFC3339 form
-                  and is in UTC.
+                description: Represents time when the job was acknowledged by the job controller. It is not guaranteed to be set in happens-before order across separate operations. It is represented in RFC3339 form and is in UTC.
                 format: date-time
                 type: string
               toDeletePods:
@@ -170,6 +171,7 @@ spec:
             required:
             - conditions
             - replicaStatuses
+            - restartCount
             type: object
         type: object
     served: true
diff --git a/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_scaleouts.yaml b/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_scaleouts.yaml
index 96c54c9e3..c77afe4e1 100644
--- a/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_scaleouts.yaml
+++ b/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_scaleouts.yaml
@@ -156,16 +156,18 @@ spec:
                 description: ReplicaStatuses is map of ReplicaType and ReplicaStatus,
                   specifies the status of each replica.
                 type: object
+              restartCount:
+                description: The number of times the Job has been restarted
+                format: int32
+                type: integer
               startTime:
-                description: Represents time when the job was acknowledged by the
-                  job controller. It is not guaranteed to be set in happens-before
-                  order across separate operations. It is represented in RFC3339 form
-                  and is in UTC.
+                description: Represents time when the job was acknowledged by the job controller. It is not guaranteed to be set in happens-before order across separate operations. It is represented in RFC3339 form and is in UTC.
                 format: date-time
                 type: string
             required:
             - conditions
             - replicaStatuses
+            - restartCount
             type: object
         type: object
     served: true
diff --git a/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_trainingjobs.yaml b/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_trainingjobs.yaml
index 1d17a4957..5f23ef29d 100644
--- a/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_trainingjobs.yaml
+++ b/arena-artifacts/all_crds/v1/et-operator/kai.alibabacloud.com_trainingjobs.yaml
@@ -45,6 +45,12 @@ spec:
           spec:
             description: TrainingJobSpec defines the desired state of TrainingJob
             properties:
+              backoffLimit:
+                default: 6
+                description: Optional number of retries to execute script.
+                format: int32
+                minimum: 0
+                type: integer
               cleanPodPolicy:
                 description: CleanPodPolicy defines the policy that whether to kill
                   pods after the job completes. Defaults to None.
@@ -13170,6 +13176,14 @@ spec:
                 description: Specifies the mode when launcher attach to workers. available
                   option is ssh / kubexec Defaults is kubexec.
                 type: string
+              restartPolicy:
+                default: Never
+                description: Restart policy for training job One of OnFailure, Never.
+                  Default to Never.
+                enum:
+                - Never
+                - OnFailure
+                type: string
               slotsPerWorker:
                 description: Specifies the number of slots per worker used in hostfile.
                   Defaults to 1.
@@ -13258,6 +13272,10 @@ spec:
                 description: ReplicaStatuses is map of ReplicaType and ReplicaStatus,
                   specifies the status of each replica.
                 type: object
+              restartCount:
+                description: The number of times the Job has been restarted
+                format: int32
+                type: integer
               startTime:
                 description: Represents time when the job was acknowledged by the
                   job controller. It is not guaranteed to be set in happens-before
@@ -13272,6 +13290,7 @@ spec:
             required:
             - conditions
             - replicaStatuses
+            - restartCount
             type: object
         type: object
     served: true
diff --git a/arena-artifacts/charts/et-operator/Chart.yaml b/arena-artifacts/charts/et-operator/Chart.yaml
index 07c8e2c03..cd7afa614 100644
--- a/arena-artifacts/charts/et-operator/Chart.yaml
+++ b/arena-artifacts/charts/et-operator/Chart.yaml
@@ -15,10 +15,10 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.1.0
+version: 0.1.1
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
 # follow Semantic Versioning. They should reflect the version the application is using.
 # It is recommended to use it with quotes.
-appVersion: "v0.1.0"
+appVersion: "v0.1.1"
diff --git a/arena-artifacts/values.yaml b/arena-artifacts/values.yaml
index fb97785b8..ebea3e6a1 100644
--- a/arena-artifacts/values.yaml
+++ b/arena-artifacts/values.yaml
@@ -8,7 +8,7 @@ global:
   pullImageByVPCNetwork: false
   # the prefix of all image
   imagePrefix: registry.cn-zhangjiakou.aliyuncs.com
-  # the cluster type 
+  # the cluster type
   clusterProfile: "Default"
   # specfiy the nodeSelector for all operator pods
   nodeSelector: {}
@@ -31,7 +31,7 @@ tf:
   image: acs/tf_operator
   tag: v1.0-aliyun-986e8d5
   imagePullPolicy: IfNotPresent
-  resources: 
+  resources:
     limits:
       cpu: 200m
       memory: 2Gi
@@ -78,7 +78,7 @@ pytorch:
   image: acs/pytorch-operator
   tag: v1.0-aliyun-6ebeb29
   imagePullPolicy: IfNotPresent
-  resources: 
+  resources:
     limits:
       cpu: 200m
       memory: 2Gi
@@ -91,9 +91,9 @@ pytorch:
 et:
   enabled: true
   image: acs/et-operator
-  tag: v0.1.0-aliyun-1499985
+  tag: v0.1.1-aliyun-0d6b825
   imagePullPolicy: IfNotPresent
-  resources: 
+  resources:
     limits:
       cpu: 200m
       memory: 2Gi
diff --git a/charts/etjob/templates/etjob.yaml b/charts/etjob/templates/etjob.yaml
index 51f65044b..c870b6565 100644
--- a/charts/etjob/templates/etjob.yaml
+++ b/charts/etjob/templates/etjob.yaml
@@ -18,8 +18,14 @@ metadata:
     createdBy: "ETJob"
   {{- range $key, $value := .Values.labels }}
     {{ $key }}: {{ $value | quote }}
-  {{- end }} 
+  {{- end }}
 spec:
+  {{- if .Values.jobBackoffLimit }}
+  backoffLimit: {{ .Values.jobBackoffLimit }}
+  {{- end }}
+  {{- if .Values.jobRestartPolicy }}
+  restartPolicy: {{ .Values.jobRestartPolicy }}
+  {{- end }}
   {{- if .Values.cleanPodPolicy }}
   cleanPodPolicy: {{ .Values.cleanPodPolicy }}
   {{- end }}
@@ -50,9 +56,9 @@ spec:
           {{- end }}
           {{- end }}
         spec:
-          {{- if ne (len .Values.nodeSelectors) 0 }}
+          {{- if ne (len .Values.launcherSelectors) 0 }}
           nodeSelector:
-          {{- range $nodeKey,$nodeVal := .Values.nodeSelectors }}
+          {{- range $nodeKey,$nodeVal := .Values.launcherSelectors }}
             {{ $nodeKey }}: "{{ $nodeVal }}"
           {{- end }}
           {{- end }}
@@ -300,6 +306,9 @@ spec:
 
     {{- if .Values.workers }}
     worker:
+      {{- if .Values.workerRestartPolicy }}
+      restartPolicy: {{ .Values.workerRestartPolicy }}
+      {{- end }}
       replicas: {{ .Values.workers }}
       {{- if .Values.maxWorkers }}
       maxReplicas: {{ .Values.maxWorkers }}
@@ -316,6 +325,9 @@ spec:
             release: {{ .Release.Name }}
             heritage: {{ .Release.Service }}
             createdBy: "ETJob"
+          {{- range $key, $value := .Values.labels }}
+            {{ $key }}: {{ $value | quote }}
+          {{- end }}
           {{- if .Values.annotations }}
           annotations:
           {{- range $key, $value := .Values.annotations }}
@@ -326,7 +338,7 @@ spec:
           {{- if ne (len .Values.nodeSelectors) 0 }}
           nodeSelector:
           {{- range $nodeKey,$nodeVal := .Values.nodeSelectors }}
-          {{ $nodeKey }}: "{{ $nodeVal }}"
+            {{ $nodeKey }}: "{{ $nodeVal }}"
           {{- end }}
           {{- end }}
           {{- if ne (len .Values.tolerations) 0 }}
diff --git a/pkg/apis/types/submit.go b/pkg/apis/types/submit.go
index 6dbb35e10..1986d1904 100644
--- a/pkg/apis/types/submit.go
+++ b/pkg/apis/types/submit.go
@@ -109,6 +109,11 @@ type CommonSubmitArgs struct {
 	// HelmOptions stores the helm options
 	HelmOptions []string `yaml:"-"`
 
+	// EnableSpotInstance enables the feature of SuperVisor manage spot instance training.
+	EnableSpotInstance bool `yaml:"enableSpotInstance"`
+
+	// MaxWaitTime stores the maximum length of time a job waits for resources
+	MaxWaitTime int `yaml:"maxWaitTime"`
 	// SchedulerName stores the scheduler name,match option --scheduler
 	SchedulerName string `yaml:"schedulerName"`
 
diff --git a/pkg/apis/types/submit_etjob.go b/pkg/apis/types/submit_etjob.go
index 4cfaf6706..d578fcfec 100644
--- a/pkg/apis/types/submit_etjob.go
+++ b/pkg/apis/types/submit_etjob.go
@@ -8,9 +8,14 @@ type SubmitETJobArgs struct {
 	// SubmitTensorboardArgs stores tensorboard information
 	SubmitTensorboardArgs `yaml:",inline"`
 	// SubmitSyncCodeArgs stores syncing code information
-	SubmitSyncCodeArgs `yaml:",inline"`
-	MaxWorkers         int `yaml:"maxWorkers"`
-	MinWorkers         int `yaml:"minWorkers"`
+	SubmitSyncCodeArgs  `yaml:",inline"`
+	MaxWorkers          int               `yaml:"maxWorkers"`
+	MinWorkers          int               `yaml:"minWorkers"`
+	LauncherSelectors   map[string]string `yaml:"launcherSelectors"`   // --launcher-selector
+	JobRestartPolicy    string            `yaml:"jobRestartPolicy"`    // --job-restart-policy
+	WorkerRestartPolicy string            `yaml:"workerRestartPolicy"` // --worker-restart-policy
+	JobBackoffLimit     int               `yaml:"jobBackoffLimit"`     // --job-backoff-limit
+
 }
 
 type ScaleETJobArgs struct {
diff --git a/pkg/argsbuilder/const.go b/pkg/argsbuilder/const.go
index 7c7e5a3c7..b52ee8b61 100644
--- a/pkg/argsbuilder/const.go
+++ b/pkg/argsbuilder/const.go
@@ -9,4 +9,8 @@ const (
 	aliyunENIAnnotation = "k8s.aliyun.com/eni"
 
 	jobSuspend = "scheduling.x-k8s.io/suspend"
+
+	spotInstanceAnnotation = "job-supervisor.kube-ai.io/spot-instance"
+
+	maxWaitTimeAnnotation = "job-supervisor.kube-ai.io/max-wait-time"
 )
diff --git a/pkg/argsbuilder/submit_etjob.go b/pkg/argsbuilder/submit_etjob.go
index bdaca2c14..85f77c310 100644
--- a/pkg/argsbuilder/submit_etjob.go
+++ b/pkg/argsbuilder/submit_etjob.go
@@ -15,10 +15,14 @@ package argsbuilder
 
 import (
 	"fmt"
-	"k8s.io/apimachinery/pkg/api/resource"
 	"reflect"
+	"strconv"
 	"strings"
 
+	log "github.com/sirupsen/logrus"
+
+	"k8s.io/apimachinery/pkg/api/resource"
+
 	"github.com/kubeflow/arena/pkg/apis/types"
 	"github.com/spf13/cobra"
 )
@@ -68,10 +72,19 @@ func (s *SubmitETJobArgsBuilder) AddCommandFlags(command *cobra.Command) {
 	for name := range s.subBuilders {
 		s.subBuilders[name].AddCommandFlags(command)
 	}
+	var launcherSelectors []string
 	command.Flags().StringVar(&s.args.Cpu, "cpu", "", "the cpu resource to use for the training, like 1 for 1 core.")
 	command.Flags().StringVar(&s.args.Memory, "memory", "", "the memory resource to use for the training, like 1Gi.")
 	command.Flags().IntVar(&s.args.MaxWorkers, "max-workers", 1000, "the max worker number to run the distributed training.")
 	command.Flags().IntVar(&s.args.MinWorkers, "min-workers", 1, "the min worker number to run the distributed training.")
+	command.Flags().BoolVar(&s.args.EnableSpotInstance, "spot-instance", false, "EnableSpotInstance enables the feature of SuperVisor manager spot instance training")
+	command.Flags().IntVar(&s.args.MaxWaitTime, "max-wait-time", 0, "MaxWaitTime stores the maximum length of time a job waits for resources")
+	command.Flags().StringArrayVarP(&launcherSelectors, "launcher-selector", "", []string{}, `assigning launcher pod to some k8s particular nodes, usage: "--launcher-selector=key=value" or "--launcher-selector key=value" `)
+	command.Flags().StringVar(&s.args.JobRestartPolicy, "job-restart-policy", "", "training job restart policy, support: Never and OnFailure")
+	command.Flags().StringVar(&s.args.WorkerRestartPolicy, "worker-restart-policy", "", "training job worker restart policy, support: Never/OnFailure/Always/ExitCode")
+	command.Flags().IntVar(&s.args.JobBackoffLimit, "job-backoff-limit", 6, "the max restart count of trainingjob, default is six")
+
+	s.argValues["launcher-selector"] = &launcherSelectors
 }
 
 func (s *SubmitETJobArgsBuilder) PreBuild() error {
@@ -96,6 +109,15 @@ func (s *SubmitETJobArgsBuilder) Build() error {
 	if err := s.addWorkerToEnv(); err != nil {
 		return nil
 	}
+	if err := s.setSpotInstance(); err != nil {
+		return nil
+	}
+	if err := s.setMaxWaitTime(); err != nil {
+		return nil
+	}
+	if err := s.setLauncherSelectors(); err != nil {
+		return nil
+	}
 	return nil
 }
 
@@ -126,3 +148,40 @@ func (s *SubmitETJobArgsBuilder) addWorkerToEnv() error {
 	s.args.Envs["minWorkers"] = fmt.Sprintf("%v", s.args.MinWorkers)
 	return nil
 }
+
+// setSpotInstance is used to add annotation for spot instance training
+func (s *SubmitETJobArgsBuilder) setSpotInstance() error {
+	if s.args.EnableSpotInstance {
+		if s.args.Annotations == nil {
+			s.args.Annotations = map[string]string{}
+		}
+		s.args.Annotations[spotInstanceAnnotation] = "true"
+	}
+	return nil
+}
+
+func (s *SubmitETJobArgsBuilder) setMaxWaitTime() error {
+	if s.args.MaxWaitTime > 0 {
+		if s.args.Annotations == nil {
+			s.args.Annotations = map[string]string{}
+		}
+		s.args.Annotations[maxWaitTimeAnnotation] = strconv.Itoa(s.args.MaxWaitTime)
+	}
+	return nil
+}
+
+func (m *SubmitETJobArgsBuilder) setLauncherSelectors() error {
+	log.Debug("begin setLauncherSelector")
+	m.args.LauncherSelectors = map[string]string{}
+	argKey := "launcher-selector"
+	var LauncherSelectors *[]string
+	value, ok := m.argValues[argKey]
+	if !ok {
+		log.Warnf("Fail to get key: %s", argKey)
+		return nil
+	}
+	LauncherSelectors = value.(*[]string)
+	m.args.LauncherSelectors = transformSliceToMap(*LauncherSelectors, "=")
+	log.Debugf("success to transform launcher selector: %v", m.args.LauncherSelectors)
+	return nil
+}
diff --git a/pkg/model/model.go b/pkg/model/model.go
index 92d9e41d7..c86840c60 100644
--- a/pkg/model/model.go
+++ b/pkg/model/model.go
@@ -263,7 +263,7 @@ func (m *modelJob) Params() map[string]string {
 	for _, value := range arr {
 		if strings.HasPrefix(value, "--") {
 			kv := strings.Split(value, "=")
-			params[kv[0]] = kv[1]
+			params[fmt.Sprintf("--%s", kv[0])] = value[len(kv[0])+1:]
 		}
 	}
 	return params
diff --git a/pkg/serving/update.go b/pkg/serving/update.go
index 666045f22..451e1751e 100644
--- a/pkg/serving/update.go
+++ b/pkg/serving/update.go
@@ -32,17 +32,18 @@ func UpdateTensorflowServing(args *types.UpdateTensorFlowServingArgs) error {
 			if strings.HasSuffix(servingArgs, "\n") {
 				servingArgs = servingArgs[:len(servingArgs)-2]
 			}
-			arr := strings.Split(servingArgs, " ")
-
+			arr := strings.Split(servingArgs, "--")
 			params := make(map[string]string)
-			for i := 1; i < len(arr); i++ {
-				pair := strings.Split(arr[i], "=")
-				if len(pair) == 0 {
+			for index, argItem := range arr {
+				if index == 0 {
+					continue
+				}
+				pair := strings.Split(argItem, "=")
+				if len(pair) <= 1 {
 					continue
 				}
-				params[pair[0]] = pair[1]
+				params[fmt.Sprintf("--%s", pair[0])] = argItem[len(pair[0])+1:]
 			}
-
 			if args.ModelName != "" {
 				params["--model_name"] = args.ModelName
 			}
@@ -93,13 +94,26 @@ func UpdateTensorflowServing(args *types.UpdateTensorFlowServingArgs) error {
 		if deploy.Spec.Template.Spec.Tolerations == nil {
 			deploy.Spec.Template.Spec.Tolerations = []v1.Toleration{}
 		}
+		mapSet := make(map[string]interface{})
+		for _, toleration := range deploy.Spec.Template.Spec.Tolerations {
+			mapSet[fmt.Sprintf("%s=%s:%s,%s", toleration.Key,
+				toleration.Value,
+				toleration.Effect,
+				toleration.Operator)] = nil
+		}
 		for _, toleration := range args.Tolerations {
-			deploy.Spec.Template.Spec.Tolerations = append(deploy.Spec.Template.Spec.Tolerations, v1.Toleration{
-				Key:      toleration.Key,
-				Value:    toleration.Value,
-				Effect:   v1.TaintEffect(toleration.Effect),
-				Operator: v1.TolerationOperator(toleration.Operator),
-			})
+			if _, ok := mapSet[fmt.Sprintf("%s=%s:%s,%s", toleration.Key,
+				toleration.Value,
+				toleration.Effect,
+				toleration.Operator)]; !ok {
+				deploy.Spec.Template.Spec.Tolerations = append(deploy.Spec.Template.Spec.Tolerations, v1.Toleration{
+					Key:      toleration.Key,
+					Value:    toleration.Value,
+					Effect:   v1.TaintEffect(toleration.Effect),
+					Operator: v1.TolerationOperator(toleration.Operator),
+				})
+			}
+
 		}
 	}
 
@@ -119,12 +133,18 @@ func UpdateTritonServing(args *types.UpdateTritonServingArgs) error {
 		if strings.HasSuffix(servingArgs, "\n") {
 			servingArgs = servingArgs[:len(servingArgs)-2]
 		}
-		arr := strings.Split(servingArgs, " ")
+		arr := strings.Split(servingArgs, "--")
 
 		params := make(map[string]string)
-		for i := 1; i < len(arr); i++ {
-			pair := strings.Split(arr[i], "=")
-			params[pair[0]] = pair[1]
+		for index, argItem := range arr {
+			if index == 0 {
+				continue
+			}
+			pair := strings.Split(argItem, "=")
+			if len(pair) <= 1 {
+				continue
+			}
+			params[fmt.Sprintf("--%s", pair[0])] = argItem[len(pair[0])+1:]
 		}
 
 		if args.ModelRepository != "" {
diff --git a/pkg/training/const.go b/pkg/training/const.go
index 899a6bd68..1fc60a55e 100644
--- a/pkg/training/const.go
+++ b/pkg/training/const.go
@@ -40,6 +40,8 @@ const (
 	aliyunENIAnnotation = "k8s.aliyun.com/eni"
 
 	requestGPUsOfJobAnnoKey = "requestGPUsOfJobOwner"
+
+	spotInstanceJobStatusAnnotation = "job-supervisor.kube-ai.io/job-status"
 )
 
 var (
diff --git a/pkg/training/trainer_et.go b/pkg/training/trainer_et.go
index 17b5cb4af..40b9591b3 100644
--- a/pkg/training/trainer_et.go
+++ b/pkg/training/trainer_et.go
@@ -97,6 +97,8 @@ func (ej *ETJob) GetStatus() (status string) {
 		status = "PENDING"
 	} else if ej.isScaling() {
 		status = "SCALING"
+	} else if ej.isMaxWaitTimeExceeded() {
+		status = "MAX_WAIT_TIME_EXCEEDED"
 	} else {
 		status = "RUNNING"
 	}
@@ -214,7 +216,7 @@ func (ej *ETJob) isSucceeded() bool {
 }
 
 func (ej *ETJob) isFailed() bool {
-	return ej.trainingjob.Status.Phase == "Failed"
+	return ej.trainingjob.Status.Phase == "Failed" && !ej.hasMaxWaitTimeExceededAnnotation()
 }
 
 func (ej *ETJob) isScaling() bool {
@@ -231,6 +233,19 @@ func (ej *ETJob) isPending() bool {
 	return false
 }
 
+func (ej *ETJob) hasMaxWaitTimeExceededAnnotation() bool {
+	if val, ok := ej.trainingjob.Annotations[spotInstanceJobStatusAnnotation]; ok {
+		if val == "timeout" {
+			return true
+		}
+	}
+	return false
+}
+
+func (ej *ETJob) isMaxWaitTimeExceeded() bool {
+	return ej.trainingjob.Status.Phase == "Failed" && ej.hasMaxWaitTimeExceededAnnotation()
+}
+
 // ET Job trainer
 type ETJobTrainer struct {
 	client      *kubernetes.Clientset
diff --git a/sdk/arena-java-sdk/src/main/java/com/github/kubeflow/arena/model/training/ETJobBuilder.java b/sdk/arena-java-sdk/src/main/java/com/github/kubeflow/arena/model/training/ETJobBuilder.java
index dbc020081..4f2b79f40 100644
--- a/sdk/arena-java-sdk/src/main/java/com/github/kubeflow/arena/model/training/ETJobBuilder.java
+++ b/sdk/arena-java-sdk/src/main/java/com/github/kubeflow/arena/model/training/ETJobBuilder.java
@@ -1,6 +1,7 @@
 package com.github.kubeflow.arena.model.training;
 
 import com.github.kubeflow.arena.enums.TrainingJobType;
+import com.github.kubeflow.arena.model.fields.BoolField;
 import com.github.kubeflow.arena.model.fields.StringField;
 
 import java.util.ArrayList;
@@ -31,6 +32,16 @@ public ETJobBuilder memory(String m) {
         return this;
     }
 
+    public ETJobBuilder enableSpotInstance() {
+        this.options.add(new BoolField("--spot-instance"));
+        return this;
+    }
+
+    public ETJobBuilder maxWaitTime(int time) {
+        this.options.add(new StringField("--max-wait-time", String.valueOf(time)));
+        return this;
+    }
+
 
     /**
      * following functions invoke JobBuilder functions
diff --git a/sdk/arena-python-sdk/arenasdk/training/et_job_builder.py b/sdk/arena-python-sdk/arenasdk/training/et_job_builder.py
index 9f3bc675d..1262217f5 100644
--- a/sdk/arena-python-sdk/arenasdk/training/et_job_builder.py
+++ b/sdk/arena-python-sdk/arenasdk/training/et_job_builder.py
@@ -7,30 +7,38 @@
 class ETJobBuilder(JobBuilder):
     def __init__(self):
         super().__init__(TrainingJobType.ETTrainingJob)
-        
+
     def with_min_workers(self,workers: int) -> ETJobBuilder:
         self._options.append(StringField("--min-workers",workers))
-        return self 
+        return self
 
     def with_max_workers(self,workers: int) -> ETJobBuilder:
         self._options.append(StringField("--max-workers",workers))
-        return self 
+        return self
 
     def with_cpu(self,cpu: str) -> ETJobBuilder:
         self._options.append(StringField("--cpu",cpu))
-        return self 
+        return self
 
     def with_memory(self,memory: str) -> ETJobBuilder:
         self._options.append(StringField("--memory",memory))
         return self
-    
+
+    def enable_spot_instance(self) -> ETJobBuilder:
+        self._options.append(BoolField("--spot-instance"))
+        return self
+
+    def with_max_wait_time(self,time: int) -> ETJobBuilder:
+        self._options.append(StringField("--max-wait-time",time))
+        return self
+
     def with_name(self,name: str) -> ETJobBuilder:
         super().with_name(name)
-        return self 
+        return self
 
     def with_image(self,image: str) -> ETJobBuilder:
         super().with_image(image)
-        return self 
+        return self
 
     def with_workers(self,count: int) -> ETJobBuilder:
         super().with_workers(count)
@@ -39,7 +47,7 @@ def with_workers(self,count: int) -> ETJobBuilder:
     def with_image_pull_secrets(self,secrets: List[str]) -> ETJobBuilder:
         super().with_image_pull_secrets(secrets)
         return self
-    
+
     def with_gpus(self,count: int) -> ETJobBuilder:
         super().with_gpus(count)
         return self
@@ -55,10 +63,10 @@ def with_node_selectors(self,selectors: Dict[str, str]) -> ETJobBuilder:
     def with_tolerations(self,tolerations: List[str]) -> ETJobBuilder:
         super().with_tolerations(tolerations)
         return self
-    
+
     def with_config_files(self,files: Dict[str, str]) -> ETJobBuilder:
         super().with_config_files(files)
-        return self 
+        return self
 
     def with_annotations(self,annotions: Dict[str, str]) -> ETJobBuilder:
         super().with_annotations(annotions)
@@ -66,8 +74,8 @@ def with_annotations(self,annotions: Dict[str, str]) -> ETJobBuilder:
 
     def with_datas(self,datas: Dict[str,str]) -> ETJobBuilder:
         super().with_datas(datas)
-        return self 
-    
+        return self
+
     def with_data_dirs(self,data_dirs: Dict[str, str]) -> ETJobBuilder:
         super().with_data_dirs(data_dirs)
         return  self
@@ -79,26 +87,26 @@ def with_log_dir(self,dir: str) -> ETJobBuilder:
     def with_priority(self,priority: str) -> ETJobBuilder:
         super().with_priority(priority)
         return  self
-    
+
     def enable_rdma(self) -> ETJobBuilder:
         super().enable_rdma()
         return  self
-    
+
     def with_sync_image(self,image: str) -> ETJobBuilder:
         super().with_sync_image(image)
         return  self
 
     def with_sync_mode(self,mode: str) -> ETJobBuilder:
         super().with_sync_mode(mode)
-        return  self 
+        return  self
 
     def with_sync_source(self,source: str) -> ETJobBuilder:
         super().with_sync_source(source)
         return  self
-    
+
     def enable_tensorboard(self) -> ETJobBuilder:
         super().enable_tensorboard()
-        return self 
+        return self
 
     def with_tensorboard_image(self,image: str) -> ETJobBuilder:
         super().with_tensorboard_image(image)
@@ -110,7 +118,7 @@ def with_working_dir(self,dir: str) -> ETJobBuilder:
 
     def with_retry_count(self,count: int) -> ETJobBuilder:
         super().with_retry_count(count)
-        return self 
+        return self
 
     def enable_coscheduling(self) -> ETJobBuilder:
         super().enable_coscheduling()

From 02cc5ef067d9228897f12aeeaaa4cdb5ca1cd236 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=80=9A=E5=BB=B7?= <tingshuai.yts@hotmail.com>
Date: Wed, 22 Feb 2023 20:27:34 +0800
Subject: [PATCH 2/2] add jobsupervisor

---
 arena-artifacts/Chart.yaml                    |   5 +
 .../charts/job-supervisor/.helmignore         |  23 ++++
 .../charts/job-supervisor/Chart.yaml          |  24 ++++
 .../job-supervisor/templates/deployment.yaml  |  49 ++++++++
 .../charts/job-supervisor/templates/rbac.yaml | 112 ++++++++++++++++++
 .../charts/job-supervisor/values.yaml         |   3 +
 arena-artifacts/values.yaml                   |   8 ++
 7 files changed, 224 insertions(+)
 create mode 100644 arena-artifacts/charts/job-supervisor/.helmignore
 create mode 100644 arena-artifacts/charts/job-supervisor/Chart.yaml
 create mode 100644 arena-artifacts/charts/job-supervisor/templates/deployment.yaml
 create mode 100644 arena-artifacts/charts/job-supervisor/templates/rbac.yaml
 create mode 100644 arena-artifacts/charts/job-supervisor/values.yaml

diff --git a/arena-artifacts/Chart.yaml b/arena-artifacts/Chart.yaml
index 95fc2da15..f2a0ebcca 100644
--- a/arena-artifacts/Chart.yaml
+++ b/arena-artifacts/Chart.yaml
@@ -59,3 +59,8 @@ dependencies:
     version: 0.1.0
     repository: "@gpu-exporter"
     condition: exporter.enabled,global.exporter.enabled
+  - name: job-supervisor
+    alias: job-supervisor
+    version: 0.1.0
+    repository: "@job-supervisor"
+    condition: job-supervisor.enabled,global.job-supervisor.enabled
\ No newline at end of file
diff --git a/arena-artifacts/charts/job-supervisor/.helmignore b/arena-artifacts/charts/job-supervisor/.helmignore
new file mode 100644
index 000000000..0e8a0eb36
--- /dev/null
+++ b/arena-artifacts/charts/job-supervisor/.helmignore
@@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/arena-artifacts/charts/job-supervisor/Chart.yaml b/arena-artifacts/charts/job-supervisor/Chart.yaml
new file mode 100644
index 000000000..8c06625b5
--- /dev/null
+++ b/arena-artifacts/charts/job-supervisor/Chart.yaml
@@ -0,0 +1,24 @@
+apiVersion: v2
+name: job-supervisor
+description: A Helm chart for Kubernetes
+
+# A chart can be either an 'application' or a 'library' chart.
+#
+# Application charts are a collection of templates that can be packaged into versioned archives
+# to be deployed.
+#
+# Library charts provide useful utilities or functions for the chart developer. They're included as
+# a dependency of application charts to inject those utilities and functions into the rendering
+# pipeline. Library charts do not define any templates and therefore cannot be deployed.
+type: application
+
+# This is the chart version. This version number should be incremented each time you make changes
+# to the chart and its templates, including the app version.
+# Versions are expected to follow Semantic Versioning (https://semver.org/)
+version: 0.1.0
+
+# This is the version number of the application being deployed. This version number should be
+# incremented each time you make changes to the application. Versions are not expected to
+# follow Semantic Versioning. They should reflect the version the application is using.
+# It is recommended to use it with quotes.
+appVersion: "v1.0.0-aliyun"
\ No newline at end of file
diff --git a/arena-artifacts/charts/job-supervisor/templates/deployment.yaml b/arena-artifacts/charts/job-supervisor/templates/deployment.yaml
new file mode 100644
index 000000000..f226e5920
--- /dev/null
+++ b/arena-artifacts/charts/job-supervisor/templates/deployment.yaml
@@ -0,0 +1,49 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    app: job-supervisor
+    {{- include "arena.labels" . | nindent 4 }}
+  name: job-supervisor
+  namespace: {{ .Release.Namespace }}
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: job-supervisor
+      {{- include "arena.labels" . | nindent 6 }}
+  strategy:
+    rollingUpdate:
+      maxSurge: 25%
+      maxUnavailable: 25%
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        {{- include "arena.labels" . | nindent 8 }}
+        app: job-supervisor
+    spec:
+      nodeSelector:
+        {{- include "arena.nodeSelector" . | nindent 8 }}
+        {{- include "arena.nonEdgeNodeSelector" . | nindent 8 }}
+      tolerations:
+      {{- include "arena.tolerateNonEdgeNodeSelector" . | nindent 6 }}
+      containers:
+        - command:
+            - /job-supervisor
+          image: {{ include "arena.imagePrefix" . }}/{{ .Values.image }}:{{ .Values.tag }}
+          imagePullPolicy: {{ .Values.imagePullPolicy }}
+          name: job-supervisor
+          resources:
+            limits:
+              cpu: 300m
+              memory: 500Mi
+            requests:
+              cpu: 100m
+              memory: 300Mi
+      dnsPolicy: ClusterFirst
+      restartPolicy: Always
+      schedulerName: default-scheduler
+      serviceAccount: job-supervisor
+      serviceAccountName: job-supervisor
+      terminationGracePeriodSeconds: 30
\ No newline at end of file
diff --git a/arena-artifacts/charts/job-supervisor/templates/rbac.yaml b/arena-artifacts/charts/job-supervisor/templates/rbac.yaml
new file mode 100644
index 000000000..4435f96b2
--- /dev/null
+++ b/arena-artifacts/charts/job-supervisor/templates/rbac.yaml
@@ -0,0 +1,112 @@
+
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: job-supervisor
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "arena.labels" . | nindent 4 }}
+
+---
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: job-supervisor
+  labels:
+    {{- include "arena.labels" . | nindent 4 }}
+rules:
+- apiGroups:
+  - ""
+  resources:
+  - configmaps
+  - endpoints
+  - events
+  - namespaces
+  - serviceaccounts
+  - secrets
+  - persistentvolumeclaims
+  - pods
+  - pods/log
+  - pods/exec
+  - services
+  - nodes
+  verbs:
+  - '*'
+- apiGroups:
+  - ""
+  - apps
+  - extensions
+  resources:
+  - deployments
+  - daemonsets
+  - replicasets
+  - statefulsets
+  verbs:
+  - '*'
+- apiGroups:
+  - rbac.authorization.k8s.io
+  resources:
+  - roles
+  - rolebindings
+  - clusterroles
+  - clusterrolebindings
+  verbs:
+  - '*'
+- apiGroups:
+  - apiextensions.k8s.io
+  resources:
+  - customresourcedefinitions
+  verbs:
+  - '*'
+- apiGroups:
+  - apps.kubedl.io
+  resources:
+  - '*'
+  verbs:
+  - '*'
+- apiGroups:
+  - kubeflow.org
+  resources:
+  - '*'
+  verbs:
+  - '*'
+- apiGroups:
+  - kai.alibabacloud.com
+  resources:
+  - '*'
+  verbs:
+  - '*'
+- apiGroups:
+  - batch
+  resources:
+  - jobs
+  verbs:
+  - '*'
+- apiGroups:
+  - storage.k8s.io
+  resources:
+  - storageclasses
+  verbs:
+  - '*'
+- apiGroups:
+  - tensorflow.org
+  resources:
+  - '*'
+  verbs:
+  - '*'
+---
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: job-supervisor
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "arena.labels" . | nindent 4 }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: job-supervisor
+subjects:
+- kind: ServiceAccount
+  name: job-supervisor
+  namespace: {{ .Release.Namespace }}
\ No newline at end of file
diff --git a/arena-artifacts/charts/job-supervisor/values.yaml b/arena-artifacts/charts/job-supervisor/values.yaml
new file mode 100644
index 000000000..407905dda
--- /dev/null
+++ b/arena-artifacts/charts/job-supervisor/values.yaml
@@ -0,0 +1,3 @@
+# Default values for job-supervisor
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
\ No newline at end of file
diff --git a/arena-artifacts/values.yaml b/arena-artifacts/values.yaml
index ebea3e6a1..49b2fe824 100644
--- a/arena-artifacts/values.yaml
+++ b/arena-artifacts/values.yaml
@@ -124,3 +124,11 @@ exporter:
   tag: v1.0.1-b2c2f9b
   imagePullPolicy: IfNotPresent
   resources: {}
+
+# job-supervisor
+job-supervisor:
+  enabled: true
+  image: acs/job-supervisor
+  tag: v0.1.1-3e98633-aliyun
+  imagePullPolicy: IfNotPresent
+  nodeSelector: {}
\ No newline at end of file