@@ -85,30 +85,6 @@ def _run(self):
             job_status = spot_utils.get_job_status(self._backend,
                                                    self._cluster_name)

-            if job_status is not None and not job_status.is_terminal():
-                need_recovery = False
-                if self._task.num_nodes > 1:
-                    # Check the cluster status for multi-node jobs, since the
-                    # job may not be set to FAILED immediately when only some
-                    # of the nodes are preempted.
-                    (cluster_status,
-                     handle) = backend_utils.refresh_cluster_status_handle(
-                         self._cluster_name, force_refresh=True)
-                    if cluster_status != global_user_state.ClusterStatus.UP:
-                        # recover the cluster if it is not up.
-                        # The status could be None when the cluster is preempted
-                        # right after the job was found FAILED.
-                        cluster_status_str = ('is preempted'
-                                              if cluster_status is None else
-                                              f'status {cluster_status.value}')
-                        logger.info(f'Cluster {cluster_status_str}. '
-                                    'Recovering...')
-                        need_recovery = True
-                if not need_recovery:
-                    # The job and cluster are healthy, continue to monitor the
-                    # job status.
-                    continue
-
             if job_status == job_lib.JobStatus.SUCCEEDED:
                 end_time = spot_utils.get_job_timestamp(self._backend,
                                                         self._cluster_name,
@@ -117,14 +93,35 @@ def _run(self):
                 spot_state.set_succeeded(self._job_id, end_time=end_time)
                 break

-            if job_status == job_lib.JobStatus.FAILED:
-                # Check the status of the spot cluster. If it is not UP,
-                # the cluster is preempted.
-                (cluster_status,
-                 handle) = backend_utils.refresh_cluster_status_handle(
-                     self._cluster_name, force_refresh=True)
-                if cluster_status == global_user_state.ClusterStatus.UP:
-                    # The user code has probably crashed.
+            # For single-node jobs, nonterminated job_status indicates a
+            # healthy cluster. We can safely continue monitoring.
+            # For multi-node jobs, since the job may not be set to FAILED
+            # immediately (depending on user program) when only some of the
+            # nodes are preempted, need to check the actual cluster status.
+            if (job_status is not None and not job_status.is_terminal() and
+                    self._task.num_nodes == 1):
+                continue
+
+            # Pull the actual cluster status from the cloud provider to
+            # determine whether the cluster is preempted.
+            (cluster_status,
+             handle) = backend_utils.refresh_cluster_status_handle(
+                 self._cluster_name, force_refresh=True)
+
+            if cluster_status != global_user_state.ClusterStatus.UP:
+                # The cluster is (partially) preempted. It can be down, INIT
+                # or STOPPED, based on the interruption behavior of the cloud.
+                # Spot recovery is needed (will be done later in the code).
+                cluster_status_str = ('' if cluster_status is None else
+                                      f' (status: {cluster_status.value})')
+                logger.info(
+                    f'Cluster is preempted{cluster_status_str}. Recovering...')
+            else:
+                if job_status is not None and not job_status.is_terminal():
+                    # The multi-node job is still running, continue monitoring.
+                    continue
+                elif job_status == job_lib.JobStatus.FAILED:
+                    # The user code has probably crashed, fail immediately.
                     end_time = spot_utils.get_job_timestamp(self._backend,
                                                             self._cluster_name,
                                                             get_end_time=True)
@@ -140,11 +137,16 @@ def _run(self):
                         failure_type=spot_state.SpotStatus.FAILED,
                         end_time=end_time)
                     break
-            # cluster can be down, INIT or STOPPED, based on the interruption
-            # behavior of the cloud.
-            # Failed to connect to the cluster or the cluster is partially down.
-            # job_status is None or job_status == job_lib.JobStatus.FAILED
-            logger.info('The cluster is preempted.')
+                # Although the cluster is healthy, we fail to access the
+                # job status. Try to recover the job (will not restart the
+                # cluster, if the cluster is healthy).
+                assert job_status is None, job_status
+                logger.info('Failed to fetch the job status while the '
+                            'cluster is healthy. Try to recover the job '
+                            '(the cluster will not be restarted).')
+
+            # Try to recover the spot jobs, when the cluster is preempted
+            # or the job status is failed to be fetched.
             spot_state.set_recovering(self._job_id)
             recovered_time = self._strategy_executor.recover()
             spot_state.set_recovered(self._job_id,
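For readers skimming the diff, the branch order introduced here can be summarized as a small, pure decision function. The sketch below is only an illustration of the new control flow: JobStatus, ClusterStatus, Decision, and decide are simplified stand-ins (not the real job_lib / global_user_state classes), and it takes cluster_status as a plain argument, whereas the actual loop only refreshes the cluster status after the single-node shortcut has not fired.

# Hypothetical sketch of the recovery decision this diff introduces.
# JobStatus, ClusterStatus, and Decision are simplified stand-ins, not
# the real skypilot classes.
import enum
from typing import Optional


class JobStatus(enum.Enum):
    RUNNING = 'RUNNING'
    SUCCEEDED = 'SUCCEEDED'
    FAILED = 'FAILED'

    def is_terminal(self) -> bool:
        return self in (JobStatus.SUCCEEDED, JobStatus.FAILED)


class ClusterStatus(enum.Enum):
    UP = 'UP'
    INIT = 'INIT'
    STOPPED = 'STOPPED'


class Decision(enum.Enum):
    CONTINUE_MONITORING = 'continue'
    MARK_SUCCEEDED = 'succeeded'
    MARK_FAILED = 'failed'
    RECOVER = 'recover'


def decide(job_status: Optional[JobStatus],
           cluster_status: Optional[ClusterStatus],
           num_nodes: int) -> Decision:
    """Mirrors the branch order of the updated _run() loop."""
    if job_status == JobStatus.SUCCEEDED:
        return Decision.MARK_SUCCEEDED
    # A non-terminal job on a single-node cluster implies the cluster is
    # healthy; only multi-node jobs need the extra cluster-status check.
    if (job_status is not None and not job_status.is_terminal() and
            num_nodes == 1):
        return Decision.CONTINUE_MONITORING
    if cluster_status != ClusterStatus.UP:
        # (Partially) preempted; the status may be None, INIT or STOPPED.
        return Decision.RECOVER
    if job_status is not None and not job_status.is_terminal():
        # Multi-node job still running on a healthy cluster.
        return Decision.CONTINUE_MONITORING
    if job_status == JobStatus.FAILED:
        # Healthy cluster but the user code failed: do not recover.
        return Decision.MARK_FAILED
    # Healthy cluster, but the job status could not be fetched.
    assert job_status is None, job_status
    return Decision.RECOVER

For example, decide(None, ClusterStatus.UP, 4) returns Decision.RECOVER, matching the new branch that recovers the job when the status cannot be fetched even though the cluster is healthy.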