@@ -80,7 +80,7 @@ def launch(self) -> Optional[float]:
         It can fail if resource is not available. Need to check the cluster
         status, after calling.

-        Returns: The job's start timestamp, or None if failed to start.
+        Returns: The job's submit timestamp, or None if failed.
         """
         if self.retry_until_up:
             return self._launch(max_retry=None)
@@ -140,8 +140,9 @@ def _launch(self, max_retry=3, raise_on_failure=True) -> Optional[float]:
             raise_on_failure: Whether to raise an exception if the launch fails.

         Returns:
-            The job's start timestamp, or None if failed to start and
-            raise_on_failure is False.
+            The job's submit timestamp, or None if failed to submit the job
+            (either provisioning fails or any error happens in job submission)
+            and raise_on_failure is False.
         """
         # TODO(zhwu): handle the failure during `preparing sky runtime`.
         retry_cnt = 0
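The rewritten Returns sections above make the contract explicit: the returned float is the job's submit timestamp, and None covers both provisioning failures and errors during job submission (when raise_on_failure is False). Below is a minimal sketch of a caller honoring that contract; the strategy parameter and the helper itself are illustrative additions, not part of this commit:

    import time
    from typing import Optional


    def wait_for_submission(strategy) -> float:
        # `strategy` is assumed to expose launch() -> Optional[float] with the
        # semantics documented above; this helper is hypothetical.
        job_submitted_at: Optional[float] = strategy.launch()
        if job_submitted_at is None:
            # None: the job was never submitted (provisioning or submission
            # failed and retry_until_up was not set).
            raise RuntimeError('Failed to submit the spot job.')
        # The timestamp marks submission, not the start of execution.
        print(f'Job submitted {time.time() - job_submitted_at:.0f}s ago.')
        return job_submitted_at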
@@ -152,8 +153,11 @@ def _launch(self, max_retry=3, raise_on_failure=True) -> Optional[float]:
             exception = None
             try:
                 usage_lib.messages.usage.set_internal()
+                # Detach setup, so that the setup failure can be detected
+                # by the controller process (job_status -> FAILED_SETUP).
                 sky.launch(self.dag,
                            cluster_name=self.cluster_name,
+                           detach_setup=True,
                            detach_run=True,
                            _is_launched_by_spot_controller=True)
                 logger.info('Spot cluster launched.')
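With setup detached, sky.launch() returns without waiting on the task's setup commands, so per the new comment a setup failure is expected to surface later through the job's status rather than as an exception from the launch call. A rough, self-contained illustration of that polling pattern follows; the JobStatus enum and poll_status callable are hypothetical stand-ins, not SkyPilot's actual API:

    import enum
    import time
    from typing import Callable


    class JobStatus(enum.Enum):
        # Hypothetical stand-ins for the job states referenced in the comment.
        INIT = 'INIT'
        SETTING_UP = 'SETTING_UP'
        RUNNING = 'RUNNING'
        FAILED_SETUP = 'FAILED_SETUP'
        SUCCEEDED = 'SUCCEEDED'


    def wait_past_setup(poll_status: Callable[[], JobStatus],
                        interval: float = 10.0) -> JobStatus:
        # Poll until the job either fails its setup or starts running; with
        # setup detached, this is where a controller would observe the failure.
        while True:
            status = poll_status()
            if status in (JobStatus.FAILED_SETUP, JobStatus.RUNNING,
                          JobStatus.SUCCEEDED):
                return status
            time.sleep(interval)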
@@ -202,7 +206,7 @@ def _launch(self, max_retry=3, raise_on_failure=True) -> Optional[float]:
                     # The cluster can be preempted before the job is launched.
                     # Break to let the retry launch kick in.
                     logger.info('The cluster is preempted before the job '
-                                'starts.')
+                                'is submitted.')
                     # TODO(zhwu): we should recover the preemption with the
                     # recovery strategy instead of the current while loop.
                     retry_launch = True
@@ -223,11 +227,11 @@ def _launch(self, max_retry=3, raise_on_failure=True) -> Optional[float]:
                     continue

                 # Check the job status until it is not in initialized status
-                if status is not None and job_lib.JobStatus.PENDING < status:
+                if status is not None and status > job_lib.JobStatus.INIT:
                     try:
-                        launch_time = spot_utils.get_job_timestamp(
+                        job_submitted_at = spot_utils.get_job_timestamp(
                             self.backend, self.cluster_name, get_end_time=False)
-                        return launch_time
+                        return job_submitted_at
                     except Exception as e:  # pylint: disable=broad-except
                         # If we failed to get the job timestamp, we will retry
                         # job checking loop.
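The new condition status > job_lib.JobStatus.INIT assumes job statuses compare by lifecycle order, so any status past INIT (including PENDING) means the job has been submitted and its timestamp can be fetched. One illustrative way such an ordered status enum can be written (not the real job_lib definition):

    import enum
    import functools


    @functools.total_ordering
    class JobStatus(enum.Enum):
        # Values encode lifecycle order; later stages compare greater.
        INIT = 0
        PENDING = 1
        SETTING_UP = 2
        RUNNING = 3

        def __lt__(self, other: 'JobStatus') -> bool:
            return self.value < other.value


    assert JobStatus.PENDING > JobStatus.INIT      # a submitted job has left INIT
    assert not (JobStatus.INIT > JobStatus.INIT)   # INIT itself fails the check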
@@ -271,16 +275,16 @@ def __init__(self, cluster_name: str, backend: 'backends.Backend',
                                                     'sky.clouds.Region']] = None

     def _launch(self, max_retry=3, raise_on_failure=True) -> Optional[float]:
-        launch_time = super()._launch(max_retry, raise_on_failure)
-        if launch_time is not None:
+        job_submitted_at = super()._launch(max_retry, raise_on_failure)
+        if job_submitted_at is not None:
             # Only record the cloud/region if the launch is successful.
             handle = global_user_state.get_handle_from_cluster_name(
                 self.cluster_name)
             assert handle is not None, 'Cluster should be launched.'
             launched_resources = handle.launched_resources
             self._launched_cloud_region = (launched_resources.cloud,
                                            launched_resources.region)
-        return launch_time
+        return job_submitted_at

     def recover(self) -> float:
         # 1. Cancel the jobs and launch the cluster with the STOPPED status,
@@ -308,11 +312,11 @@ def recover(self) -> float:
                                            region=launched_region)
             task.set_resources({new_resources})
             # Not using self.launch to avoid the retry until up logic.
-            launched_time = self._launch(raise_on_failure=False)
+            job_submitted_at = self._launch(raise_on_failure=False)
             # Restore the original dag, i.e. reset the region constraint.
             task.set_resources({original_resources})
-            if launched_time is not None:
-                return launched_time
+            if job_submitted_at is not None:
+                return job_submitted_at

         # Step 2
         logger.debug('Terminating unhealthy spot cluster and '
@@ -324,9 +328,9 @@ def recover(self) -> float:
         logger.debug('Relaunch the cluster without constraining to prior '
                      'cloud/region.')
         # Not using self.launch to avoid the retry until up logic.
-        launched_time = self._launch(max_retry=self._MAX_RETRY_CNT,
-                                     raise_on_failure=False)
-        if launched_time is None:
+        job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
+                                        raise_on_failure=False)
+        if job_submitted_at is None:
             # Failed to launch the cluster.
             if self.retry_until_up:
                 gap_seconds = self.RETRY_INIT_GAP_SECONDS
@@ -339,4 +343,4 @@ def recover(self) -> float:
                     f'Failed to recover the spot cluster after retrying '
                     f'{self._MAX_RETRY_CNT} times.')

-        return launched_time
+        return job_submitted_at
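Taken together, the recover() hunks keep a fixed failover order and now propagate the job's submit timestamp at each step: retry in the previously launched cloud/region, terminate the unhealthy cluster, then relaunch without a region constraint. A condensed, self-contained restatement of that flow follows; the callables are parameters only to keep the sketch runnable, and it is not a drop-in replacement for the real method:

    from typing import Callable, Optional, Tuple

    # launch_in(cloud, region) returns the job's submit timestamp or None;
    # passing (None, None) means no cloud/region constraint.
    LaunchFn = Callable[[Optional[str], Optional[str]], Optional[float]]


    def recover(launch_in: LaunchFn,
                terminate: Callable[[], None],
                prior: Optional[Tuple[str, str]]) -> Optional[float]:
        # Step 1: retry in the cloud/region recorded at the last successful
        # launch, if any.
        if prior is not None:
            job_submitted_at = launch_in(*prior)
            if job_submitted_at is not None:
                return job_submitted_at
        # Step 2: give up on the unhealthy spot cluster.
        terminate()
        # Step 3: relaunch without constraining to the prior cloud/region.
        return launch_in(None, None)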