Skip to content

Commit 1af147a

Browse files
authored
[Spot] Fix race condition for spot logs (skypilot-org#1329)
* Fix race condition for spot logs * fix * fix comment * address comments * add comment
1 parent 7602b25 commit 1af147a

File tree

1 file changed

+14
-5
lines changed

1 file changed

+14
-5
lines changed

sky/spot/spot_utils.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -230,15 +230,24 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
230230
cluster_name = generate_spot_cluster_name(task_name, job_id)
231231
backend = backends.CloudVmRayBackend()
232232
spot_status = spot_state.get_status(job_id)
233-
while not spot_status.is_terminal():
234-
if spot_status != spot_state.SpotStatus.RUNNING:
235-
logger.info(f'INFO: The log is not ready yet, as the spot job '
236-
f'is {spot_status.value}. '
233+
# spot_status can be None if the controller process just started and has
234+
# not updated the spot status yet.
235+
while spot_status is None or not spot_status.is_terminal():
236+
handle = global_user_state.get_handle_from_cluster_name(cluster_name)
237+
# Check the handle: The cluster can be preempted and removed from the
238+
# table before the spot state is updated by the controller. In this
239+
# case, we should skip the logging, and wait for the next round of
240+
# status check.
241+
if handle is None or spot_status != spot_state.SpotStatus.RUNNING:
242+
status_help_str = ''
243+
if (spot_status is not None and
244+
spot_status != spot_state.SpotStatus.RUNNING):
245+
status_help_str = f', as the spot job is {spot_status.value}'
246+
logger.info(f'INFO: The log is not ready yet{status_help_str}. '
237247
f'Waiting for {JOB_STATUS_CHECK_GAP_SECONDS} seconds.')
238248
time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
239249
spot_status = spot_state.get_status(job_id)
240250
continue
241-
handle = global_user_state.get_handle_from_cluster_name(cluster_name)
242251
returncode = backend.tail_logs(handle,
243252
job_id=None,
244253
spot_job_id=job_id,

0 commit comments

Comments
 (0)