@@ -230,15 +230,24 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
230
230
cluster_name = generate_spot_cluster_name (task_name , job_id )
231
231
backend = backends .CloudVmRayBackend ()
232
232
spot_status = spot_state .get_status (job_id )
233
- while not spot_status .is_terminal ():
234
- if spot_status != spot_state .SpotStatus .RUNNING :
235
- logger .info (f'INFO: The log is not ready yet, as the spot job '
236
- f'is { spot_status .value } . '
233
+ # spot_status can be None if the controller process just started and has
234
+ # not updated the spot status yet.
235
+ while spot_status is None or not spot_status .is_terminal ():
236
+ handle = global_user_state .get_handle_from_cluster_name (cluster_name )
237
+ # Check the handle: The cluster can be preempted and removed from the
238
+ # table before the spot state is updated by the controller. In this
239
+ # case, we should skip the logging, and wait for the next round of
240
+ # status check.
241
+ if handle is None or spot_status != spot_state .SpotStatus .RUNNING :
242
+ status_help_str = ''
243
+ if (spot_status is not None and
244
+ spot_status != spot_state .SpotStatus .RUNNING ):
245
+ status_help_str = f', as the spot job is { spot_status .value } '
246
+ logger .info (f'INFO: The log is not ready yet{ status_help_str } . '
237
247
f'Waiting for { JOB_STATUS_CHECK_GAP_SECONDS } seconds.' )
238
248
time .sleep (JOB_STATUS_CHECK_GAP_SECONDS )
239
249
spot_status = spot_state .get_status (job_id )
240
250
continue
241
- handle = global_user_state .get_handle_from_cluster_name (cluster_name )
242
251
returncode = backend .tail_logs (handle ,
243
252
job_id = None ,
244
253
spot_job_id = job_id ,
0 commit comments