Skip to content

Commit 9165b3e

Browse files
authored
Fix spot terminal status (skypilot-org#1624)
* Fix spot terminal status
* fix smoke test
* longer waiting time
* longer
* fix FAILED_SETUP type
* format
* fix comment
1 parent b0894c8 commit 9165b3e

File tree

3 files changed

+7
-6
lines changed

3 files changed

+7
-6
lines changed

sky/spot/spot_state.py

+4-3
Original file line number | Diff line number | Diff line change
@@ -127,7 +127,7 @@ def colored_str(self):
127127
@classmethod
128128
def terminal_statuses(cls) -> List['SpotStatus']:
129129
return [
130-
cls.SUCCEEDED, cls.FAILED, cls.FAILED_NO_RESOURCE,
130+
cls.SUCCEEDED, cls.FAILED, cls.FAILED_SETUP, cls.FAILED_NO_RESOURCE,
131131
cls.FAILED_CONTROLLER, cls.CANCELLED
132132
]
133133

@@ -264,8 +264,9 @@ def set_failed(job_id: int,
264264
WHERE job_id=(?) AND end_at IS null""",
265265
(*list(fields_to_set.values()), job_id))
266266
_CONN.commit()
267-
if failure_type == SpotStatus.FAILED:
268-
logger.info('Job failed due to user code.')
267+
if failure_type in [SpotStatus.FAILED, SpotStatus.FAILED_SETUP]:
268+
logger.info(
269+
f'Job failed due to user code (status: {failure_type.value}).')
269270
elif failure_type == SpotStatus.FAILED_NO_RESOURCE:
270271
logger.info('Job failed due to failing to find available resources '
271272
'after retries.')

sky/spot/spot_utils.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@
4747
# state, after the job finished. This is a safeguard to avoid the case where
4848
# the spot job status fails to be updated and keep the `sky spot logs` blocking
4949
# for a long time.
50-
_FINAL_SPOT_STATUS_WAIT_TIMEOUT_SECONDS = 10
50+
_FINAL_SPOT_STATUS_WAIT_TIMEOUT_SECONDS = 20
5151

5252

5353
class UserSignal(enum.Enum):
@@ -62,7 +62,7 @@ def get_job_status(backend: 'backends.CloudVmRayBackend',
6262
cluster_name: str) -> Optional['job_lib.JobStatus']:
6363
"""Check the status of the job running on the spot cluster.
6464
65-
It can be None, INIT, RUNNING, SUCCEEDED, FAILED or CANCELLED.
65+
It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_SETUP or CANCELLED.
6666
"""
6767
handle = global_user_state.get_handle_from_cluster_name(cluster_name)
6868
status = None

tests/test_smoke.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1134,7 +1134,7 @@ def test_spot_failed_setup(generic_cloud: str):
11341134
f'sky spot launch -n {name} --cloud {generic_cloud} -y -d tests/test_yamls/failed_setup.yaml',
11351135
'sleep 200',
11361136
# Make sure the job failed quickly.
1137-
f'{_SPOT_QUEUE_WAIT} | grep {name} | head -n1 | grep "FAILED"',
1137+
f'{_SPOT_QUEUE_WAIT} | grep {name} | head -n1 | grep "FAILED_SETUP"',
11381138
],
11391139
f'sky spot cancel -y -n {name}',
11401140
# Increase timeout since sky spot queue -r can be blocked by other spot tests.

0 commit comments

Comments
 (0)