Commit 561c21f

[Spot] Rename spot queue and fix help str (skypilot-org#1294)
* Rename to spot queue with help str fixes
* fix cli doc
* Fix super
* fix alias CLI
* Fix log
* fix
* address comments
* fix log
* format
* address comments
1 parent d45a7f6 commit 561c21f

File tree: 11 files changed, +124 -86 lines changed

docs/source/examples/spot-jobs.rst

Lines changed: 2 additions & 2 deletions
@@ -188,8 +188,8 @@ Here are some commands for managed spot jobs. Check :code:`sky spot --help` for
 .. code-block:: console
 
   # Check the status of the spot jobs
-  $ sky spot status
-  Fetching managed spot job status...
+  $ sky spot queue
+  Fetching managed spot job statuses...
   Managed spot jobs:
   ID  NAME     RESOURCES    SUBMITTED  TOT. DURATION  JOB DURATION  #RECOVERIES  STATUS
   2   roberta  1x [A100:8]  2 hrs ago  2h 47m 18s     2h 36m 18s    0            RUNNING

docs/source/reference/cli.rst

Lines changed: 2 additions & 2 deletions
@@ -86,8 +86,8 @@ Managed Spot Jobs CLI
    :prog: sky spot launch
    :nested: full
 
-.. click:: sky.cli:spot_status
-   :prog: sky spot status
+.. click:: sky.cli:spot_queue
+   :prog: sky spot queue
    :nested: full
 
 .. click:: sky.cli:spot_cancel

sky/__init__.py

Lines changed: 4 additions & 3 deletions
@@ -20,8 +20,8 @@
 from sky.global_user_state import ClusterStatus
 from sky.skylet.job_lib import JobStatus
 from sky.core import (status, start, stop, down, autostop, queue, cancel,
-                      tail_logs, download_logs, job_status, spot_status,
-                      spot_cancel, storage_ls, storage_delete)
+                      tail_logs, download_logs, job_status, spot_queue,
+                      spot_status, spot_cancel, storage_ls, storage_delete)
 
 # Aliases.
 AWS = clouds.AWS
@@ -68,7 +68,8 @@
     'download_logs',
     'job_status',
     # core APIs Spot Job Management
-    'spot_status',
+    'spot_queue',
+    'spot_status',  # Deprecated (alias for spot_queue)
     'spot_cancel',
     # core APIs Storage Management
     'storage_ls',
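
For SDK users, the upshot of this hunk is that both names are importable from the top-level package, with spot_queue as the preferred entry point. A minimal usage sketch (assuming a spot controller is already up; the loop body is illustrative only):

import sky

# Preferred: fetch the managed spot job table under the new name.
jobs = sky.spot_queue(refresh=True)
for job in jobs:
    print(job)

# Deprecated alias: still works, but prints a warning to stderr
# before delegating to spot_queue().
jobs = sky.spot_status(refresh=True)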

sky/backends/backend.py

Lines changed: 3 additions & 3 deletions
@@ -82,9 +82,9 @@ def execute(self, handle: ResourceHandle, task: 'task_lib.Task',
         return self._execute(handle, task, detach_run)
 
     @timeline.event
-    def post_execute(self, handle: ResourceHandle, teardown: bool) -> None:
+    def post_execute(self, handle: ResourceHandle, down: bool) -> None:
         """Post execute(): e.g., print helpful inspection messages."""
-        return self._post_execute(handle, teardown)
+        return self._post_execute(handle, down)
 
     @timeline.event
     def teardown_ephemeral_storage(self, task: 'task_lib.Task') -> None:
@@ -130,7 +130,7 @@ def _execute(self, handle: ResourceHandle, task: 'task_lib.Task',
                  detach_run: bool) -> None:
         raise NotImplementedError
 
-    def _post_execute(self, handle: ResourceHandle, teardown: bool) -> None:
+    def _post_execute(self, handle: ResourceHandle, down: bool) -> None:
         raise NotImplementedError
 
     def _teardown_ephemeral_storage(self, task: 'task_lib.Task') -> None:
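
For backend implementations outside this repo, the rename means _post_execute() now receives whether the cluster is about to be brought down (down) rather than the old teardown flag. A hypothetical subclass sketch (the class name and printed hint are invented for illustration, not part of this commit):

class MyBackend(Backend):

    def _post_execute(self, handle: ResourceHandle, down: bool) -> None:
        if down:
            # The cluster is being torn down right after the job, so any
            # "how to reuse this cluster" hints would be pointless.
            return
        print(f'Cluster {handle.cluster_name} is still up; '
              'run `sky down` to tear it down.')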

sky/backends/cloud_vm_ray_backend.py

Lines changed: 45 additions & 28 deletions
@@ -198,7 +198,7 @@ def add_prologue(self,
             'run_bash_command_with_log = ray.remote(run_bash_command_with_log)',
         ]
         if spot_task is not None:
-            # Add the spot job to spot status table.
+            # Add the spot job to spot queue table.
             resources_str = backend_utils.get_task_resources_str(spot_task)
             self._code += [
                 'from sky.spot import spot_state',
@@ -2092,17 +2092,33 @@ def _exec_code_on_head(
                     self.tail_logs(handle, job_id)
         finally:
             name = handle.cluster_name
-            logger.info(f'{fore.CYAN}Job ID: '
-                        f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
-                        '\nTo cancel the job:\t'
-                        f'{backend_utils.BOLD}sky cancel {name} {job_id}'
-                        f'{backend_utils.RESET_BOLD}'
-                        '\nTo stream the logs:\t'
-                        f'{backend_utils.BOLD}sky logs {name} {job_id}'
-                        f'{backend_utils.RESET_BOLD}'
-                        '\nTo view the job queue:\t'
-                        f'{backend_utils.BOLD}sky queue {name}'
-                        f'{backend_utils.RESET_BOLD}')
+            if name == spot_lib.SPOT_CONTROLLER_NAME:
+                logger.info(f'{fore.CYAN}Spot Job ID: '
+                            f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
+                            '\nTo cancel the job:\t\t'
+                            f'{backend_utils.BOLD}sky spot cancel {job_id}'
+                            f'{backend_utils.RESET_BOLD}'
+                            '\nTo stream the logs:\t\t'
+                            f'{backend_utils.BOLD}sky spot logs {job_id}'
+                            f'{backend_utils.RESET_BOLD}'
+                            f'\nTo stream controller logs:\t'
+                            f'{backend_utils.BOLD}sky logs {name} {job_id}'
+                            f'{backend_utils.RESET_BOLD}'
+                            '\nTo view all spot jobs:\t\t'
+                            f'{backend_utils.BOLD}sky spot queue'
+                            f'{backend_utils.RESET_BOLD}')
+            else:
+                logger.info(f'{fore.CYAN}Job ID: '
+                            f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
+                            '\nTo cancel the job:\t'
+                            f'{backend_utils.BOLD}sky cancel {name} {job_id}'
+                            f'{backend_utils.RESET_BOLD}'
+                            '\nTo stream the logs:\t'
+                            f'{backend_utils.BOLD}sky logs {name} {job_id}'
+                            f'{backend_utils.RESET_BOLD}'
+                            '\nTo view the job queue:\t'
+                            f'{backend_utils.BOLD}sky queue {name}'
+                            f'{backend_utils.RESET_BOLD}')
 
     def _setup_and_create_job_cmd_on_local_head(
             self,
@@ -2190,31 +2206,32 @@ def _execute(
             # Case: task_lib.Task(run, num_nodes=1)
             self._execute_task_one_node(handle, task, job_id, detach_run)
 
-    def _post_execute(self, handle: ResourceHandle, teardown: bool) -> None:
+    def _post_execute(self, handle: ResourceHandle, down: bool) -> None:
         colorama.init()
         fore = colorama.Fore
         style = colorama.Style
         name = handle.cluster_name
+        if name == spot_lib.SPOT_CONTROLLER_NAME or down:
+            return
         stop_str = ('\nTo stop the cluster:'
                     f'\t{backend_utils.BOLD}sky stop {name}'
                     f'{backend_utils.RESET_BOLD}')
         if isinstance(handle.launched_resources.cloud, clouds.Local):
             stop_str = ''
-        if not teardown:
-            logger.info(f'\n{fore.CYAN}Cluster name: '
-                        f'{style.BRIGHT}{name}{style.RESET_ALL}'
-                        '\nTo log into the head VM:\t'
-                        f'{backend_utils.BOLD}ssh {name}'
-                        f'{backend_utils.RESET_BOLD}'
-                        '\nTo submit a job:'
-                        f'\t\t{backend_utils.BOLD}sky exec {name} yaml_file'
-                        f'{backend_utils.RESET_BOLD}'
-                        f'{stop_str}'
-                        '\nTo teardown the cluster:'
-                        f'\t{backend_utils.BOLD}sky down {name}'
-                        f'{backend_utils.RESET_BOLD}')
-            if handle.tpu_delete_script is not None:
-                logger.info('Tip: `sky down` will delete launched TPU(s) too.')
+        logger.info(f'\n{fore.CYAN}Cluster name: '
+                    f'{style.BRIGHT}{name}{style.RESET_ALL}'
+                    '\nTo log into the head VM:\t'
+                    f'{backend_utils.BOLD}ssh {name}'
+                    f'{backend_utils.RESET_BOLD}'
+                    '\nTo submit a job:'
+                    f'\t\t{backend_utils.BOLD}sky exec {name} yaml_file'
+                    f'{backend_utils.RESET_BOLD}'
+                    f'{stop_str}'
+                    '\nTo teardown the cluster:'
+                    f'\t{backend_utils.BOLD}sky down {name}'
+                    f'{backend_utils.RESET_BOLD}')
+        if handle.tpu_delete_script is not None:
+            logger.info('Tip: `sky down` will delete launched TPU(s) too.')
 
     def _teardown_ephemeral_storage(self, task: task_lib.Task) -> None:
         storage_mounts = task.storage_mounts

sky/backends/local_docker_backend.py

Lines changed: 2 additions & 1 deletion
@@ -268,7 +268,8 @@ def _execute(self, handle: ResourceHandle, task: 'task_lib.Task',
 
         self._execute_task_one_node(handle, task)
 
-    def _post_execute(self, handle: ResourceHandle, teardown: bool) -> None:
+    def _post_execute(self, handle: ResourceHandle, down: bool) -> None:
+        del down  # unused
         colorama.init()
         style = colorama.Style
         container = self.containers[handle]

sky/cli.py

Lines changed: 36 additions & 28 deletions
@@ -27,6 +27,7 @@
 listed in "sky --help". Take care to put logically connected commands close to
 each other.
 """
+import copy
 import datetime
 import functools
 import getpass
@@ -895,28 +896,8 @@ def _make_dag_from_entrypoint_with_overrides(
     return dag
 
 
-def _start_cluster(cluster_name: str,
-                   idle_minutes_to_autostop: Optional[int] = None,
-                   retry_until_up: bool = False):
-    handle = global_user_state.get_handle_from_cluster_name(cluster_name)
-    backend = backend_utils.get_backend_from_handle(handle)
-    assert isinstance(backend, backends.CloudVmRayBackend)
-    with sky.Dag():
-        dummy_task = sky.Task().set_resources(handle.launched_resources)
-        dummy_task.num_nodes = handle.launched_nodes
-    handle = backend.provision(dummy_task,
-                               to_provision=handle.launched_resources,
-                               dryrun=False,
-                               stream_logs=True,
-                               cluster_name=cluster_name,
-                               retry_until_up=retry_until_up)
-    if idle_minutes_to_autostop is not None:
-        backend.set_autostop(handle, idle_minutes_to_autostop)
-    return handle
-
-
 class _NaturalOrderGroup(click.Group):
-    """Lists commands in the order they are defined in this script.
+    """Lists commands in the order defined in this script.
 
     Reference: https://github.com/pallets/click/issues/513
     """
@@ -940,6 +921,30 @@ def get_help(self, ctx):
         return super().get_help(ctx)
 
 
+def _with_deprecation_warning(f, original_name, alias_name):
+
+    @functools.wraps(f)
+    def wrapper(self, *args, **kwargs):
+        click.secho(
+            f'WARNING: `{alias_name}` is deprecated and will be removed in a '
+            f'future release. Please use `{original_name}` instead.\n',
+            err=True,
+            fg='yellow')
+        return f(self, *args, **kwargs)
+
+    return wrapper
+
+
+def _add_command_alias_to_group(group, command, name, hidden):
+    """Add a alias of a command to a group."""
+    new_command = copy.deepcopy(command)
+    new_command.hidden = hidden
+    new_command.name = name
+    new_command.invoke = _with_deprecation_warning(new_command.invoke,
+                                                   command.name, name)
+    group.add_command(new_command, name=name)
+
+
 @click.group(cls=_NaturalOrderGroup, context_settings=_CONTEXT_SETTINGS)
 @click.option('--install-shell-completion',
               type=click.Choice(['bash', 'zsh', 'fish', 'auto']),
@@ -1902,10 +1907,10 @@ def _hint_for_down_spot_controller(controller_name: str):
         f'spot controller ({cluster_status.value}). Please be '
         f'aware of the following:{colorama.Style.RESET_ALL}'
         '\n * All logs and status information of the spot '
-        'jobs (output of `sky spot status`) will be lost.')
+        'jobs (output of `sky spot queue`) will be lost.')
     if cluster_status == global_user_state.ClusterStatus.UP:
         try:
-            spot_jobs = core.spot_status(refresh=False)
+            spot_jobs = core.spot_queue(refresh=False)
         except exceptions.ClusterNotUpError:
             # The spot controller cluster status changed during querying
             # the spot jobs, use the latest cluster status, so that the
@@ -2719,7 +2724,7 @@ def spot_launch(
         retry_until_up=retry_until_up)
 
 
-@spot.command('status', cls=_DocumentedCodeCommand)
+@spot.command('queue', cls=_DocumentedCodeCommand)
 @click.option('--all',
               '-a',
               default=False,
@@ -2736,7 +2741,7 @@ def spot_launch(
               )
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
-def spot_status(all: bool, refresh: bool):
+def spot_queue(all: bool, refresh: bool):
     """Show statuses of managed spot jobs.
 
     \b
@@ -2757,17 +2762,17 @@ def spot_status(all: bool, refresh: bool):
 
     If the job failed, either due to user code or spot unavailability, the error
     log can be found with ``sky logs sky-spot-controller-<user_hash> job_id``.
-    Please find your exact spot controller name with ``sky status -a``.
+    Please find your exact spot controller name with ``sky status``.
 
     (Tip) To fetch job statuses every 60 seconds, use ``watch``:
 
     .. code-block:: bash
 
-        watch -n60 sky spot status
+        watch -n60 sky spot queue
    """
    click.secho('Fetching managed spot job statuses...', fg='yellow')
    try:
-        job_table = core.spot_status(refresh=refresh)
+        job_table = core.spot_queue(refresh=refresh)
    except exceptions.ClusterNotUpError:
        cache = spot_lib.load_job_table_cache()
        if cache is not None:
@@ -2787,6 +2792,9 @@ def spot_status(all: bool, refresh: bool):
     click.echo(f'Managed spot jobs:\n{job_table}')
 
 
+_add_command_alias_to_group(spot, spot_queue, 'status', hidden=True)
+
+
 @spot.command('cancel', cls=_DocumentedCodeCommand)
 @click.option('--name',
               '-n',
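
The alias machinery above is generic click plumbing: deep-copy a registered command, mark the copy hidden, rename it, and wrap its invoke so it warns before delegating. A self-contained sketch of the same pattern on a toy group (the cli/queue command names below are invented for illustration and are not SkyPilot code):

import copy
import functools

import click


def _with_deprecation_warning(f, original_name, alias_name):

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        click.secho(
            f'WARNING: `{alias_name}` is deprecated. '
            f'Please use `{original_name}` instead.\n',
            err=True,
            fg='yellow')
        return f(self, *args, **kwargs)

    return wrapper


@click.group()
def cli():
    pass


@cli.command('queue')
def queue():
    click.echo('showing the queue')


# Register a hidden `status` alias that warns, then delegates to `queue`.
alias = copy.deepcopy(queue)
alias.hidden = True
alias.name = 'status'
alias.invoke = _with_deprecation_warning(alias.invoke, queue.name, 'status')
cli.add_command(alias, name='status')

if __name__ == '__main__':
    cli()

Invoking the toy program with `status` prints the deprecation warning to stderr and then behaves exactly like `queue`, while `--help` lists only `queue` because the alias is hidden.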

sky/core.py

Lines changed: 14 additions & 3 deletions
@@ -1,6 +1,7 @@
 """SDK functions for cluster/job management."""
 import colorama
 import getpass
+import sys
 from typing import Any, Dict, List, Optional, Tuple
 
 from sky import dag
@@ -87,7 +88,7 @@ def _start(
             f'Starting cluster {cluster_name!r} with backend {backend.NAME} '
             'is not supported.')
 
-    # NOTE: if spot_status() calls _start() and hits here, that entrypoint
+    # NOTE: if spot_queue() calls _start() and hits here, that entrypoint
     # would have a cluster name (the controller) filled in.
    usage_lib.record_cluster_name_for_current_operation(cluster_name)
 
@@ -553,10 +554,20 @@ def _is_spot_controller_up(
 
 @usage_lib.entrypoint
 def spot_status(refresh: bool) -> List[Dict[str, Any]]:
+    """[Deprecated] (alias of spot_queue) Get statuses of managed spot jobs."""
+    print(
+        f'{colorama.Fore.YELLOW}WARNING: `spot_status()` is deprecated. '
+        f'Instead, use: spot_queue(){colorama.Style.RESET_ALL}',
+        file=sys.stderr)
+    return spot_queue(refresh=refresh)
+
+
+@usage_lib.entrypoint
+def spot_queue(refresh: bool) -> List[Dict[str, Any]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Get statuses of managed spot jobs.
 
-    Please refer to the sky.cli.spot_status for the document.
+    Please refer to the sky.cli.spot_queue for the documentation.
 
     Returns:
         [
@@ -579,7 +590,7 @@ def spot_status(refresh: bool) -> List[Dict[str, Any]]:
 
     stop_msg = ''
     if not refresh:
-        stop_msg = 'To view the latest job table: sky spot status --refresh'
+        stop_msg = 'To view the latest job table: sky spot queue --refresh'
     controller_status, handle = _is_spot_controller_up(stop_msg)
 
     if controller_status is None:

sky/spot/spot_utils.py

Lines changed: 1 addition & 1 deletion
@@ -259,7 +259,7 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
                 f'in {JOB_STATUS_CHECK_GAP_SECONDS} seconds.')
             # If the tailing fails, it is likely that the cluster fails, so we wait
             # a while to make sure the spot state is updated by the controller, and
-            # check the spot status again.
+            # check the spot queue again.
             time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
             spot_status = spot_state.get_status(job_id)
         else:
