@@ -1671,19 +1671,22 @@ def status(all: bool, refresh: bool, show_spot_jobs: bool, clusters: List[str]):
@usage_lib.entrypoint
def cost_report(all: bool):  # pylint: disable=redefined-builtin
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
- """Show cost reports for each cluster.
+ """Show estimated costs for launched clusters.
- The following fields for each cluster are recorded: cluster name,
- resources, launched time, duration that cluster was up,
- and total cost.
+ For each cluster, this shows: cluster name, resources, launched time,
+ duration that cluster was up, and total estimated cost.
The estimated cost column indicates the price for the cluster based on the
- type of resources being used and the duration of use up until the call
- to status. This means if the cluster is UP, successive calls to report
- will show increasing price. The estimated cost is calculated based on
- the local cache of the cluster status, and may not be accurate for
- the cluster with autostop/use_spot set or terminated/stopped
- on the cloud console.
+ type of resources being used and the duration of use up until now. This
+ means if the cluster is UP, successive calls to cost-report will show
+ increasing price.
+
+ The estimated cost is calculated based on the local cache of the cluster
+ status, and may not be accurate for:
+
+ - clusters with autostop/use_spot set; or
+
+ - clusters that were terminated/stopped on the cloud console.
"""
cluster_records = core.cost_report()
nonreserved_cluster_records = []
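A minimal usage sketch of the command documented above; the command name follows the ``cost_report`` entrypoint registered in this file, and no flags beyond the default invocation are assumed:

.. code-block:: bash

    # Show estimated costs for launched clusters. While a cluster is UP,
    # repeated calls report an increasing estimate.
    sky cost-report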
@@ -1778,8 +1781,8 @@ def queue(clusters: Sequence[str], skip_finished: bool, all_users: bool):
'-s',
is_flag=True,
default=False,
- help='Sync down the logs of the job (this is useful for distributed jobs to '
- 'download a separate log for each job from all the workers).')
+ help='Sync down the logs of a job to the local machine. For a distributed '
+ 'job, a separate log file from each worker will be downloaded.')
@click.option(
'--status',
is_flag=True,
@@ -1790,8 +1793,9 @@ def queue(clusters: Sequence[str], skip_finished: bool, all_users: bool):
'--follow/--no-follow',
is_flag=True,
default=True,
- help=('Follow the logs of the job. [default: --follow] '
- 'If --no-follow is specified, print the log so far and exit.'))
+ help=('Follow the logs of a job. '
+ 'If --no-follow is specified, print the log so far and exit. '
+ '[default: --follow]'))
@click.argument('cluster',
required=True,
type=str,
@@ -1813,12 +1817,14 @@ def logs(
1. If no flags are provided, tail the logs of the job_id specified. At most
one job_id can be provided.
- 2. If --status is specified, print the status of the job and exit with
- returncode 0 if the job is succeeded, or 1 otherwise. At most one job_id can
+
+ 2. If ``--status`` is specified, print the status of the job and exit with
+ returncode 0 if the job succeeded, or 1 otherwise. At most one job_id can
be specified.
- 3. If --sync-down is specified, the logs of the job will be downloaded from
- the cluster and saved to the local machine under `~/sky_logs`. Mulitple
- job_ids can be specified.
+
+ 3. If ``--sync-down`` is specified, the logs of the job will be downloaded
+ from the cluster and saved to the local machine under
+ ``~/sky_logs``. Multiple job_ids can be specified.
"""
if sync_down and status:
raise click.UsageError(
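A brief usage sketch of the three modes described above, assuming a hypothetical cluster named ``mycluster`` and job IDs 1 and 2:

.. code-block:: bash

    # 1. Tail (and follow) the logs of job 1.
    sky logs mycluster 1

    # Print the log collected so far and exit.
    sky logs --no-follow mycluster 1

    # 2. Exit with returncode 0 if job 1 succeeded, 1 otherwise.
    sky logs --status mycluster 1

    # 3. Download the logs of jobs 1 and 2 to ~/sky_logs locally.
    sky logs --sync-down mycluster 1 2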
@@ -2881,11 +2887,14 @@ def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
@cli.command()
@usage_lib.entrypoint
def check():
- """Determine the set of clouds available to use.
+ """Check which clouds are available to use.
+
+ This checks access credentials for all clouds supported by SkyPilot. If a
+ cloud is detected to be inaccessible, the reason and correction steps will
+ be shown.
- This checks access credentials for AWS, Azure and GCP; on failure, it shows
- the reason and suggests correction steps. Tasks will only run on clouds
- that you have access to.
+ The enabled clouds are cached and form the "search space" to be considered
+ for each task.
"""
sky_check.check()
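Usage is a single command with no arguments, per the signature above:

.. code-block:: bash

    # Verify cloud credentials; the enabled clouds are cached and used as
    # the search space for subsequent tasks.
    sky check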
@@ -2916,15 +2925,15 @@ def show_gpus(
all: bool,  # pylint: disable=redefined-builtin
cloud: Optional[str],
region: Optional[str]):
- """Show supported GPU/TPU/accelerators.
+ """Show supported GPU/TPU/accelerators and their prices.
The names and counts shown can be set in the ``accelerators`` field in task
YAMLs, or in the ``--gpus`` flag in CLI commands. For example, if this
table shows 8x V100s are supported, then the string ``V100:8`` will be
accepted by the above.
- To show the detailed information of a GPU/TPU type (which clouds offer it,
- the quantity in each VM type, etc.), use ``sky show-gpus <gpu>``.
+ To show the detailed information of a GPU/TPU type (its price, which clouds
+ offer it, the quantity in each VM type, etc.), use ``sky show-gpus <gpu>``.
To show all accelerators, including less common ones and their detailed
information, use ``sky show-gpus --all``.
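A short usage sketch of the queries described above; the ``--cloud`` filter is inferred from the ``cloud`` parameter in the signature and is an assumption of this sketch:

.. code-block:: bash

    # List common GPU/TPU types and the counts supported.
    sky show-gpus

    # Detailed information (price, clouds, per-VM quantity) for one type.
    sky show-gpus V100

    # Restrict the listing to a single cloud (assumed --cloud flag).
    sky show-gpus V100 --cloud aws

    # Include less common accelerators and their details.
    sky show-gpus --all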
@@ -3058,7 +3067,7 @@ def _output():
@cli.group(cls=_NaturalOrderGroup)
def storage():
- """Storage related commands."""
+ """SkyPilot Storage CLI."""
pass
@@ -3115,7 +3124,7 @@ def storage_delete(names: Sequence[str], all: bool): # pylint: disable=redefine
@cli.group(cls=_NaturalOrderGroup)
def admin():
- """Sky administrator commands for local clusters."""
+ """SkyPilot On-prem administrator CLI."""
pass
@@ -3191,7 +3200,7 @@ def admin_deploy(clusterspec_yaml: str):
@cli.group(cls=_NaturalOrderGroup)
def spot():
- """Commands for managed spot jobs."""
+ """Managed Spot commands (spot instances with auto-recovery)."""
pass
@@ -3354,32 +3363,56 @@ def spot_launch(
def spot_queue(all: bool, refresh: bool, skip_finished: bool):
"""Show statuses of managed spot jobs.
- \b
Each spot job can have one of the following statuses:
- \b
- - SUBMITTED: The job is submitted to the spot controller.
- - STARTING: The job is starting (starting a spot cluster).
- - RUNNING: The job is running.
- - RECOVERING: The spot cluster is recovering from a preemption.
- - SUCCEEDED: The job succeeded.
- - FAILED: The job failed due to an error from the job itself.
- - FAILED_NO_RESOURCES: The job failed due to resources being unavailable
- after a maximum number of retry attempts.
- - FAILED_CONTROLLER: The job failed due to an unexpected error in the spot
- controller.
- - CANCELLING: The job was requested to be cancelled by the user, and the
- cancellation is in progress.
- - CANCELLED: The job was cancelled by the user.
-
- If the job failed, either due to user code or spot unavailability, the error
- log can be found with ``sky spot logs --controller job_id``.
+ - ``PENDING``: Job is waiting for a free slot on the spot controller to be
+ accepted.
+
+ - ``SUBMITTED``: Job is submitted to and accepted by the spot controller.
+
+ - ``STARTING``: Job is starting (provisioning a spot cluster).
+
+ - ``RUNNING``: Job is running.
+
+ - ``RECOVERING``: The spot cluster is recovering from a preemption.
+
+ - ``SUCCEEDED``: Job succeeded.
+
+ - ``CANCELLING``: Job was requested to be cancelled by the user, and the
+ cancellation is in progress.
+
+ - ``CANCELLED``: Job was cancelled by the user.
+
+ - ``FAILED``: Job failed due to an error from the job itself.
+
+ - ``FAILED_SETUP``: Job failed due to an error from the job's ``setup``
+ commands.
+
+ - ``FAILED_PRECHECKS``: Job failed due to an error from our prechecks, such
+ as an invalid cluster name or an infeasible resource being specified.
+
+ - ``FAILED_NO_RESOURCE``: Job failed due to resources being unavailable
+ after a maximum number of retries.
+
+ - ``FAILED_CONTROLLER``: Job failed due to an unexpected error in the spot
+ controller.
+
+ If the job failed, either due to user code or spot unavailability, the
+ error log can be found with ``sky spot logs --controller``, e.g.:
+
+ .. code-block:: bash
+
+ sky spot logs --controller job_id
+
+ This also shows the logs for provisioning and any preemption and recovery
+ attempts.
(Tip) To fetch job statuses every 60 seconds, use ``watch``:
.. code-block:: bash
watch -n60 sky spot queue
+
"""
click.secho('Fetching managed spot job statuses...', fg='yellow')
with log_utils.safe_rich_status('[cyan]Checking spot jobs[/]'):
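A usage sketch tying together the commands referenced in this docstring; the job ID is a placeholder, and the ``--refresh``/``--skip-finished`` flags are assumed from the ``refresh`` and ``skip_finished`` parameters above:

.. code-block:: bash

    # Show statuses of managed spot jobs.
    sky spot queue

    # Query the controller for the latest statuses and hide finished jobs
    # (assumed --refresh / --skip-finished flags).
    sky spot queue --refresh --skip-finished

    # For a failed job, inspect the controller log, which also covers
    # provisioning and any preemption/recovery attempts.
    sky spot logs --controller 3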
@@ -3422,20 +3455,19 @@ def spot_queue(all: bool, refresh: bool, skip_finished: bool):
def spot_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
"""Cancel managed spot jobs.
- You can provide either a job name or a list of job ids to be cancelled.
+ You can provide either a job name or a list of job IDs to be cancelled.
They are exclusive options.
+
Examples:
.. code-block:: bash
- # Cancel managed spot job with name 'my-job'
- $ sky spot cancel -n my-job
-
- # Cancel managed spot jobs with IDs 1, 2, 3
- $ sky spot cancel 1 2 3
-
+ # Cancel managed spot job with name 'my-job'
+ $ sky spot cancel -n my-job
+ \b
+ # Cancel managed spot jobs with IDs 1, 2, 3
+ $ sky spot cancel 1 2 3
"""
-
_, handle = spot_lib.is_spot_controller_up(
'All managed spot jobs should have finished.')
if handle is None:
@@ -3544,7 +3576,7 @@ def _get_candidate_configs(yaml_path: str) -> Optional[List[Dict[str, str]]]:
@cli.group(cls=_NaturalOrderGroup)
def bench():
- """Sky Benchmark related commands."""
+ """SkyPilot Benchmark CLI."""
pass