Skip to content

Commit a9c194a

Browse files
authored
[Autostop/UX] Better UX when the cluster is partially stopped due to autostopping (skypilot-org#1637)
* Fix status logging warning * Not reset autostop column when the cluster is autostopping * Add a hint for INIT mode cluster which is autostopping * set the indicator * format * print out payload * fix 'to_down' * Not error out for is_autostopping * format * address comments * address comments
1 parent e535f29 commit a9c194a

File tree

5 files changed

+77
-14
lines changed

5 files changed

+77
-14
lines changed

sky/backends/backend_utils.py

+24-14
Original file line numberDiff line numberDiff line change
@@ -1950,24 +1950,34 @@ def _update_cluster_status_no_lock(
19501950
# abnormal.
19511951
#
19521952
# An abnormal cluster will transition to INIT and have any autostop setting
1953-
# reset.
1953+
# reset (unless it's autostopping/autodowning).
19541954
is_abnormal = ((0 < len(node_statuses) < handle.launched_nodes) or
19551955
any(status != global_user_state.ClusterStatus.STOPPED
19561956
for status in node_statuses))
19571957
if is_abnormal:
1958-
# Reset the autostop to avoid false information with best effort.
1959-
# Side effect: if the status is refreshed during autostopping, the
1960-
# autostop field in the local cache will be reset, even though the
1961-
# cluster will still be correctly stopped.
1962-
try:
1963-
backend = backends.CloudVmRayBackend()
1964-
backend.set_autostop(handle, -1, stream_logs=False)
1965-
except (Exception, SystemExit) as e: # pylint: disable=broad-except
1966-
logger.debug(f'Failed to reset autostop. Due to '
1967-
f'{common_utils.format_exception(e)}')
1968-
global_user_state.set_cluster_autostop_value(handle.cluster_name,
1969-
-1,
1970-
to_down=False)
1958+
backend = get_backend_from_handle(handle)
1959+
if isinstance(backend,
1960+
backends.CloudVmRayBackend) and record['autostop'] >= 0:
1961+
if not backend.is_definitely_autostopping(handle,
1962+
stream_logs=False):
1963+
# Reset the autostop as the cluster is abnormal, and may
1964+
# not correctly autostop. Resetting the autostop will let
1965+
# the user know that the autostop may not happen to avoid
1966+
# leakages from the assumption that the cluster will autostop.
1967+
try:
1968+
backend.set_autostop(handle, -1, stream_logs=False)
1969+
except (Exception, SystemExit) as e: # pylint: disable=broad-except
1970+
logger.debug(f'Failed to reset autostop. Due to '
1971+
f'{common_utils.format_exception(e)}')
1972+
global_user_state.set_cluster_autostop_value(
1973+
handle.cluster_name, -1, to_down=False)
1974+
else:
1975+
ux_utils.console_newline()
1976+
operation_str = 'autodowning' if record[
1977+
'to_down'] else 'autostopping'
1978+
logger.info(
1979+
f'Cluster {cluster_name!r} is {operation_str}. Setting to '
1980+
'INIT status; try refresh again in a while.')
19711981

19721982
# If the user starts part of a STOPPED cluster, we still need a status
19731983
# to represent the abnormal status. For spot cluster, it can also

sky/backends/cloud_vm_ray_backend.py

+21
Original file line numberDiff line numberDiff line change
@@ -3063,6 +3063,27 @@ def set_autostop(self,
30633063
global_user_state.set_cluster_autostop_value(
30643064
handle.cluster_name, idle_minutes_to_autostop, down)
30653065

3066+
def is_definitely_autostopping(self,
3067+
handle: ResourceHandle,
3068+
stream_logs: bool = True) -> bool:
3069+
"""Check if the cluster is autostopping.
3070+
3071+
Returns:
3072+
True if the cluster is definitely autostopping. It is possible
3073+
that the cluster is still autostopping when False is returned,
3074+
due to errors like transient network issues.
3075+
"""
3076+
code = autostop_lib.AutostopCodeGen.is_autostopping()
3077+
returncode, stdout, stderr = self.run_on_head(handle,
3078+
code,
3079+
require_outputs=True,
3080+
stream_logs=stream_logs)
3081+
3082+
if returncode == 0:
3083+
return common_utils.decode_payload(stdout)
3084+
logger.debug(f'Failed to check if cluster is autostopping: {stderr}')
3085+
return False
3086+
30663087
# TODO(zhwu): Refactor this to a CommandRunner class, so different backends
30673088
# can support its own command runner.
30683089
@timeline.event

sky/skylet/autostop_lib.py

+30
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from sky import sky_logging
99
from sky.skylet import configs
10+
from sky.utils import common_utils
1011

1112
logger = sky_logging.init_logger(__name__)
1213

@@ -16,6 +17,10 @@
1617
# user-issued commands (this module) and the Skylet process running the
1718
# AutostopEvent need to access that state.
1819
_AUTOSTOP_LAST_ACTIVE_TIME = 'autostop_last_active_time'
20+
# AutostopEvent sets this to the boot time when the autostop of the cluster
21+
# starts. This is used for checking whether the cluster is in the process
22+
# of autostopping for the current machine.
23+
_AUTOSTOP_INDICATOR = 'autostop_indicator'
1924

2025

2126
class AutostopConfig:
@@ -57,6 +62,26 @@ def set_autostop(idle_minutes: int, backend: Optional[str], down: bool) -> None:
5762
set_last_active_time_to_now()
5863

5964

65+
def set_autostopping_started() -> None:
66+
"""Sets the boot time of the machine when autostop starts.
67+
68+
This function should be called when the cluster starts to autostop,
69+
and the boot time of the machine will be stored in the configs database
70+
as an autostop indicator, which is used for checking whether the cluster
71+
is in the process of autostopping. The indicator is valid only when the
72+
machine has the same boot time as the one stored in the indicator.
73+
"""
74+
logger.debug('Setting is_autostopping.')
75+
configs.set_config(_AUTOSTOP_INDICATOR, str(psutil.boot_time()))
76+
77+
78+
def get_is_autostopping_payload() -> str:
79+
"""Returns whether the cluster is in the process of autostopping."""
80+
result = configs.get_config(_AUTOSTOP_INDICATOR)
81+
is_autostopping = (result == str(psutil.boot_time()))
82+
return common_utils.encode_payload(is_autostopping)
83+
84+
6085
def get_last_active_time() -> float:
6186
"""Returns the last active time, or -1 if none has been set."""
6287
result = configs.get_config(_AUTOSTOP_LAST_ACTIVE_TIME)
@@ -88,6 +113,11 @@ def set_autostop(cls, idle_minutes: int, backend: str, down: bool) -> str:
88113
]
89114
return cls._build(code)
90115

116+
@classmethod
117+
def is_autostopping(cls) -> str:
118+
code = ['print(autostop_lib.get_is_autostopping_payload())']
119+
return cls._build(code)
120+
91121
@classmethod
92122
def _build(cls, code: List[str]) -> str:
93123
code = cls._PREFIX + code

sky/skylet/configs.py

+1
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ def get_config(key: str) -> Optional[str]:
6060
rows = cursor.execute('SELECT value FROM config WHERE key = ?', (key,))
6161
for (value,) in rows:
6262
return value
63+
return None
6364

6465

6566
@ensure_table

sky/skylet/events.py

+1
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ def _run(self):
128128
def _stop_cluster(self, autostop_config):
129129
if (autostop_config.backend ==
130130
cloud_vm_ray_backend.CloudVmRayBackend.NAME):
131+
autostop_lib.set_autostopping_started()
131132
self._replace_yaml_for_stopping(self._ray_yaml_path,
132133
autostop_config.down)
133134

0 commit comments

Comments
 (0)