
Commit 4240be7

Retry for azure when first launched (skypilot-org#985)
* retry for azure head timeout
* check more matching
* Add retries
* reformat
* increase retries for node ip query
* format
* Address comments
* Fix readme for env
* Address comments
* format
1 parent: 3c31825

3 files changed: +59 -17 lines changed


README.md (+2 -1)

@@ -32,7 +32,8 @@ IMPORTANT: Please `export SKY_DEV=1` before running the sky commands in the term
 
 
 ### Environment Variable Options
-- `export SKY_DEV=1` to show debugging logs (logging.DEBUG) and send the logs to dev space.
+- `export SKY_DEV=1` to send the logs to dev space.
+- `export SKY_DEBUG=1` to show debugging logs (logging.DEBUG).
 - `export SKY_DISABLE_USAGE_COLLECTION=1` to disable usage logging.
 - `export SKY_MINIMIZE_LOGGING=1` to minimize the sky outputs for demo purpose.
 
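The diff only updates the documentation for these flags; how they are consumed is not shown here. As a purely hypothetical sketch (names taken from the README, not from the SkyPilot source), an env-var toggle like `SKY_DEBUG=1` could be mapped to a logger level as follows:

```python
import logging
import os


def make_logger(name: str = 'sky') -> logging.Logger:
    """Hypothetical sketch: wire the README's env vars to logging behavior."""
    logger = logging.getLogger(name)
    # SKY_DEBUG=1 -> show debugging logs (logging.DEBUG), per the README.
    debug = os.environ.get('SKY_DEBUG') == '1'
    logger.setLevel(logging.DEBUG if debug else logging.INFO)
    # SKY_MINIMIZE_LOGGING=1 -> minimal output for demos (assumption: modeled
    # here as raising the threshold to WARNING).
    if os.environ.get('SKY_MINIMIZE_LOGGING') == '1':
        logger.setLevel(logging.WARNING)
    return logger
```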

sky/backends/backend_utils.py (+2 -1)

@@ -925,6 +925,7 @@ def generate_cluster_name():
 
 def query_head_ip_with_retries(cluster_yaml: str, max_attempts: int = 1) -> str:
     """Returns the ip of the head node from yaml file."""
+    backoff = Backoff(initial_backoff=5, max_backoff_factor=5)
     for i in range(max_attempts):
         try:
             out = subprocess_utils.run(
@@ -940,7 +941,7 @@ def query_head_ip_with_retries(cluster_yaml: str, max_attempts: int = 1) -> str:
                 raise RuntimeError('Failed to get head ip') from e
             # Retry if the cluster is not up yet.
             logger.debug('Retrying to get head ip.')
-            time.sleep(5)
+            time.sleep(backoff.current_backoff())
     return head_ip
 
 
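The new code calls a `Backoff` helper that is not defined in this diff. Below is a minimal sketch of an exponential-backoff class exposing the same constructor arguments and `current_backoff()` method seen above; the repo's real helper may differ, for example in its jitter or capping behavior:

```python
import random


class Backoff:
    """Illustrative exponential backoff matching the call site above."""

    def __init__(self, initial_backoff: float = 5, max_backoff_factor: int = 5):
        self._initial = initial_backoff
        self._max_factor = max_backoff_factor
        self._attempt = 0

    def current_backoff(self) -> float:
        # Wait initial * 2**attempt seconds, capped at initial * 2**max_factor,
        # with light jitter so concurrent retries do not line up.
        factor = min(self._attempt, self._max_factor)
        self._attempt += 1
        return self._initial * (2 ** factor) * random.uniform(0.9, 1.1)
```

Under this sketch's semantics, `Backoff(initial_backoff=5, max_backoff_factor=5)` yields waits of roughly 5 s, 10 s, 20 s, and so on, capped near 160 s, replacing the previous fixed `time.sleep(5)` between attempts.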

sky/backends/cloud_vm_ray_backend.py (+55 -15)

@@ -68,7 +68,7 @@
 _RETRY_UNTIL_UP_INIT_GAP_SECONDS = 60
 
 # The maximum retry count for fetching head IP address.
-_HEAD_IP_MAX_ATTEMPTS = 3
+_HEAD_IP_MAX_ATTEMPTS = 5
 
 _TEARDOWN_FAILURE_MESSAGE = (
     f'{colorama.Fore.RED}Failed to terminate '
@@ -89,6 +89,8 @@
 
 _TPU_NOT_FOUND_ERROR = 'ERROR: (gcloud.compute.tpus.delete) NOT_FOUND'
 
+_MAX_RAY_UP_RETRY = 5
+
 
 def _get_cluster_config_template(cloud):
     cloud_to_template = {
@@ -1058,18 +1060,51 @@ def ray_up():
         # Edge case: /tmp/ray does not exist, so autoscaler can't create/store
         # cluster lock and cluster state.
         os.makedirs('/tmp/ray', exist_ok=True)
-        returncode, stdout, stderr = ray_up()
-        if (returncode != 0 and 'Processing file mounts' in stdout and
-                'Running setup commands' not in stdout):
-            # Retry ray up if it failed due to file mounts, because it is
-            # probably due to too many ssh connections issue and can be fixed
-            # by retrying.
-            # This is required when using custom image for GCP.
-            logger.info(
-                'Retrying sky runtime setup due to ssh connection issue.')
-            returncode, stdout, stderr = ray_up()
 
-        logger.debug(f'Ray up takes {time.time() - start} seconds.')
+        # Launch the cluster with ray up
+
+        # Retry if any of the following happens:
+        # 1. Failed due to timeout when fetching head node for Azure.
+        # 2. Failed due to file mounts, because it probably has too
+        #    many ssh connections and can be fixed by retrying.
+        #    This is required when using custom image for GCP.
+        def need_ray_up(
+                ray_up_return_value: Optional[Tuple[int, str, str]]) -> bool:
+
+            # Indicates the first ray up.
+            if ray_up_return_value is None:
+                return True
+
+            returncode, stdout, stderr = ray_up_return_value
+            if returncode == 0:
+                return False
+
+            if ('Head node fetch timed out. Failed to create head node.'
+                    in stderr and isinstance(to_provision_cloud, clouds.Azure)):
+                logger.info(
+                    'Retrying head node provisioning due to head fetching '
+                    'timeout.')
+                return True
+            if ('Processing file mounts' in stdout and
+                    'Running setup commands' not in stdout and
+                    'Failed to setup head node.' in stderr):
+                logger.info(
+                    'Retrying sky runtime setup due to ssh connection issue.')
+                return True
+            return False
+
+        retry_cnt = 0
+        ray_up_return_value = None
+        while (retry_cnt < _MAX_RAY_UP_RETRY and
+               need_ray_up(ray_up_return_value)):
+            retry_cnt += 1
+            ray_up_return_value = ray_up()
+
+        assert ray_up_return_value is not None
+        returncode, stdout, stderr = ray_up_return_value
+
+        logger.debug(f'Ray up takes {time.time() - start} seconds with '
+                     f'{retry_cnt} retries.')
 
         # Only 1 node or head node provisioning failure.
         if num_nodes == 1 and returncode == 0:
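In the new flow, `ray_up()` is no longer called once with a single ad-hoc retry; instead a bounded loop keeps re-running it while `need_ray_up` inspects the previous attempt's output. The following standalone sketch reproduces that retry-while-predicate shape with hypothetical stand-ins (`flaky_command`, `should_retry`) so the control flow can be run in isolation:

```python
from typing import Optional, Tuple

_MAX_RETRY = 5


def flaky_command(attempt: int) -> Tuple[int, str, str]:
    # Hypothetical stand-in for ray_up(): fail twice, then succeed.
    if attempt < 2:
        return 1, '', 'Head node fetch timed out. Failed to create head node.'
    return 0, 'ok', ''


def should_retry(result: Optional[Tuple[int, str, str]]) -> bool:
    # Stand-in for need_ray_up(): None means "not run yet", so run once;
    # afterwards retry only on a recognized transient error message.
    if result is None:
        return True
    returncode, _, stderr = result
    if returncode == 0:
        return False
    return 'Head node fetch timed out' in stderr


retry_cnt = 0
result = None
while retry_cnt < _MAX_RETRY and should_retry(result):
    result = flaky_command(retry_cnt)
    retry_cnt += 1

assert result is not None
print(f'finished after {retry_cnt} attempt(s): returncode={result[0]}')
```

Seeding the loop with `None` lets one predicate express both "not run yet" and "retry on a recognized transient error", which is the design choice the diff makes with `ray_up_return_value = None`.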
@@ -1787,9 +1822,14 @@ def _exec_code_on_head(
             f'--address=127.0.0.1:8265 --job-id {ray_job_id} --no-wait '
             f'-- "{executable} -u {script_path} > {remote_log_path} 2>&1"')
 
-        returncode = self.run_on_head(handle, job_submit_cmd, stream_logs=False)
-        subprocess_utils.handle_returncode(returncode, job_submit_cmd,
-                                           f'Failed to submit job {job_id}.')
+        returncode, stdout, stderr = self.run_on_head(handle,
+                                                      job_submit_cmd,
+                                                      stream_logs=False,
+                                                      require_outputs=True)
+        subprocess_utils.handle_returncode(returncode,
+                                           job_submit_cmd,
+                                           f'Failed to submit job {job_id}.',
+                                           stderr=stdout + stderr)
 
         logger.info('Job submitted with Job ID: '
                     f'{style.BRIGHT}{job_id}{style.RESET_ALL}')
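The job-submission change captures the command's output (`require_outputs=True`) and forwards `stdout + stderr` to `subprocess_utils.handle_returncode`, so a failed submission reports the remote error text. The helper itself is outside this diff; below is a hypothetical sketch of a function with the signature used at this call site, assuming it raises on a non-zero return code and appends any provided `stderr`:

```python
from typing import Optional


class CommandError(RuntimeError):
    """Illustrative error type; the real helper may raise something else."""


def handle_returncode(returncode: int,
                      command: str,
                      error_msg: str,
                      stderr: Optional[str] = None) -> None:
    # Sketch only: no-op on success, otherwise raise with the captured output
    # so callers see why the remote command (e.g. job submission) failed.
    if returncode == 0:
        return
    message = f'{error_msg}\nCommand: {command}\nReturn code: {returncode}'
    if stderr:
        message += f'\n{stderr}'
    raise CommandError(message)
```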
