|
68 | 68 | _RETRY_UNTIL_UP_INIT_GAP_SECONDS = 60
|
69 | 69 |
|
70 | 70 | # The maximum retry count for fetching head IP address.
|
71 |
| -_HEAD_IP_MAX_ATTEMPTS = 3 |
| 71 | +_HEAD_IP_MAX_ATTEMPTS = 5 |
72 | 72 |
|
73 | 73 | _TEARDOWN_FAILURE_MESSAGE = (
|
74 | 74 | f'{colorama.Fore.RED}Failed to terminate '
|
|
89 | 89 |
|
90 | 90 | _TPU_NOT_FOUND_ERROR = 'ERROR: (gcloud.compute.tpus.delete) NOT_FOUND'
|
91 | 91 |
|
| 92 | +_MAX_RAY_UP_RETRY = 5 |
| 93 | + |
92 | 94 |
|
93 | 95 | def _get_cluster_config_template(cloud):
|
94 | 96 | cloud_to_template = {
|
@@ -1058,18 +1060,51 @@ def ray_up():
|
1058 | 1060 | # Edge case: /tmp/ray does not exist, so autoscaler can't create/store
|
1059 | 1061 | # cluster lock and cluster state.
|
1060 | 1062 | os.makedirs('/tmp/ray', exist_ok=True)
|
1061 |
| - returncode, stdout, stderr = ray_up() |
1062 |
| - if (returncode != 0 and 'Processing file mounts' in stdout and |
1063 |
| - 'Running setup commands' not in stdout): |
1064 |
| - # Retry ray up if it failed due to file mounts, because it is |
1065 |
| - # probably due to too many ssh connections issue and can be fixed |
1066 |
| - # by retrying. |
1067 |
| - # This is required when using custom image for GCP. |
1068 |
| - logger.info( |
1069 |
| - 'Retrying sky runtime setup due to ssh connection issue.') |
1070 |
| - returncode, stdout, stderr = ray_up() |
1071 | 1063 |
|
1072 |
| - logger.debug(f'Ray up takes {time.time() - start} seconds.') |
| 1064 | + # Launch the cluster with ray up |
| 1065 | + |
| 1066 | +        # Retry if any of the following happens: |
| 1067 | + # 1. Failed due to timeout when fetching head node for Azure. |
| 1068 | +        #   2. Failed due to file mounts, because it probably has too |
| 1069 | + # many ssh connections and can be fixed by retrying. |
| 1070 | + # This is required when using custom image for GCP. |
| 1071 | + def need_ray_up( |
| 1072 | + ray_up_return_value: Optional[Tuple[int, str, str]]) -> bool: |
| 1073 | + |
| 1074 | + # Indicates the first ray up. |
| 1075 | + if ray_up_return_value is None: |
| 1076 | + return True |
| 1077 | + |
| 1078 | + returncode, stdout, stderr = ray_up_return_value |
| 1079 | + if returncode == 0: |
| 1080 | + return False |
| 1081 | + |
| 1082 | + if ('Head node fetch timed out. Failed to create head node.' |
| 1083 | + in stderr and isinstance(to_provision_cloud, clouds.Azure)): |
| 1084 | + logger.info( |
| 1085 | + 'Retrying head node provisioning due to head fetching ' |
| 1086 | + 'timeout.') |
| 1087 | + return True |
| 1088 | + if ('Processing file mounts' in stdout and |
| 1089 | + 'Running setup commands' not in stdout and |
| 1090 | + 'Failed to setup head node.' in stderr): |
| 1091 | + logger.info( |
| 1092 | + 'Retrying sky runtime setup due to ssh connection issue.') |
| 1093 | + return True |
| 1094 | + return False |
| 1095 | + |
| 1096 | + retry_cnt = 0 |
| 1097 | + ray_up_return_value = None |
| 1098 | + while (retry_cnt < _MAX_RAY_UP_RETRY and |
| 1099 | + need_ray_up(ray_up_return_value)): |
| 1100 | + retry_cnt += 1 |
| 1101 | + ray_up_return_value = ray_up() |
| 1102 | + |
| 1103 | + assert ray_up_return_value is not None |
| 1104 | + returncode, stdout, stderr = ray_up_return_value |
| 1105 | + |
| 1106 | + logger.debug(f'Ray up takes {time.time() - start} seconds with ' |
| 1107 | + f'{retry_cnt} retries.') |
1073 | 1108 |
|
1074 | 1109 | # Only 1 node or head node provisioning failure.
|
1075 | 1110 | if num_nodes == 1 and returncode == 0:
|
@@ -1787,9 +1822,14 @@ def _exec_code_on_head(
|
1787 | 1822 | f'--address=127.0.0.1:8265 --job-id {ray_job_id} --no-wait '
|
1788 | 1823 | f'-- "{executable} -u {script_path} > {remote_log_path} 2>&1"')
|
1789 | 1824 |
|
1790 |
| - returncode = self.run_on_head(handle, job_submit_cmd, stream_logs=False) |
1791 |
| - subprocess_utils.handle_returncode(returncode, job_submit_cmd, |
1792 |
| - f'Failed to submit job {job_id}.') |
| 1825 | + returncode, stdout, stderr = self.run_on_head(handle, |
| 1826 | + job_submit_cmd, |
| 1827 | + stream_logs=False, |
| 1828 | + require_outputs=True) |
| 1829 | + subprocess_utils.handle_returncode(returncode, |
| 1830 | + job_submit_cmd, |
| 1831 | + f'Failed to submit job {job_id}.', |
| 1832 | + stderr=stdout + stderr) |
1793 | 1833 |
|
1794 | 1834 | logger.info('Job submitted with Job ID: '
|
1795 | 1835 | f'{style.BRIGHT}{job_id}{style.RESET_ALL}')
|
|
0 commit comments