|
9 | 9 | from typing import Dict, Iterator, List, Optional, Tuple
|
10 | 10 |
|
11 | 11 | from sky import clouds
|
| 12 | +from sky import exceptions |
12 | 13 | from sky.clouds import service_catalog
|
13 | 14 |
|
14 | 15 | if typing.TYPE_CHECKING:
|
@@ -94,45 +95,41 @@ def region_zones_provision_loop(
|
@classmethod
def get_default_ami(cls, region_name: str, instance_type: str) -> str:
    """Return the default AMI ID for `instance_type` in `region_name`.

    The image is looked up in the AWS service catalog by tag. Instance
    types whose accelerator is a K80 use the 'skypilot:k80-ubuntu-2004'
    tag; all others use 'skypilot:gpu-ubuntu-2004'.

    Args:
        region_name: AWS region to resolve the image in.
        instance_type: Instance type used to pick the image tag.

    Returns:
        The image ID the catalog returns for the selected tag.

    Raises:
        exceptions.ResourcesUnavailableError: If the catalog has no image
            for the selected tag in `region_name`.
    """
    acc = cls.get_accelerators_from_instance_type(instance_type)

    # Pick the tag first so the catalog is queried exactly once, instead of
    # resolving the generic GPU image and then discarding it for K80.
    image_tag = 'skypilot:gpu-ubuntu-2004'
    if acc is not None:
        assert len(acc) == 1, acc
        acc_name = next(iter(acc.keys()))
        if acc_name == 'K80':
            # NOTE(review): presumably K80 has its own image because it
            # needs an NVIDIA driver older than 470 -- confirm against the
            # catalog entry.
            image_tag = 'skypilot:k80-ubuntu-2004'

    image_id = service_catalog.get_image_id_from_tag(image_tag,
                                                     region_name,
                                                     clouds='aws')
    if image_id is not None:
        return image_id

    # Raise ResourcesUnavailableError to make sure the failover in
    # CloudVMRayBackend will be correctly triggered.
    # TODO(zhwu): This is an information leak to the cloud implementor,
    # we need to find a better way to handle this.
    raise exceptions.ResourcesUnavailableError(
        'No image found in catalog for region '
        f'{region_name}. Try setting a valid image_id.')
| 115 | + |
@classmethod
def _get_image_id(cls, region_name: str, instance_type: str,
                  image_id: Optional[str]) -> str:
    """Resolve the image ID to use for a launch in `region_name`.

    Args:
        region_name: AWS region the image must be available in.
        instance_type: Instance type, used only when falling back to the
            default AMI.
        image_id: User-supplied image, or None. Values starting with
            'skypilot:' are treated as catalog tags and resolved through
            the service catalog; any other value is returned unchanged.

    Returns:
        The resolved image ID.

    Raises:
        exceptions.ResourcesUnavailableError: If a 'skypilot:' tag has no
            image in `region_name`.
    """
    # No explicit image requested: defer to the per-instance-type default.
    if image_id is None:
        return cls.get_default_ami(region_name, instance_type)

    # Anything that is not a catalog tag is passed through untouched.
    if not image_id.startswith('skypilot:'):
        return image_id

    resolved_id = service_catalog.get_image_id_from_tag(image_id,
                                                        region_name,
                                                        clouds='aws')
    if resolved_id is None:
        # Raise ResourcesUnavailableError to make sure the failover
        # in CloudVMRayBackend will be correctly triggered.
        # TODO(zhwu): This is an information leak to the cloud
        # implementor, we need to find a better way to handle this.
        raise exceptions.ResourcesUnavailableError(
            f'No image found for region {region_name}')
    return resolved_id
136 | 133 |
|
137 | 134 | @classmethod
|
138 | 135 | def get_zone_shell_cmd(cls) -> Optional[str]:
|
@@ -232,10 +229,7 @@ def make_deploy_resources_variables(
|
232 | 229 | else:
|
233 | 230 | custom_resources = None
|
234 | 231 |
|
235 |
| - if r.image_id is not None: |
236 |
| - image_id = r.image_id |
237 |
| - else: |
238 |
| - image_id = self.get_default_ami(region_name, r.instance_type) |
| 232 | + image_id = self._get_image_id(region_name, r.instance_type, r.image_id) |
239 | 233 |
|
240 | 234 | return {
|
241 | 235 | 'instance_type': r.instance_type,
|
@@ -342,9 +336,6 @@ def get_credential_file_mounts(self) -> Dict[str, str]:
|
def instance_type_exists(self, instance_type):
    """Ask the service catalog whether `instance_type` exists on AWS.

    Returns whatever `service_catalog.instance_type_exists` returns for
    the 'aws' cloud.
    """
    found = service_catalog.instance_type_exists(instance_type,
                                                 clouds='aws')
    return found
|
344 | 338 |
|
345 |
| - def validate_region_zone(self, region: Optional[str], zone: Optional[str]): |
346 |
| - return service_catalog.validate_region_zone(region, zone, clouds='aws') |
347 |
| - |
348 | 339 | def accelerator_in_region_or_zone(self,
|
349 | 340 | accelerator: str,
|
350 | 341 | acc_count: int,
|
|
0 commit comments