Skip to content

Commit 309ecb0

Browse files
authored
[Image] Check image size and existence (skypilot-org#1508)
* Check image size and existence * Check image existence and size * Fix no credential * Fix error message * fix gcp image * lint * address comments * yapf
1 parent 678821e commit 309ecb0

File tree

8 files changed

+143
-37
lines changed

8 files changed

+143
-37
lines changed

sky/adaptors/aws.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def session():
5656

5757

5858
@import_package
59-
def client_exception():
60-
"""Client exception."""
59+
def botocore_exceptions():
60+
"""AWS botocore exception."""
6161
from botocore import exceptions
62-
return exceptions.ClientError
62+
return exceptions

sky/adaptors/gcp.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,3 +68,17 @@ def forbidden_exception():
6868
"""Forbidden exception."""
6969
from google.api_core import exceptions as gcs_exceptions
7070
return gcs_exceptions.Forbidden
71+
72+
73+
@import_package
74+
def http_error_exception():
75+
"""HttpError exception."""
76+
from googleapiclient import errors
77+
return errors.HttpError
78+
79+
80+
@import_package
81+
def credential_error_exception():
82+
"""CredentialError exception."""
83+
from google.auth import exceptions
84+
return exceptions.DefaultCredentialsError

sky/backends/cloud_vm_ray_backend.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -633,12 +633,18 @@ def _update_blocklist_on_gcp_error(self, region, zones, stdout, stderr):
633633
else:
634634
assert False, error
635635
elif len(httperror_str) >= 1:
636-
# Parse HttpError for unauthorized regions. Example:
637-
# googleapiclient.errors.HttpError: <HttpError 403 when requesting ... returned "Location us-east1-d is not found or access is unauthorized.". # pylint: disable=line-too-long
638-
# Details: "Location us-east1-d is not found or access is
639-
# unauthorized.">
640636
logger.info(f'Got {httperror_str[0]}')
641-
self._blocked_zones.add(zone.name)
637+
if ('Requested disk size cannot be smaller than the image size'
638+
in httperror_str[0]):
639+
logger.info('Skipping all regions due to disk size issue.')
640+
for r, _ in clouds.GCP.region_zones_provision_loop():
641+
self._blocked_regions.add(r.name)
642+
else:
643+
# Parse HttpError for unauthorized regions. Example:
644+
# googleapiclient.errors.HttpError: <HttpError 403 when requesting ... returned "Location us-east1-d is not found or access is unauthorized.". # pylint: disable=line-too-long
645+
# Details: "Location us-east1-d is not found or access is
646+
# unauthorized.">
647+
self._blocked_zones.add(zone.name)
642648
else:
643649
# No such structured error response found.
644650
assert not exception_str, stderr

sky/clouds/aws.py

Lines changed: 54 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010

1111
from sky import clouds
1212
from sky import exceptions
13+
from sky.adaptors import aws
1314
from sky.clouds import service_catalog
15+
from sky.utils import ux_utils
1416

1517
if typing.TYPE_CHECKING:
1618
# renaming to avoid shadowing variables
@@ -31,6 +33,10 @@ def _run_output(cmd):
3133
return proc.stdout.decode('ascii')
3234

3335

36+
# TODO(zhwu): Move the default AMI size to the catalog instead.
37+
DEFAULT_AMI_GB = 45
38+
39+
3440
@clouds.CLOUD_REGISTRY.register
3541
class AWS(clouds.Cloud):
3642
"""Amazon Web Services."""
@@ -114,27 +120,51 @@ def get_default_ami(cls, region_name: str, instance_type: str) -> str:
114120
f'{region_name}. Try setting a valid image_id.')
115121

116122
@classmethod
117-
def _get_image_id(cls, region_name: str, instance_type: str,
118-
image_id: Optional[Dict[str, str]]) -> str:
119-
if image_id is not None:
120-
if None in image_id:
121-
image_id = image_id[None]
122-
else:
123-
assert region_name in image_id, image_id
124-
image_id = image_id[region_name]
125-
if image_id.startswith('skypilot:'):
126-
image_id = service_catalog.get_image_id_from_tag(image_id,
127-
region_name,
128-
clouds='aws')
129-
if image_id is None:
130-
# Raise ResourcesUnavailableError to make sure the failover
131-
# in CloudVMRayBackend will be correctly triggered.
132-
# TODO(zhwu): This is a information leakage to the cloud
133-
# implementor, we need to find a better way to handle this.
134-
raise exceptions.ResourcesUnavailableError(
135-
f'No image found for region {region_name}')
136-
return image_id
137-
return cls.get_default_ami(region_name, instance_type)
123+
def _get_image_id(
124+
cls,
125+
image_id: Optional[Dict[str, str]],
126+
region_name: str,
127+
) -> str:
128+
if image_id is None:
129+
return None
130+
if None in image_id:
131+
image_id = image_id[None]
132+
else:
133+
assert region_name in image_id, image_id
134+
image_id = image_id[region_name]
135+
if image_id.startswith('skypilot:'):
136+
image_id = service_catalog.get_image_id_from_tag(image_id,
137+
region_name,
138+
clouds='aws')
139+
if image_id is None:
140+
# Raise ResourcesUnavailableError to make sure the failover
141+
# in CloudVMRayBackend will be correctly triggered.
142+
# TODO(zhwu): This is a information leakage to the cloud
143+
# implementor, we need to find a better way to handle this.
144+
raise exceptions.ResourcesUnavailableError(
145+
f'No image found for region {region_name}')
146+
return image_id
147+
148+
def get_image_size(self, image_id: str, region: Optional[str]) -> float:
149+
if image_id.startswith('skypilot:'):
150+
return DEFAULT_AMI_GB
151+
assert region is not None, (image_id, region)
152+
client = aws.client('ec2', region_name=region)
153+
try:
154+
image_info = client.describe_images(ImageIds=[image_id])
155+
image_info = image_info['Images'][0]
156+
image_size = image_info['BlockDeviceMappings'][0]['Ebs'][
157+
'VolumeSize']
158+
except aws.botocore_exceptions().NoCredentialsError:
159+
# Fallback to default image size if no credentials are available.
160+
# The credentials issue will be caught when actually provisioning
161+
# the instance and appropriate errors will be raised there.
162+
return DEFAULT_AMI_GB
163+
except aws.botocore_exceptions().ClientError:
164+
with ux_utils.print_exception_no_traceback():
165+
raise ValueError(f'Image {image_id!r} not found in '
166+
f'AWS region {region}') from None
167+
return image_size
138168

139169
@classmethod
140170
def get_zone_shell_cmd(cls) -> Optional[str]:
@@ -234,7 +264,9 @@ def make_deploy_resources_variables(
234264
else:
235265
custom_resources = None
236266

237-
image_id = self._get_image_id(region_name, r.instance_type, r.image_id)
267+
image_id = self._get_image_id(r.image_id, region_name)
268+
if image_id is None:
269+
image_id = self.get_default_ami(region_name, r.instance_type)
238270

239271
return {
240272
'instance_type': r.instance_type,

sky/clouds/cloud.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,14 @@ def get_credential_file_mounts(self) -> Dict[str, str]:
196196
"""
197197
raise NotImplementedError
198198

199+
def get_image_size(self, image_id: str, region: Optional[str]) -> float:
200+
"""Check the image size from the cloud.
201+
202+
Returns: the image size in GB.
203+
Raises: ValueError if the image cannot be found.
204+
"""
205+
raise NotImplementedError
206+
199207
def instance_type_exists(self, instance_type):
200208
"""Returns whether the instance type exists for this cloud."""
201209
raise NotImplementedError

sky/clouds/gcp.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,16 @@
99
from google import auth
1010

1111
from sky import clouds
12+
from sky import sky_logging
13+
from sky.adaptors import gcp
1214
from sky.clouds import service_catalog
1315
from sky.utils import ux_utils
1416

1517
if typing.TYPE_CHECKING:
1618
from sky import resources
1719

20+
logger = sky_logging.init_logger(__name__)
21+
1822
DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH = os.path.expanduser(
1923
'~/.config/gcloud/'
2024
'application_default_credentials.json')
@@ -51,6 +55,9 @@
5155
{{ cp {GCP_CONFIG_SKY_BACKUP_PATH} {GCP_CONFIG_PATH} > /dev/null 2>&1 || true; }} && \
5256
popd &>/dev/null'
5357

58+
# TODO(zhwu): Move the default AMI size to the catalog instead.
59+
DEFAULT_GCP_IMAGE_GB = 50
60+
5461

5562
def _run_output(cmd):
5663
proc = subprocess.run(cmd,
@@ -187,6 +194,37 @@ def get_egress_cost(self, num_gigabytes):
187194
def is_same_cloud(self, other):
188195
return isinstance(other, GCP)
189196

197+
def get_image_size(self, image_id: str, region: Optional[str]) -> float:
198+
del region # unused
199+
if image_id.startswith('skypilot:'):
200+
return DEFAULT_GCP_IMAGE_GB
201+
try:
202+
compute = gcp.build('compute',
203+
'v1',
204+
credentials=None,
205+
cache_discovery=False)
206+
except gcp.credential_error_exception() as e:
207+
return DEFAULT_GCP_IMAGE_GB
208+
try:
209+
image_attrs = image_id.split('/')
210+
if len(image_attrs) == 1:
211+
raise ValueError(f'Image {image_id!r} not found in GCP.')
212+
project = image_attrs[1]
213+
image_name = image_attrs[-1]
214+
image_infos = compute.images().get(project=project,
215+
image=image_name).execute()
216+
return float(image_infos['diskSizeGb'])
217+
except gcp.http_error_exception() as e:
218+
if e.resp.status == 403:
219+
with ux_utils.print_exception_no_traceback():
220+
raise ValueError('Not able to access the image '
221+
f'{image_id!r}') from None
222+
if e.resp.status == 404:
223+
with ux_utils.print_exception_no_traceback():
224+
raise ValueError(f'Image {image_id!r} not found in '
225+
'GCP.') from None
226+
raise
227+
190228
@classmethod
191229
def get_default_instance_type(cls) -> str:
192230
# 8 vCpus, 52 GB RAM. First-gen general purpose.

sky/data/storage.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -972,7 +972,7 @@ def _get_bucket(self) -> Tuple[StorageHandle, bool]:
972972
# accessible.
973973
self.client.head_bucket(Bucket=self.name)
974974
return bucket, False
975-
except aws.client_exception() as e:
975+
except aws.botocore_exceptions().ClientError as e:
976976
error_code = e.response['Error']['Code']
977977
# AccessDenied error for buckets that are private and not owned by
978978
# user.
@@ -1044,7 +1044,7 @@ def _create_s3_bucket(self,
10441044
s3_client.create_bucket(Bucket=bucket_name,
10451045
CreateBucketConfiguration=location)
10461046
logger.info(f'Created S3 bucket {bucket_name} in {region}')
1047-
except aws.client_exception() as e:
1047+
except aws.botocore_exceptions().ClientError as e:
10481048
with ux_utils.print_exception_no_traceback():
10491049
raise exceptions.StorageBucketCreateError(
10501050
f'Attempted to create a bucket '

sky/resources.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -74,11 +74,6 @@ def __init__(
7474
self._spot_recovery = spot_recovery.upper()
7575

7676
if disk_size is not None:
77-
if disk_size < 50:
78-
with ux_utils.print_exception_no_traceback():
79-
raise ValueError(
80-
'OS disk size must be larger than 50GB. Got: '
81-
f'{disk_size}.')
8277
if round(disk_size) != disk_size:
8378
with ux_utils.print_exception_no_traceback():
8479
raise ValueError(
@@ -440,6 +435,19 @@ def _try_validate_image_id(self) -> None:
440435
'image_id is only supported for AWS in a specific '
441436
'region, please explicitly specify the region.')
442437

438+
# Validate the image exists and the size is smaller than the disk size.
439+
for region, image_id in self._image_id.items():
440+
# Check the image exists and get the image size.
441+
# It will raise ValueError if the image does not exist.
442+
image_size = self.cloud.get_image_size(image_id, region)
443+
if image_size > self.disk_size:
444+
with ux_utils.print_exception_no_traceback():
445+
raise ValueError(
446+
f'Image {image_id!r} is {image_size}GB, which is '
447+
f'larger than the specified disk_size: {self.disk_size}'
448+
' GB. Please specify a larger disk_size to use this '
449+
'image.')
450+
443451
def get_cost(self, seconds: float) -> float:
444452
"""Returns cost in USD for the runtime in seconds."""
445453
hours = seconds / 3600

0 commit comments

Comments
 (0)