Commit 4dda5ee
Update default images for AWS and GCP (skypilot-org#1608)
* Update images for AWS
* remnant
* comment
* refactor fetch_aws a bit
* fix tests
* fix
* wait longer for recovery on AWS
* fix
* fix no conda test
* longer recover time
* fix tests
* fix
* fix azure zones
* format
* update comments
* newline
1 parent 9165b3e commit 4dda5ee

5 files changed (+59 -59 lines)


examples/huggingface_glue_imdb_app.yaml (+2 -2)
@@ -10,11 +10,11 @@ setup: |
   git clone https://github.com/huggingface/transformers/
   # checkout to the correct version
   cd transformers
-  git checkout v4.21.0
+  git checkout v4.25.1
   pip3 install .
   cd examples/pytorch/text-classification
   # SkyPilot's default image on AWS/GCP has CUDA 11.6 (Azure 11.5).
-  pip3 install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
+  pip3 install -r requirements.txt tensorboard torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
 
 # The command to run. Will be run under the working directory.
 run: |
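Note: the torch==1.12.1+cu113 pin is what keeps the wheel compatible with the CUDA runtime on the new default images. A minimal sanity-check sketch (not part of this change) that could be run on the provisioned VM to confirm the wheel and GPU line up, assuming torch was installed by the setup section above:

    # Sketch: report the installed PyTorch build and whether the GPU is usable.
    import torch

    print('torch:', torch.__version__)                 # expected 1.12.1+cu113
    print('built against CUDA:', torch.version.cuda)   # expected 11.3
    print('cuda available:', torch.cuda.is_available())
    if torch.cuda.is_available():
        print('device:', torch.cuda.get_device_name(0))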

examples/per_region_images.yaml (+2 -2)
@@ -2,8 +2,8 @@ resources:
   cloud: aws
   instance_type: g4dn.xlarge
   image_id:
-    us-west-2: ami-0fe5af21074ad2a10 # Deep learning AMI with CUDA 11.6 without conda installed
-    us-west-1: skypilot:gpu-ubuntu-1804
+    us-west-2: skypilot:gpu-ubuntu-1804
+    us-east-2: ami-0406ab83559331633 # AWS Deep Learning AMI GPU CUDA 11.4.3 (Ubuntu 20.04) 20220210 without conda installed
 
 
 setup: |
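The image_id mapping is keyed by region, and a launch into a region with no entry is expected to fail (the smoke tests below count on exactly that: us-east-1 fails, us-west-2 succeeds). A rough standalone sketch of that lookup, assuming PyYAML is available; resolve_image is a hypothetical helper, not SkyPilot's API:

    # Sketch only: resolve a per-region image_id mapping for a requested region.
    import yaml

    def resolve_image(yaml_path: str, region: str) -> str:
        with open(yaml_path) as f:
            task = yaml.safe_load(f)
        image_ids = task['resources']['image_id']
        if region not in image_ids:
            raise ValueError(f'No image_id entry for {region!r}; '
                             f'known regions: {sorted(image_ids)}')
        return image_ids[region]

    print(resolve_image('examples/per_region_images.yaml', 'us-west-2'))
    # -> skypilot:gpu-ubuntu-1804
    resolve_image('examples/per_region_images.yaml', 'us-east-1')
    # -> ValueError, mirroring the expected launch failure in the tests below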

sky/authentication.py (+2 -1)
@@ -245,7 +245,8 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
     # OS Login is not enabled for the project. Add the ssh key directly to the
     # metadata.
     # TODO(zhwu): Use cloud init to add ssh public key, to avoid the permission
-    # issue.
+    # issue. A blocker is that the cloud init is not installed in the debian
+    # image by default.
     project_keys = next(
         (item for item in project['commonInstanceMetadata'].get('items', [])
          if item['key'] == 'ssh-keys'), {}).get('value', '')
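For context on the unchanged lines: project['commonInstanceMetadata']['items'] is a list of key/value dicts, and the ssh-keys item may be missing entirely, which is why the next(...) call falls back to {} and then to ''. A self-contained sketch with fabricated metadata, mirroring the extraction shown above:

    # Fabricated GCP project metadata payload; illustrative only.
    project = {
        'commonInstanceMetadata': {
            'items': [
                {'key': 'enable-oslogin', 'value': 'FALSE'},
                {'key': 'ssh-keys', 'value': 'gcpuser:ssh-rsa AAAA... gcpuser'},
            ]
        }
    }
    # Same extraction as in setup_gcp_authentication: missing 'ssh-keys' -> ''.
    project_keys = next(
        (item for item in project['commonInstanceMetadata'].get('items', [])
         if item['key'] == 'ssh-keys'), {}).get('value', '')
    print(project_keys or '<no ssh-keys metadata>')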

sky/clouds/service_catalog/data_fetchers/fetch_aws.py (+40 -41)
@@ -251,29 +251,36 @@ def get_all_regions_instance_types_df(regions: Set[str]) -> pd.DataFrame:
 
 
 # Fetch Images
-_GPU_TO_IMAGE_DATE = {
-    # https://console.aws.amazon.com/ec2/v2/home?region=us-east-1#Images:visibility=public-images;v=3;search=:64,:Ubuntu%2020,:Deep%20Learning%20AMI%20GPU%20PyTorch # pylint: disable=line-too-long
-    # Current AMIs:
-    # Deep Learning AMI GPU PyTorch 1.10.0 (Ubuntu 20.04) 20220308
-    # Nvidia driver: 510.47.03, CUDA Version: 11.6 (does not support torch==1.13.0+cu117)
-    #
-    # Use a list to fallback to newer AMI, as some regions like ap-southeast-3 does not have
-    # the older AMI.
-    'gpu': ['20220308', '20221101'],
-    # Deep Learning AMI GPU PyTorch 1.10.0 (Ubuntu 20.04) 20211208
-    # Downgrade the AMI for K80 due as it is only compatible with
-    # NVIDIA driver lower than 470.
-    'k80': ['20211208']
-}
-_UBUNTU_VERSION = ['18.04', '20.04']
-
-
-def _fetch_image_id(region: str, ubuntu_version: str,
-                    creation_date: str) -> Optional[str]:
+# https://console.aws.amazon.com/ec2/v2/home?region=us-east-1#Images:visibility=public-images;v=3;search=:64,:Ubuntu%2020,:Deep%20Learning%20AMI%20GPU%20PyTorch # pylint: disable=line-too-long
+# Current AMIs (we have to use different PyTorch versions for different OS as Ubuntu 18.04
+# does not have the latest PyTorch version):
+# GPU:
+# Deep Learning AMI GPU PyTorch 1.13.1 (Ubuntu 20.04) 20230103
+# Nvidia driver: 515.65.01, CUDA Version: 11.7
+#
+# Deep Learning AMI GPU PyTorch 1.10.0 (Ubuntu 18.04) 20221114
+# Nvidia driver: 510.47.03, CUDA Version: 11.6
+#
+# K80:
+# Deep Learning AMI GPU PyTorch 1.10.0 (Ubuntu 20.04) 20211208
+# Nvidia driver: 470.57.02, CUDA Version: 11.4
+#
+# Deep Learning AMI GPU PyTorch 1.10.0 (Ubuntu 18.04) 20211208
+# Nvidia driver: 470.57.02, CUDA Version: 11.4
+_GPU_UBUNTU_DATE_PYTORCH = [
+    ('gpu', '20.04', '20230103', '1.13.1'),
+    ('gpu', '18.04', '20221114', '1.10.0'),
+    ('k80', '20.04', '20211208', '1.10.0'),
+    ('k80', '18.04', '20211208', '1.10.0'),
+]
+
+
+def _fetch_image_id(region: str, ubuntu_version: str, creation_date: str,
+                    pytorch_version: str) -> Optional[str]:
     try:
         image = subprocess.check_output(f"""\
 aws ec2 describe-images --region {region} --owners amazon \\
-    --filters 'Name=name,Values="Deep Learning AMI GPU PyTorch 1.10.0 (Ubuntu {ubuntu_version}) {creation_date}"' \\
+    --filters 'Name=name,Values="Deep Learning AMI GPU PyTorch {pytorch_version} (Ubuntu {ubuntu_version}) {creation_date}"' \\
     'Name=state,Values=available' --query 'Images[:1].ImageId' --output text
 """,
                                         shell=True)
@@ -290,33 +297,25 @@ def _fetch_image_id(region: str, ubuntu_version: str,
 
 @ray.remote
 def _get_image_row(
-        region: str, ubuntu_version: str,
-        cpu_or_gpu: str) -> Tuple[str, str, str, str, Optional[str], str]:
-    print(f'Getting image for {region}, {ubuntu_version}, {cpu_or_gpu}')
-    creation_date = _GPU_TO_IMAGE_DATE[cpu_or_gpu]
-    date = None
-    for date in creation_date:
-        image_id = _fetch_image_id(region, ubuntu_version, date)
-        if image_id:
-            break
-    else:
+        region: str, gpu: str, ubuntu_version: str, date: str,
+        pytorch_version) -> Tuple[str, str, str, str, Optional[str], str]:
+    print(f'Getting image for {region}, {ubuntu_version}, {gpu}')
+    image_id = _fetch_image_id(region, ubuntu_version, date, pytorch_version)
+    if image_id is None:
         # not found
-        print(
-            f'Failed to find image for {region}, {ubuntu_version}, {cpu_or_gpu}'
-        )
-    if date is None:
-        raise ValueError(f'Could not find the creation date for {cpu_or_gpu}.')
-    tag = f'skypilot:{cpu_or_gpu}-ubuntu-{ubuntu_version.replace(".", "")}'
+        print(f'Failed to find image for {region}, {ubuntu_version}, {gpu}')
+    tag = f'skypilot:{gpu}-ubuntu-{ubuntu_version.replace(".", "")}'
     return tag, region, 'ubuntu', ubuntu_version, image_id, date
 
 
 def get_all_regions_images_df(regions: Set[str]) -> pd.DataFrame:
     workers = []
-    for cpu_or_gpu in _GPU_TO_IMAGE_DATE:
-        for ubuntu_version in _UBUNTU_VERSION:
-            for region in regions:
-                workers.append(
-                    _get_image_row.remote(region, ubuntu_version, cpu_or_gpu))
+    for (gpu, ubuntu_version, date,
+         pytorch_version) in _GPU_UBUNTU_DATE_PYTORCH:
+        for region in regions:
+            workers.append(
+                _get_image_row.remote(region, gpu, ubuntu_version, date,
+                                      pytorch_version))
 
     results = ray.get(workers)
     results = pd.DataFrame(
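Each (gpu, ubuntu_version, date, pytorch_version) tuple above becomes one describe-images query per region. A standalone sketch of the same lookup for a single catalog entry, assuming the AWS CLI is installed and credentials are configured; lookup_ami is a throwaway name, and the command string is copied from _fetch_image_id above:

    # Sketch: fetch the AMI id for one (gpu, Ubuntu, date, PyTorch) catalog entry.
    import subprocess
    from typing import Optional

    def lookup_ami(region: str, ubuntu_version: str, creation_date: str,
                   pytorch_version: str) -> Optional[str]:
        out = subprocess.check_output(f"""\
    aws ec2 describe-images --region {region} --owners amazon \\
        --filters 'Name=name,Values="Deep Learning AMI GPU PyTorch {pytorch_version} (Ubuntu {ubuntu_version}) {creation_date}"' \\
        'Name=state,Values=available' --query 'Images[:1].ImageId' --output text
    """, shell=True).decode().strip()
        return out or None

    # e.g. the Ubuntu 20.04 / PyTorch 1.13.1 row of _GPU_UBUNTU_DATE_PYTORCH:
    print(lookup_ami('us-east-1', '20.04', '20230103', '1.13.1'))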

tests/test_smoke.py (+13 -13)
@@ -337,21 +337,21 @@ def test_aws_image_id_dict_region():
         'aws_image_id_dict_region',
         [
             # Use region to filter image_id dict.
-            f'sky launch -y -c {name} --region us-east-2 examples/per_region_images.yaml && exit 1 || true',
+            f'sky launch -y -c {name} --region us-east-1 examples/per_region_images.yaml && exit 1 || true',
             f'sky status | grep {name} && exit 1 || true',  # Ensure the cluster is not created.
-            f'sky launch -y -c {name} --region us-west-1 examples/per_region_images.yaml',
+            f'sky launch -y -c {name} --region us-west-2 examples/per_region_images.yaml',
             # Should success because the image id match for the region.
             f'sky launch -c {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml',
             f'sky exec {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml',
             f'sky exec {name} --image-id skypilot:gpu-ubuntu-2004 examples/minimal.yaml && exit 1 || true',
             f'sky logs {name} 1 --status',
             f'sky logs {name} 2 --status',
             f'sky logs {name} 3 --status',
-            f'sky status --all | grep {name} | grep us-west-1',  # Ensure the region is correct.
+            f'sky status --all | grep {name} | grep us-west-2',  # Ensure the region is correct.
             # Ensure exec works.
-            f'sky exec {name} --region us-west-1 examples/per_region_images.yaml',
+            f'sky exec {name} --region us-west-2 examples/per_region_images.yaml',
             f'sky exec {name} examples/per_region_images.yaml',
-            f'sky exec {name} --cloud aws --region us-west-1 "ls ~"',
+            f'sky exec {name} --cloud aws --region us-west-2 "ls ~"',
             f'sky exec {name} "ls ~"',
             f'sky logs {name} 4 --status',
             f'sky logs {name} 5 --status',
@@ -403,9 +403,9 @@ def test_aws_image_id_dict_zone():
         'aws_image_id_dict_zone',
         [
             # Use zone to filter image_id dict.
-            f'sky launch -y -c {name} --zone us-east-2b examples/per_region_images.yaml && exit 1 || true',
+            f'sky launch -y -c {name} --zone us-east-1b examples/per_region_images.yaml && exit 1 || true',
             f'sky status | grep {name} && exit 1 || true',  # Ensure the cluster is not created.
-            f'sky launch -y -c {name} --zone us-west-1a examples/per_region_images.yaml',
+            f'sky launch -y -c {name} --zone us-west-2a examples/per_region_images.yaml',
             # Should success because the image id match for the zone.
             f'sky launch -y -c {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml',
             f'sky exec {name} --image-id skypilot:gpu-ubuntu-1804 examples/minimal.yaml',
@@ -414,11 +414,11 @@ def test_aws_image_id_dict_zone():
             f'sky logs {name} 1 --status',
             f'sky logs {name} 2 --status',
             f'sky logs {name} 3 --status',
-            f'sky status --all | grep {name} | grep us-west-1a',  # Ensure the zone is correct.
+            f'sky status --all | grep {name} | grep us-west-2a',  # Ensure the zone is correct.
             # Ensure exec works.
-            f'sky exec {name} --zone us-west-1a examples/per_region_images.yaml',
+            f'sky exec {name} --zone us-west-2a examples/per_region_images.yaml',
             f'sky exec {name} examples/per_region_images.yaml',
-            f'sky exec {name} --cloud aws --region us-west-1 "ls ~"',
+            f'sky exec {name} --cloud aws --region us-west-2 "ls ~"',
             f'sky exec {name} "ls ~"',
             f'sky logs {name} 4 --status',
             f'sky logs {name} 5 --status',
@@ -471,7 +471,7 @@ def test_image_no_conda():
         'image_no_conda',
         [
             # Use image id dict.
-            f'sky launch -y -c {name} --region us-west-2 examples/per_region_images.yaml',
+            f'sky launch -y -c {name} --region us-east-2 examples/per_region_images.yaml',
             f'sky logs {name} 1 --status',
             f'sky stop {name} -y',
             f'sky start {name} -y',
@@ -606,7 +606,7 @@ def test_aws_storage_mounts():
         file_path = f.name
         test_commands = [
             *storage_setup_commands,
-            f'sky launch -y -c {name}-aws --cloud aws {file_path}',
+            f'sky launch -y -c {name} --cloud aws {file_path}',
             f'sky logs {name} 1 --status',  # Ensure job succeeded.
             f'aws s3 ls {storage_name}/hello.txt',
         ]
@@ -1246,7 +1246,7 @@ def test_spot_recovery_multi_node_aws():
             '--output text)'),
             'sleep 50',
             f'{_SPOT_QUEUE_WAIT}| grep {name} | head -n1 | grep "RECOVERING"',
-            'sleep 420',
+            'sleep 500',
             f'{_SPOT_QUEUE_WAIT}| grep {name} | head -n1 | grep "RUNNING"',
             f'RUN_ID=$(cat /tmp/{name}-run-id); echo $RUN_ID; sky spot logs -n {name} --no-follow | grep SKYPILOT_JOB_ID | cut -d: -f2 | grep "$RUN_ID"',
         ],
