-
Notifications
You must be signed in to change notification settings - Fork 917
Default to ubuntu for GCP and avoid key pair checking #1641
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7b00703
685327e
289db1e
17e35b2
3f86961
f4507bb
d1cc352
56b925f
ccd7365
f5c2044
31f906a
fcf05ec
0985954
66d7dbb
2e871e7
f85b4f3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,7 +5,7 @@ | |
| import subprocess | ||
| import time | ||
| import typing | ||
| from typing import Dict, Iterator, List, Optional, Tuple | ||
| from typing import Any, Dict, Iterator, List, Optional, Tuple | ||
|
|
||
| from sky import clouds | ||
| from sky import exceptions | ||
|
|
@@ -77,6 +77,11 @@ | |
|
|
||
| # TODO(zhwu): Move the default AMI size to the catalog instead. | ||
| DEFAULT_GCP_IMAGE_GB = 50 | ||
| _DEFAULT_CPU_IMAGE = 'skypilot:cpu-ubuntu-2004' | ||
| # Other GPUs: CUDA driver version 510.47.03, CUDA Library 11.6. | ||
| # K80: CUDA driver version 470.103.01, CUDA Library 11.4 (we manually install | ||
| # the older CUDA driver in the gcp-ray.yaml to support K80). | ||
| _DEFAULT_GPU_IMAGE = 'skypilot:gpu-ubuntu-2004' | ||
|
|
||
|
|
||
| def _run_output(cmd): | ||
|
|
@@ -243,26 +248,24 @@ def get_egress_cost(self, num_gigabytes): | |
| def is_same_cloud(self, other): | ||
| return isinstance(other, GCP) | ||
|
|
||
| def get_image_size(self, image_id: str, region: Optional[str]) -> float: | ||
| del region # unused | ||
| if image_id.startswith('skypilot:'): | ||
| return DEFAULT_GCP_IMAGE_GB | ||
| @classmethod | ||
| def get_image_info(cls, image_id) -> Dict[str, Any]: | ||
| try: | ||
| compute = gcp.build('compute', | ||
| 'v1', | ||
| credentials=None, | ||
| cache_discovery=False) | ||
| except gcp.credential_error_exception() as e: | ||
| return DEFAULT_GCP_IMAGE_GB | ||
| return {} | ||
| try: | ||
| image_attrs = image_id.split('/') | ||
| if len(image_attrs) == 1: | ||
| raise ValueError(f'Image {image_id!r} not found in GCP.') | ||
| project = image_attrs[1] | ||
| image_name = image_attrs[-1] | ||
| image_infos = compute.images().get(project=project, | ||
| image=image_name).execute() | ||
| return float(image_infos['diskSizeGb']) | ||
| image_info = compute.images().get(project=project, | ||
| image=image_name).execute() | ||
| return image_info | ||
| except gcp.http_error_exception() as e: | ||
| if e.resp.status == 403: | ||
| with ux_utils.print_exception_no_traceback(): | ||
|
|
@@ -274,6 +277,21 @@ def get_image_size(self, image_id: str, region: Optional[str]) -> float: | |
| 'GCP.') from None | ||
| raise | ||
|
|
||
| def get_image_size(self, image_id: str, region: Optional[str]) -> float: | ||
| del region # unused | ||
| if image_id.startswith('skypilot:'): | ||
| # Hack: this utilizes the knowledge that both the selected debian | ||
| # and ubuntu images on GCP have the same size of 50GB, to reduce | ||
| # the overhead for querying the image size. | ||
| return DEFAULT_GCP_IMAGE_GB | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Q: How do we guarantee that the Ubuntu and Debian image tags both have the same size, DEFAULT_GCP_IMAGE_GB?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The image size can be obtained using |
||
| image_info = self.get_image_info(image_id) | ||
| if 'diskSizeGb' not in image_info: | ||
| # All the images in GCP should have the diskSizeGb field, but | ||
| # just in case, we do not want to crash the program, as the image | ||
| # size check is not critical. | ||
| return DEFAULT_GCP_IMAGE_GB | ||
| return float(image_info['diskSizeGb']) | ||
|
|
||
| @classmethod | ||
| def get_default_instance_type( | ||
| cls, | ||
|
|
@@ -295,10 +313,8 @@ def make_deploy_resources_variables( | |
|
|
||
| # gcloud compute images list \ | ||
| # --project deeplearning-platform-release \ | ||
| # --no-standard-images | ||
| # We use the debian image, as the ubuntu image has some connectivity | ||
| # issue when first booted. | ||
| image_id = 'skypilot:cpu-debian-10' | ||
| # --no-standard-images | grep ubuntu-2004 | ||
| image_id = _DEFAULT_CPU_IMAGE | ||
|
|
||
| r = resources | ||
| # Find GPU spec, if any. | ||
|
|
@@ -338,17 +354,8 @@ def make_deploy_resources_variables( | |
| resources_vars['gpu'] = 'nvidia-tesla-{}'.format( | ||
| acc.lower()) | ||
| resources_vars['gpu_count'] = acc_count | ||
| if acc == 'K80': | ||
| # Though the image is called cu113, it actually has later | ||
| # versions of CUDA as noted below. | ||
| # CUDA driver version 470.57.02, CUDA Library 11.4 | ||
| image_id = 'skypilot:k80-debian-10' | ||
| else: | ||
| # Though the image is called cu113, it actually has later | ||
| # versions of CUDA as noted below. | ||
| # CUDA driver version 510.47.03, CUDA Library 11.6 | ||
| # Does not support torch==1.13.0 with cu117 | ||
| image_id = 'skypilot:gpu-debian-10' | ||
|
|
||
| image_id = _DEFAULT_GPU_IMAGE | ||
|
|
||
| if resources.image_id is not None: | ||
| if None in resources.image_id: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,6 +26,10 @@ | |
| pull_frequency_hours=_PULL_FREQUENCY_HOURS) | ||
| _image_df = common.read_catalog('gcp/images.csv', | ||
| pull_frequency_hours=_PULL_FREQUENCY_HOURS) | ||
| if _image_df[_image_df['Tag'] == 'skypilot:cpu-ubuntu-2004'].empty: | ||
| # Update the image catalog if it does not include the updated images | ||
| # https://github.com/skypilot-org/skypilot-catalog/pull/25. | ||
| _image_df = common.read_catalog('gcp/images.csv', pull_frequency_hours=0) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just to make sure: after increasing our version number, we don't need this, right? |
||
|
|
||
| _TPU_REGIONS = [ | ||
| 'us-central1', | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.