From 7b00703a5ff7843dee50cc7e83bde6cfb4d25c54 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 11 May 2023 17:11:12 -0700 Subject: [PATCH 01/14] Use cloud init use ubuntu fix Disable control master for GCP fix test no control master and longer alive interval retry run command runner with retry add sleep retry for ray status Longer timeout avoid control persist format set ClientAliveInterval revert command runner revert command_runner fix ssh disconnection issue format fallback to debian for K80 fix k80 Backward compatibility for old debian based images minor fix Fix TPU VM add clientaliveinterval add markers for specific clouds uninstall before install uninstall -y azure uninstall longer wait time remove apt related setting fix ssh name --- sky/authentication.py | 187 +++++++++++++--------- sky/clouds/gcp.py | 41 ++--- sky/skylet/providers/gcp/node_provider.py | 50 ++++++ sky/templates/aws-ray.yml.j2 | 2 +- sky/templates/azure-ray.yml.j2 | 2 +- sky/templates/gcp-ray.yml.j2 | 96 ++++++++--- tests/test_smoke.py | 7 +- 7 files changed, 262 insertions(+), 123 deletions(-) diff --git a/sky/authentication.py b/sky/authentication.py index db884187b6..99de4bdff9 100644 --- a/sky/authentication.py +++ b/sky/authentication.py @@ -1,6 +1,7 @@ """Module to enable a single SkyPilot key for all VMs in each cloud.""" import copy import functools +import json import os import re import socket @@ -132,6 +133,104 @@ def _wait_for_compute_global_operation(project_name: str, operation_name: str, return result +def _maybe_gcp_add_ssh_key_to_account(compute, project, config: Dict[str, Any], + os_login_enabled: bool): + """Add ssh key to GCP account if using Debian image without cloud-init. + + This function is for backward compatibility. It is only used when the user + is using the old Debian image without cloud-init. In this case, we need to + add the ssh key to the GCP account so that we can ssh into the instance. + """ + private_key_path, public_key_path = get_or_generate_keys() + user = config['auth']['ssh_user'] + + node_config = config.get('available_node_types', + {}).get('ray_head_default', + {}).get('node_config', {}) + image_id = node_config.get('disks', [{}])[0].get('initializeParams', + {}).get('sourceImage') + # image_id is None when TPU VM is used, as TPU VM does not use image. + if image_id is not None and 'debian' not in image_id.lower(): + image_infos = clouds.GCP.get_image_infos(image_id) + if 'debian' not in json.dumps(image_infos).lower(): + # The non-Debian images have the ssh key setup by cloud-init. + return + logger.info('Adding ssh key to GCP account.') + if os_login_enabled: + # Add ssh key to GCP with oslogin + subprocess.run( + 'gcloud compute os-login ssh-keys add ' + f'--key-file={public_key_path}', + check=True, + shell=True, + stdout=subprocess.DEVNULL) + # Enable ssh port for all the instances + enable_ssh_cmd = ('gcloud compute firewall-rules create ' + 'allow-ssh-ingress-from-iap ' + '--direction=INGRESS ' + '--action=allow ' + '--rules=tcp:22 ' + '--source-ranges=0.0.0.0/0') + proc = subprocess.run(enable_ssh_cmd, + check=False, + shell=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE) + if proc.returncode != 0 and 'already exists' not in proc.stderr.decode( + 'utf-8'): + subprocess_utils.handle_returncode(proc.returncode, enable_ssh_cmd, + 'Failed to enable ssh port.', + proc.stderr.decode('utf-8')) + return config + + # OS Login is not enabled for the project. Add the ssh key directly to the + # metadata. + project_keys: str = next( # type: ignore + (item for item in project['commonInstanceMetadata'].get('items', []) + if item['key'] == 'ssh-keys'), {}).get('value', '') + ssh_keys = project_keys.split('\n') if project_keys else [] + + # Get public key from file. + with open(public_key_path, 'r') as f: + public_key = f.read() + + # Check if ssh key in Google Project's metadata + public_key_token = public_key.split(' ')[1] + + key_found = False + for key in ssh_keys: + key_list = key.split(' ') + if len(key_list) != 3: + continue + if user == key_list[-1] and os.path.exists( + private_key_path) and key_list[1] == public_key.split(' ')[1]: + key_found = True + + if not key_found: + new_ssh_key = '{user}:ssh-rsa {public_key_token} {user}'.format( + user=user, public_key_token=public_key_token) + metadata = project['commonInstanceMetadata'].get('items', []) + + ssh_key_index = [ + k for k, v in enumerate(metadata) if v['key'] == 'ssh-keys' + ] + assert len(ssh_key_index) <= 1 + + if len(ssh_key_index) == 0: + metadata.append({'key': 'ssh-keys', 'value': new_ssh_key}) + else: + first_ssh_key_index = ssh_key_index[0] + metadata[first_ssh_key_index]['value'] += '\n' + new_ssh_key + + project['commonInstanceMetadata']['items'] = metadata + + operation = compute.projects().setCommonInstanceMetadata( + project=project['name'], + body=project['commonInstanceMetadata']).execute() + _wait_for_compute_global_operation(project['name'], operation['name'], + compute) + + # Snippets of code inspired from # https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/gcp/config.py # Takes in config, a yaml dict and outputs a postprocessed dict @@ -140,7 +239,9 @@ def _wait_for_compute_global_operation(project_name: str, operation_name: str, # Retry for the GCP as sometimes there will be connection reset by peer error. @common_utils.retry def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]: - private_key_path, public_key_path = get_or_generate_keys() + _, public_key_path = get_or_generate_keys() + with open(public_key_path, 'r') as f: + public_key = f.read() config = copy.deepcopy(config) project_id = config['provider']['project_id'] @@ -148,7 +249,6 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]: 'v1', credentials=None, cache_discovery=False) - user = config['auth']['ssh_user'] try: project = compute.projects().get(project=project_id).execute() @@ -191,7 +291,8 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]: (item for item in project['commonInstanceMetadata'].get('items', []) if item['key'] == 'enable-oslogin'), {}).get('value', 'False') - if project_oslogin.lower() == 'true': + oslogin_enabled = project_oslogin.lower() == 'true' + if oslogin_enabled: # project. logger.info( f'OS Login is enabled for GCP project {project_id}. Running ' @@ -218,81 +319,11 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]: 'account information.') config['auth']['ssh_user'] = account.replace('@', '_').replace('.', '_') - # Add ssh key to GCP with oslogin - subprocess.run( - 'gcloud compute os-login ssh-keys add ' - f'--key-file={public_key_path}', - check=True, - shell=True, - stdout=subprocess.DEVNULL) - # Enable ssh port for all the instances - enable_ssh_cmd = ('gcloud compute firewall-rules create ' - 'allow-ssh-ingress-from-iap ' - '--direction=INGRESS ' - '--action=allow ' - '--rules=tcp:22 ' - '--source-ranges=0.0.0.0/0') - proc = subprocess.run(enable_ssh_cmd, - check=False, - shell=True, - stdout=subprocess.DEVNULL, - stderr=subprocess.PIPE) - if proc.returncode != 0 and 'already exists' not in proc.stderr.decode( - 'utf-8'): - subprocess_utils.handle_returncode(proc.returncode, enable_ssh_cmd, - 'Failed to enable ssh port.', - proc.stderr.decode('utf-8')) - return config - - # OS Login is not enabled for the project. Add the ssh key directly to the - # metadata. - # TODO(zhwu): Use cloud init to add ssh public key, to avoid the permission - # issue. A blocker is that the cloud init is not installed in the debian - # image by default. - project_keys: str = next( # type: ignore - (item for item in project['commonInstanceMetadata'].get('items', []) - if item['key'] == 'ssh-keys'), {}).get('value', '') - ssh_keys = project_keys.split('\n') if project_keys else [] - - # Get public key from file. - with open(public_key_path, 'r') as f: - public_key = f.read() - - # Check if ssh key in Google Project's metadata - public_key_token = public_key.split(' ')[1] - - key_found = False - for key in ssh_keys: - key_list = key.split(' ') - if len(key_list) != 3: - continue - if user == key_list[-1] and os.path.exists( - private_key_path) and key_list[1] == public_key.split(' ')[1]: - key_found = True - - if not key_found: - new_ssh_key = '{user}:ssh-rsa {public_key_token} {user}'.format( - user=user, public_key_token=public_key_token) - metadata = project['commonInstanceMetadata'].get('items', []) - - ssh_key_index = [ - k for k, v in enumerate(metadata) if v['key'] == 'ssh-keys' - ] - assert len(ssh_key_index) <= 1 - - if len(ssh_key_index) == 0: - metadata.append({'key': 'ssh-keys', 'value': new_ssh_key}) - else: - first_ssh_key_index = ssh_key_index[0] - metadata[first_ssh_key_index]['value'] += '\n' + new_ssh_key - - project['commonInstanceMetadata']['items'] = metadata - - operation = compute.projects().setCommonInstanceMetadata( - project=project['name'], - body=project['commonInstanceMetadata']).execute() - _wait_for_compute_global_operation(project['name'], operation['name'], - compute) + config = _replace_cloud_init_ssh_info_in_config(config, public_key) + # This function is for backward compatibility, as the user using the old + # Debian-based image may not have the cloud-init enabled, and we need to + # add the ssh key to the account. + _maybe_gcp_add_ssh_key_to_account(compute, project, config, oslogin_enabled) return config diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index 89fe60f427..f36e2fee21 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -5,7 +5,7 @@ import subprocess import time import typing -from typing import Dict, Iterator, List, Optional, Tuple +from typing import Any, Dict, Iterator, List, Optional, Tuple from sky import clouds from sky import exceptions @@ -235,17 +235,15 @@ def get_egress_cost(self, num_gigabytes): def is_same_cloud(self, other): return isinstance(other, GCP) - def get_image_size(self, image_id: str, region: Optional[str]) -> float: - del region # unused - if image_id.startswith('skypilot:'): - return DEFAULT_GCP_IMAGE_GB + @classmethod + def get_image_infos(cls, image_id) -> Dict[str, Any]: try: compute = gcp.build('compute', 'v1', credentials=None, cache_discovery=False) except gcp.credential_error_exception() as e: - return DEFAULT_GCP_IMAGE_GB + return {} try: image_attrs = image_id.split('/') if len(image_attrs) == 1: @@ -254,7 +252,7 @@ def get_image_size(self, image_id: str, region: Optional[str]) -> float: image_name = image_attrs[-1] image_infos = compute.images().get(project=project, image=image_name).execute() - return float(image_infos['diskSizeGb']) + return image_infos except gcp.http_error_exception() as e: if e.resp.status == 403: with ux_utils.print_exception_no_traceback(): @@ -266,6 +264,15 @@ def get_image_size(self, image_id: str, region: Optional[str]) -> float: 'GCP.') from None raise + def get_image_size(self, image_id: str, region: Optional[str]) -> float: + del region # unused + if image_id.startswith('skypilot:'): + return DEFAULT_GCP_IMAGE_GB + image_infos = self.get_image_infos(image_id) + if 'diskSizeGb' not in image_infos: + return DEFAULT_GCP_IMAGE_GB + return float(image_infos['diskSizeGb']) + @classmethod def get_default_instance_type( cls, @@ -287,10 +294,10 @@ def make_deploy_resources_variables( # gcloud compute images list \ # --project deeplearning-platform-release \ - # --no-standard-images + # --no-standard-images | grep ubuntu-2004 # We use the debian image, as the ubuntu image has some connectivity # issue when first booted. - image_id = 'skypilot:cpu-debian-10' + image_id = 'skypilot:cpu-ubuntu-2004' r = resources # Find GPU spec, if any. @@ -330,17 +337,11 @@ def make_deploy_resources_variables( resources_vars['gpu'] = 'nvidia-tesla-{}'.format( acc.lower()) resources_vars['gpu_count'] = acc_count - if acc == 'K80': - # Though the image is called cu113, it actually has later - # versions of CUDA as noted below. - # CUDA driver version 470.57.02, CUDA Library 11.4 - image_id = 'skypilot:k80-debian-10' - else: - # Though the image is called cu113, it actually has later - # versions of CUDA as noted below. - # CUDA driver version 510.47.03, CUDA Library 11.6 - # Does not support torch==1.13.0 with cu117 - image_id = 'skypilot:gpu-debian-10' + # Though the image is called cu113, it actually has later + # versions of CUDA as noted below. + # CUDA driver version 510.47.03, CUDA Library 11.6 + # K80: CUDA driver version 470.103.01, CUDA Library 11.4 + image_id = 'skypilot:gpu-ubuntu-2004' if resources.image_id is not None: if None in resources.image_id: diff --git a/sky/skylet/providers/gcp/node_provider.py b/sky/skylet/providers/gcp/node_provider.py index 1a3443f52e..224a75d4da 100644 --- a/sky/skylet/providers/gcp/node_provider.py +++ b/sky/skylet/providers/gcp/node_provider.py @@ -362,3 +362,53 @@ def _get_cached_node(self, node_id: str) -> GCPNode: @staticmethod def bootstrap_config(cluster_config): return bootstrap_gcp(cluster_config) + + def get_command_runner( + self, + log_prefix, + node_id, + auth_config, + cluster_name, + process_runner, + use_internal_ip, + docker_config, + ): + from ray.autoscaler._private.command_runner import ( + DockerCommandRunner, + SSHCommandRunner, + ) + + class SSHCommandRunnerWithRetry(SSHCommandRunner): + def _run_helper( + self, final_cmd, with_output=False, exit_on_fail=False, silent=False + ): + """Wrapper around _run_helper to retry on failure.""" + retry_cnt = 0 + import click + + while True: + try: + return super()._run_helper( + final_cmd, with_output, exit_on_fail, silent + ) + except click.ClickException as e: + retry_cnt += 1 + if retry_cnt > 3: + raise e + logger.info(f"Retrying SSH command in 5 seconds: {e}") + time.sleep(5) + + # Adopted from super().get_command_runner() + common_args = { + "log_prefix": log_prefix, + "node_id": node_id, + "provider": self, + "auth_config": auth_config, + "cluster_name": cluster_name, + "process_runner": process_runner, + "use_internal_ip": use_internal_ip, + } + if docker_config and docker_config["container_name"] != "": + return DockerCommandRunner(docker_config, **common_args) + else: + return SSHCommandRunnerWithRetry(**common_args) diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index 8d82a03ccc..66966e6384 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -177,7 +177,7 @@ setup_commands: (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; (which conda > /dev/null 2>&1 && conda init > /dev/null) || (wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -b && eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true); source ~/.bashrc; - (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app; + (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 uninstall -y ray && pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app; (pip3 list | grep skypilot && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[aws]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index bb108c6460..fb88290940 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -126,7 +126,7 @@ setup_commands: (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; which conda > /dev/null 2>&1 || (wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -b && eval "$(/home/azureuser/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true); source ~/.bashrc; - (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful; + (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 uninstall -y ray && pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful; (pip3 list | grep skypilot && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[azure]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index dc17ea2e55..5874ccc4d0 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -55,14 +55,35 @@ available_node_types: # See https://cloud.google.com/deep-learning-vm/docs/images sourceImage: {{image_id}} diskType: zones/{{zones}}/diskTypes/{{disk_tier}} + metadata: + items: + - key: user-data + value: | + #cloud-config + users: + - name: skypilot:ssh_user + shell: /bin/bash + sudo: ALL=(ALL) NOPASSWD:ALL + ssh-authorized-keys: + - skypilot:ssh_public_key_content + bootcmd: + - echo 'ClientAliveInterval 720' >> /etc/ssh/sshd_config + - echo 'ClientAliveCountMax 720' >> /etc/ssh/sshd_config + - systemctl restart sshd + - echo 'APT::Periodic::Enable "0";' > /etc/apt/apt.conf.d/10cloudinit-disable + - apt-get -y purge update-notifier-common ubuntu-release-upgrader-core landscape-common unattended-upgrades + - echo "Removed APT" | systemd-cat {%- if gpu is not none %} + {%- if 'tesla-k80' in gpu %} + runcmd: + - curl https://raw.githubusercontent.com/GoogleCloudPlatform/compute-gpu-installation/main/linux/install_gpu_driver.py --output install_gpu_driver.py + - python3 install_gpu_driver.py + {%- endif %} + - key: install-nvidia-driver + value: true guestAccelerators: - acceleratorType: projects/{{gcp_project_id}}/zones/{{zones}}/acceleratorTypes/{{gpu}} acceleratorCount: {{gpu_count}} - metadata: - items: - - key: install-nvidia-driver - value: "True" {%- endif %} scheduling: {%- if use_spot %} @@ -98,14 +119,35 @@ available_node_types: # See https://cloud.google.com/deep-learning-vm/docs/images sourceImage: {{image_id}} diskType: zones/{{zones}}/diskTypes/{{disk_tier}} + metadata: + items: + - key: user-data + value: | + #cloud-config + users: + - name: skypilot:ssh_user + shell: /bin/bash + sudo: ALL=(ALL) NOPASSWD:ALL + ssh-authorized-keys: + - skypilot:ssh_public_key_content + bootcmd: + - echo 'ClientAliveInterval 720' >> /etc/ssh/sshd_config + - echo 'ClientAliveCountMax 720' >> /etc/ssh/sshd_config + - systemctl restart sshd + - echo 'APT::Periodic::Enable "0";' > /etc/apt/apt.conf.d/10cloudinit-disable + - apt-get -y purge update-notifier-common ubuntu-release-upgrader-core landscape-common unattended-upgrades + - echo "Removed APT" | systemd-cat {%- if gpu is not none %} + {%- if 'tesla-k80' in gpu %} + runcmd: + - curl https://raw.githubusercontent.com/GoogleCloudPlatform/compute-gpu-installation/main/linux/install_gpu_driver.py --output install_gpu_driver.py + - python3 install_gpu_driver.py + {%- endif %} + - key: install-nvidia-driver + value: true guestAccelerators: - acceleratorType: projects/{{gcp_project_id}}/zones/{{zones}}/acceleratorTypes/{{gpu}} acceleratorCount: {{gpu_count}} - metadata: - items: - - key: install-nvidia-driver - value: "True" {%- endif %} scheduling: {%- if use_spot %} @@ -150,17 +192,7 @@ setup_commands: # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase. # Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - - function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; }; - sudo systemctl stop unattended-upgrades || true; - sudo systemctl disable unattended-upgrades || true; - sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true; - p=$(mylsof "/var/lib/dpkg/lock-frontend"); echo "$p"; - sudo kill -9 `echo "$p" | tail -n 1` || true; - sudo rm /var/lib/dpkg/lock-frontend; - sudo pkill -9 dpkg; - sudo pkill -9 apt-get; - sudo dpkg --configure --force-overwrite -a; - mkdir -p ~/.ssh; touch ~/.ssh/config; + - mkdir -p ~/.ssh; touch ~/.ssh/config; pip3 --version > /dev/null 2>&1 || (curl -sSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py && echo "PATH=$HOME/.local/bin:$PATH" >> ~/.bashrc); (type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc; (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; @@ -170,7 +202,7 @@ setup_commands: test -f /home/gcpuser/miniconda3/etc/profile.d/conda.sh && source /home/gcpuser/miniconda3/etc/profile.d/conda.sh && conda activate base || true; pip3 install --upgrade google-api-python-client; {%- endif %} - (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app; + (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 uninstall -y ray && pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app; (pip3 list | grep skypilot && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[gcp]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; @@ -184,12 +216,23 @@ setup_commands: # items! The same comment applies for worker_start_ray_commands. # # Increment the following for catching performance bugs easier: -# current num items (num SSH connections): 1 +# current num items (num SSH connections): 2 head_start_ray_commands: # Start skylet daemon. (Should not place it in the head_setup_commands, otherwise it will run before sky is installed.) # NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait. # Line "which prlimit ..": increase the limit of the number of open files for the raylet process, as the `ulimit` may not take effect at this point, because it requires # all the sessions to be reloaded. This is a workaround. +{%- if gpu is not none %} + - | + echo "Installing NVIDIA GPU driver." >> ~/.sky/nvlog + while ! nvidia-smi &> /dev/null + do + echo "Waiting for NVIDIA drivers to be installed..." >> ~/.sky/nvlog + sleep 5 + done + sleep 10 + echo "NVIDIA GPU is ready." >> ~/.sky/nvlog +{%- endif %} - ((ps aux | grep -v nohup | grep -v grep | grep -q -- "python3 -m sky.skylet.skylet") || nohup python3 -m sky.skylet.skylet >> ~/.sky/skylet.log 2>&1 &); export SKYPILOT_NUM_GPUS=0 && which nvidia-smi > /dev/null && SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index,name --format=csv,noheader | wc -l); ray stop; RAY_SCHEDULER_EVENTS=0 ray start --disable-usage-stats --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --num-gpus=$SKYPILOT_NUM_GPUS || exit 1; @@ -198,6 +241,17 @@ head_start_ray_commands: # Worker commands are needed for TPU VM Pods {%- if num_nodes > 1 or tpu_vm %} worker_start_ray_commands: +{%- if gpu is not none %} + - | + echo "Installing NVIDIA GPU driver." >> ~/.sky/nvlog + while ! nvidia-smi &> /dev/null + do + echo "Waiting for NVIDIA drivers to be installed..." >> ~/.sky/nvlog + sleep 5 + done + sleep 10 + echo "NVIDIA GPU is ready." >> ~/.sky/nvlog +{%- endif %} - SKYPILOT_NUM_GPUS=0 && which nvidia-smi > /dev/null && SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index,name --format=csv,noheader | wc -l); ray stop; RAY_SCHEDULER_EVENTS=0 ray start --disable-usage-stats --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --num-gpus=$SKYPILOT_NUM_GPUS || exit 1; which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done; diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 5f29476710..1ea1d17dc6 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1107,6 +1107,7 @@ def test_gcp_start_stop(): f'sky logs {name} 4 --status', # Ensure the job succeeded. ], f'sky down -y {name}', + timeout=20 * 60, ) run_one_test(test) @@ -1783,6 +1784,7 @@ def test_azure_start_stop_two_nodes(): # ---------- Testing env for disk tier ---------- +@pytest.mark.aws def test_aws_disk_tier(): def _get_aws_query_command(region, instance_id, field, expected): @@ -1811,11 +1813,11 @@ def _get_aws_query_command(region, instance_id, field, expected): specs['disk_throughput']))), ], f'sky down -y {name}', - timeout=10 * 60, # 10 mins (it takes around ~6 mins) + timeout=12 * 60, ) run_one_test(test) - +@pytest.mark.gcp def test_gcp_disk_tier(): for disk_tier in ['low', 'medium', 'high']: type = GCP._get_disk_type(disk_tier) @@ -1837,6 +1839,7 @@ def test_gcp_disk_tier(): run_one_test(test) +@pytest.mark.azure def test_azure_disk_tier(): for disk_tier in ['low', 'medium']: type = Azure._get_disk_type(disk_tier) From 685327e603e0f56ecb65c03e84c217216720b24a Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 12 May 2023 12:57:38 -0700 Subject: [PATCH 02/14] Use file based apt update disabling --- sky/templates/gcp-ray.yml.j2 | 38 +++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 5874ccc4d0..813ddfa5d3 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -64,15 +64,18 @@ available_node_types: - name: skypilot:ssh_user shell: /bin/bash sudo: ALL=(ALL) NOPASSWD:ALL - ssh-authorized-keys: + ssh_authorized_keys: - skypilot:ssh_public_key_content - bootcmd: - - echo 'ClientAliveInterval 720' >> /etc/ssh/sshd_config - - echo 'ClientAliveCountMax 720' >> /etc/ssh/sshd_config - - systemctl restart sshd - - echo 'APT::Periodic::Enable "0";' > /etc/apt/apt.conf.d/10cloudinit-disable - - apt-get -y purge update-notifier-common ubuntu-release-upgrader-core landscape-common unattended-upgrades - - echo "Removed APT" | systemd-cat + write_files: + - path: /etc/apt/apt.conf.d/20auto-upgrades + content: | + APT::Periodic::Update-Package-Lists "0"; + APT::Periodic::Download-Upgradeable-Packages "0"; + APT::Periodic::AutocleanInterval "0"; + APT::Periodic::Unattended-Upgrade "0"; + - path: /etc/apt/apt.conf.d/10cloudinit-disable + content: | + APT::Periodic::Enable "0"; {%- if gpu is not none %} {%- if 'tesla-k80' in gpu %} runcmd: @@ -128,15 +131,18 @@ available_node_types: - name: skypilot:ssh_user shell: /bin/bash sudo: ALL=(ALL) NOPASSWD:ALL - ssh-authorized-keys: + ssh_authorized_keys: - skypilot:ssh_public_key_content - bootcmd: - - echo 'ClientAliveInterval 720' >> /etc/ssh/sshd_config - - echo 'ClientAliveCountMax 720' >> /etc/ssh/sshd_config - - systemctl restart sshd - - echo 'APT::Periodic::Enable "0";' > /etc/apt/apt.conf.d/10cloudinit-disable - - apt-get -y purge update-notifier-common ubuntu-release-upgrader-core landscape-common unattended-upgrades - - echo "Removed APT" | systemd-cat + write_files: + - path: /etc/apt/apt.conf.d/20auto-upgrades + content: | + APT::Periodic::Update-Package-Lists "0"; + APT::Periodic::Download-Upgradeable-Packages "0"; + APT::Periodic::AutocleanInterval "0"; + APT::Periodic::Unattended-Upgrade "0"; + - path: /etc/apt/apt.conf.d/10cloudinit-disable + content: | + APT::Periodic::Enable "0"; {%- if gpu is not none %} {%- if 'tesla-k80' in gpu %} runcmd: From 289db1eca634cb1e3ec09471254a3e395de11a7a Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 12 May 2023 13:23:21 -0700 Subject: [PATCH 03/14] Add ssh config --- sky/templates/gcp-ray.yml.j2 | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 813ddfa5d3..5e286e957c 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -76,6 +76,11 @@ available_node_types: - path: /etc/apt/apt.conf.d/10cloudinit-disable content: | APT::Periodic::Enable "0"; + - path: /etc/ssh/sshd_config + append: true + content: | + ClientAliveInterval 720 + ClientAliveCountMax 720 {%- if gpu is not none %} {%- if 'tesla-k80' in gpu %} runcmd: @@ -143,6 +148,11 @@ available_node_types: - path: /etc/apt/apt.conf.d/10cloudinit-disable content: | APT::Periodic::Enable "0"; + - path: /etc/ssh/sshd_config + append: true + content: | + ClientAliveInterval 720 + ClientAliveCountMax 720 {%- if gpu is not none %} {%- if 'tesla-k80' in gpu %} runcmd: From 17e35b200494b2a9d9a78cf2382b563110523f36 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 12 May 2023 15:11:57 -0700 Subject: [PATCH 04/14] Add magic number for waiting --- sky/templates/gcp-ray.yml.j2 | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 5e286e957c..45ee66d850 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -246,7 +246,9 @@ head_start_ray_commands: echo "Waiting for NVIDIA drivers to be installed..." >> ~/.sky/nvlog sleep 5 done - sleep 10 + # Magic number for waiting the nvidia driver to be ready and the instance + # to be rebooted. + sleep 18 echo "NVIDIA GPU is ready." >> ~/.sky/nvlog {%- endif %} - ((ps aux | grep -v nohup | grep -v grep | grep -q -- "python3 -m sky.skylet.skylet") || nohup python3 -m sky.skylet.skylet >> ~/.sky/skylet.log 2>&1 &); @@ -265,7 +267,9 @@ worker_start_ray_commands: echo "Waiting for NVIDIA drivers to be installed..." >> ~/.sky/nvlog sleep 5 done - sleep 10 + # Magic number for waiting the nvidia driver to be ready and the instance + # to be rebooted. + sleep 18 echo "NVIDIA GPU is ready." >> ~/.sky/nvlog {%- endif %} - SKYPILOT_NUM_GPUS=0 && which nvidia-smi > /dev/null && SKYPILOT_NUM_GPUS=$(nvidia-smi --query-gpu=index,name --format=csv,noheader | wc -l); From 3f86961579934eaee251d6a3aeea34347c314b7d Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 12 May 2023 16:14:54 -0700 Subject: [PATCH 05/14] format --- tests/test_smoke.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 1ea1d17dc6..38afb72903 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1817,6 +1817,7 @@ def _get_aws_query_command(region, instance_id, field, expected): ) run_one_test(test) + @pytest.mark.gcp def test_gcp_disk_tier(): for disk_tier in ['low', 'medium', 'high']: From f4507bb69d56f0629a376e7fbe6d449ac3a680ed Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 12 May 2023 16:27:02 -0700 Subject: [PATCH 06/14] uninstall ray-cpp as well --- sky/templates/aws-ray.yml.j2 | 2 +- sky/templates/azure-ray.yml.j2 | 2 +- sky/templates/gcp-ray.yml.j2 | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index 66966e6384..62659ae66d 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -177,7 +177,7 @@ setup_commands: (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; (which conda > /dev/null 2>&1 && conda init > /dev/null) || (wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -b && eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true); source ~/.bashrc; - (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 uninstall -y ray && pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app; + (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 uninstall -y ray ray-cpp && pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app; (pip3 list | grep skypilot && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[aws]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index fb88290940..802cc1f59e 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -126,7 +126,7 @@ setup_commands: (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; which conda > /dev/null 2>&1 || (wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -b && eval "$(/home/azureuser/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true); source ~/.bashrc; - (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 uninstall -y ray && pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful; + (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 uninstall -y ray ray-cpp && pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful; (pip3 list | grep skypilot && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[azure]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 45ee66d850..34fa525e26 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -218,7 +218,7 @@ setup_commands: test -f /home/gcpuser/miniconda3/etc/profile.d/conda.sh && source /home/gcpuser/miniconda3/etc/profile.d/conda.sh && conda activate base || true; pip3 install --upgrade google-api-python-client; {%- endif %} - (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 uninstall -y ray && pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app; + (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 uninstall -y ray ray-cpp && pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app; (pip3 list | grep skypilot && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[gcp]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; From d1cc352db954fca3e2d7d82484c9aae4e6e8baa6 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 12 May 2023 16:36:10 -0700 Subject: [PATCH 07/14] check the corrupted installation --- sky/templates/gcp-ray.yml.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 34fa525e26..9bc347d569 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -218,8 +218,8 @@ setup_commands: test -f /home/gcpuser/miniconda3/etc/profile.d/conda.sh && source /home/gcpuser/miniconda3/etc/profile.d/conda.sh && conda activate base || true; pip3 install --upgrade google-api-python-client; {%- endif %} - (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 uninstall -y ray ray-cpp && pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app; - (pip3 list | grep skypilot && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[gcp]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); + (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null && python3 -c "import ray" || pip3 uninstall -y ray ray-cpp && pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app; + (pip3 list | grep skypilot && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) && python3 -c "import sky" || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[gcp]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1); sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf'; sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload; mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config; From 56b925fc1a79f88a3cdb3f1081f09a7e7950800e Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 12 May 2023 21:00:25 -0700 Subject: [PATCH 08/14] increase the timeout --- tests/test_smoke.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 38afb72903..4567990913 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -954,7 +954,7 @@ def test_multi_echo(generic_cloud: str): # unfulfilled' error. If process not found, grep->ssh returns 1. [f'ssh {name} \'ps aux | grep "[/]"monitor.py\''], f'sky down -y {name}', - timeout=20 * 60, + timeout=25 * 60, ) run_one_test(test) From ccd7365f0b6105a777d75bbf30c41f7377f3a1c8 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 14 May 2023 11:17:02 -0700 Subject: [PATCH 09/14] increase catalog version --- sky/clouds/service_catalog/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/constants.py b/sky/clouds/service_catalog/constants.py index 8148fef468..0fa8c7e9c2 100644 --- a/sky/clouds/service_catalog/constants.py +++ b/sky/clouds/service_catalog/constants.py @@ -2,5 +2,5 @@ import os HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs' # pylint: disable=line-too-long -CATALOG_SCHEMA_VERSION = 'v5' +CATALOG_SCHEMA_VERSION = 'v6' LOCAL_CATALOG_DIR = os.path.expanduser('~/.sky/catalogs/') From f5c2044d6651645c76c265a6b1f477daa511c246 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 14 May 2023 22:13:38 -0700 Subject: [PATCH 10/14] Address comments --- sky/authentication.py | 8 +++--- sky/clouds/gcp.py | 34 +++++++++++++---------- sky/skylet/providers/gcp/node_provider.py | 14 +++++++++- sky/templates/aws-ray.yml.j2 | 1 - sky/templates/gcp-ray.yml.j2 | 29 ++++++++----------- 5 files changed, 49 insertions(+), 37 deletions(-) diff --git a/sky/authentication.py b/sky/authentication.py index 99de4bdff9..78acd21d7d 100644 --- a/sky/authentication.py +++ b/sky/authentication.py @@ -133,7 +133,7 @@ def _wait_for_compute_global_operation(project_name: str, operation_name: str, return result -def _maybe_gcp_add_ssh_key_to_account(compute, project, config: Dict[str, Any], +def _maybe_add_ssh_key_to_gcp_project_if_debian(compute, project, config: Dict[str, Any], os_login_enabled: bool): """Add ssh key to GCP account if using Debian image without cloud-init. @@ -151,8 +151,8 @@ def _maybe_gcp_add_ssh_key_to_account(compute, project, config: Dict[str, Any], {}).get('sourceImage') # image_id is None when TPU VM is used, as TPU VM does not use image. if image_id is not None and 'debian' not in image_id.lower(): - image_infos = clouds.GCP.get_image_infos(image_id) - if 'debian' not in json.dumps(image_infos).lower(): + image_info = clouds.GCP.get_image_info(image_id) + if 'debian' not in json.dumps(image_info).lower(): # The non-Debian images have the ssh key setup by cloud-init. return logger.info('Adding ssh key to GCP account.') @@ -323,7 +323,7 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]: # This function is for backward compatibility, as the user using the old # Debian-based image may not have the cloud-init enabled, and we need to # add the ssh key to the account. - _maybe_gcp_add_ssh_key_to_account(compute, project, config, oslogin_enabled) + _maybe_add_ssh_key_to_gcp_project_if_debian(compute, project, config, oslogin_enabled) return config diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index f36e2fee21..a149380f72 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -69,6 +69,11 @@ # TODO(zhwu): Move the default AMI size to the catalog instead. DEFAULT_GCP_IMAGE_GB = 50 +_DEFAULT_CPU_IMAGE = 'skypilot:cpu-ubuntu-2004' +# Other GPUs: CUDA driver version 510.47.03, CUDA Library 11.6 +# K80: CUDA driver version 470.103.01, CUDA Library 11.4 (we manually install the older +# CUDA driver in the gcp-ray.yaml to support K80) +_DEFAULT_GPU_IMAGE = 'skypilot:gpu-ubuntu-2004' def _run_output(cmd): @@ -236,7 +241,7 @@ def is_same_cloud(self, other): return isinstance(other, GCP) @classmethod - def get_image_infos(cls, image_id) -> Dict[str, Any]: + def get_image_info(cls, image_id) -> Dict[str, Any]: try: compute = gcp.build('compute', 'v1', @@ -250,9 +255,9 @@ def get_image_infos(cls, image_id) -> Dict[str, Any]: raise ValueError(f'Image {image_id!r} not found in GCP.') project = image_attrs[1] image_name = image_attrs[-1] - image_infos = compute.images().get(project=project, + image_info = compute.images().get(project=project, image=image_name).execute() - return image_infos + return image_info except gcp.http_error_exception() as e: if e.resp.status == 403: with ux_utils.print_exception_no_traceback(): @@ -267,11 +272,17 @@ def get_image_infos(cls, image_id) -> Dict[str, Any]: def get_image_size(self, image_id: str, region: Optional[str]) -> float: del region # unused if image_id.startswith('skypilot:'): + # Hack: this utilizes the knowledge that both the selected debian + # and ubuntu images on GCP have the same size of 50GB, to reduce + # the overhead for querying the image size. return DEFAULT_GCP_IMAGE_GB - image_infos = self.get_image_infos(image_id) - if 'diskSizeGb' not in image_infos: + image_info = self.get_image_info(image_id) + if 'diskSizeGb' not in image_info: + # All the images in GCP should have the diskSizeGb field, but + # just in case, we do not want to crash the program, as the image + # size check is not critical. return DEFAULT_GCP_IMAGE_GB - return float(image_infos['diskSizeGb']) + return float(image_info['diskSizeGb']) @classmethod def get_default_instance_type( @@ -295,9 +306,7 @@ def make_deploy_resources_variables( # gcloud compute images list \ # --project deeplearning-platform-release \ # --no-standard-images | grep ubuntu-2004 - # We use the debian image, as the ubuntu image has some connectivity - # issue when first booted. - image_id = 'skypilot:cpu-ubuntu-2004' + image_id = _DEFAULT_CPU_IMAGE r = resources # Find GPU spec, if any. @@ -337,11 +346,8 @@ def make_deploy_resources_variables( resources_vars['gpu'] = 'nvidia-tesla-{}'.format( acc.lower()) resources_vars['gpu_count'] = acc_count - # Though the image is called cu113, it actually has later - # versions of CUDA as noted below. - # CUDA driver version 510.47.03, CUDA Library 11.6 - # K80: CUDA driver version 470.103.01, CUDA Library 11.4 - image_id = 'skypilot:gpu-ubuntu-2004' + + image_id = _DEFAULT_GPU_IMAGE if resources.image_id is not None: if None in resources.image_id: diff --git a/sky/skylet/providers/gcp/node_provider.py b/sky/skylet/providers/gcp/node_provider.py index 224a75d4da..e2e390b25e 100644 --- a/sky/skylet/providers/gcp/node_provider.py +++ b/sky/skylet/providers/gcp/node_provider.py @@ -382,7 +382,19 @@ class SSHCommandRunnerWithRetry(SSHCommandRunner): def _run_helper( self, final_cmd, with_output=False, exit_on_fail=False, silent=False ): - """Wrapper around _run_helper to retry on failure.""" + """Wrapper around _run_helper to retry on failure. + + Fix the ssh connection issue caused by control master for GCP with ubuntu + image. Before the fix, the ssh connection will be disconnected when ray + trying to setup the runtime dependencies, which is probably because the + ssh connection is unstable when the cluster is just provisioned + https://github.com/ray-project/ray/issues/16539#issuecomment-1073138982. + The root cause can be that the GCP's async nvidia-driver installation will + reboot the machine when finished. + + We added retry for the ssh commands executed by ray up, which is ok since + our setup commands are idempotent. + """ retry_cnt = 0 import click diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index 62659ae66d..c4f0983f29 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -162,7 +162,6 @@ initialization_commands: [] # Increment the following for catching performance bugs easier: # current num items (num SSH connections): 1 setup_commands: - # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.) # Create ~/.ssh/config file in case the file does not exist in the custom image. # Make sure python3 & pip3 are available on this image. # We set auto_activate_base to be false for pre-installed conda. diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 9bc347d569..548a723f3b 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -76,19 +76,15 @@ available_node_types: - path: /etc/apt/apt.conf.d/10cloudinit-disable content: | APT::Periodic::Enable "0"; - - path: /etc/ssh/sshd_config - append: true - content: | - ClientAliveInterval 720 - ClientAliveCountMax 720 {%- if gpu is not none %} {%- if 'tesla-k80' in gpu %} runcmd: - curl https://raw.githubusercontent.com/GoogleCloudPlatform/compute-gpu-installation/main/linux/install_gpu_driver.py --output install_gpu_driver.py - python3 install_gpu_driver.py - {%- endif %} + {%- else %} - key: install-nvidia-driver value: true + {%- endif %} guestAccelerators: - acceleratorType: projects/{{gcp_project_id}}/zones/{{zones}}/acceleratorTypes/{{gpu}} acceleratorCount: {{gpu_count}} @@ -148,19 +144,15 @@ available_node_types: - path: /etc/apt/apt.conf.d/10cloudinit-disable content: | APT::Periodic::Enable "0"; - - path: /etc/ssh/sshd_config - append: true - content: | - ClientAliveInterval 720 - ClientAliveCountMax 720 {%- if gpu is not none %} {%- if 'tesla-k80' in gpu %} runcmd: - curl https://raw.githubusercontent.com/GoogleCloudPlatform/compute-gpu-installation/main/linux/install_gpu_driver.py --output install_gpu_driver.py - python3 install_gpu_driver.py - {%- endif %} + {%- else %} - key: install-nvidia-driver value: true + {%- endif %} guestAccelerators: - acceleratorType: projects/{{gcp_project_id}}/zones/{{zones}}/acceleratorTypes/{{gpu}} acceleratorCount: {{gpu_count}} @@ -198,12 +190,13 @@ initialization_commands: [] # Increment the following for catching performance bugs easier: # current num items (num SSH connections): 1 (+1 if tpu_vm) setup_commands: - # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.) # Line 'mkdir -p ..': Create ~/.ssh/config file in case the file does not exist in the custom image. # Line 'pip3 --v ..': Make sure python3 & pip3 are available on this image. # Line 'which conda ..': some images (TPU VM) do not install conda by # default. 'source ~/.bashrc' is needed so conda takes effect for the next # commands. + # Line 'pip3 list | ..': Install the correct version of ray[default] if it is not. The 'python3 -c "import ray"' is to check the integrity of the + # installed ray package. The integrity check is needed because the reboot of the machine during the ray installation may cause the corruption. # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase. # Line 'mkdir -p ..': disable host key check @@ -246,8 +239,9 @@ head_start_ray_commands: echo "Waiting for NVIDIA drivers to be installed..." >> ~/.sky/nvlog sleep 5 done - # Magic number for waiting the nvidia driver to be ready and the instance - # to be rebooted. + # Magic number for waiting for the nvidia driver to be ready and the instance + # to be rebooted. The number is determined by experiments, and it is the + # minimum number that works. sleep 18 echo "NVIDIA GPU is ready." >> ~/.sky/nvlog {%- endif %} @@ -267,8 +261,9 @@ worker_start_ray_commands: echo "Waiting for NVIDIA drivers to be installed..." >> ~/.sky/nvlog sleep 5 done - # Magic number for waiting the nvidia driver to be ready and the instance - # to be rebooted. + # Magic number for waiting for the nvidia driver to be ready and the instance + # to be rebooted. The number is determined by experiments, and it is the + # minimum number that works. sleep 18 echo "NVIDIA GPU is ready." >> ~/.sky/nvlog {%- endif %} From 31f906a3ac89596ead626176a2e814cd40c634eb Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 14 May 2023 22:34:11 -0700 Subject: [PATCH 11/14] longer time for job queue --- tests/test_smoke.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 4567990913..1e11f06d01 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -791,6 +791,7 @@ def test_job_queue(generic_cloud: str): f'sky logs {name} 5 --status', ], f'sky down -y {name}', + timeout=20 * 60, ) run_one_test(test) From fcf05ec682371707a0b2bfb14993364811774e27 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 14 May 2023 22:58:54 -0700 Subject: [PATCH 12/14] format --- sky/authentication.py | 8 +++++--- sky/clouds/gcp.py | 8 ++++---- sky/skylet/providers/gcp/node_provider.py | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/sky/authentication.py b/sky/authentication.py index 78acd21d7d..92b42073b8 100644 --- a/sky/authentication.py +++ b/sky/authentication.py @@ -133,8 +133,9 @@ def _wait_for_compute_global_operation(project_name: str, operation_name: str, return result -def _maybe_add_ssh_key_to_gcp_project_if_debian(compute, project, config: Dict[str, Any], - os_login_enabled: bool): +def _maybe_add_ssh_key_to_gcp_project_if_debian(compute, project, + config: Dict[str, Any], + os_login_enabled: bool): """Add ssh key to GCP account if using Debian image without cloud-init. This function is for backward compatibility. It is only used when the user @@ -323,7 +324,8 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]: # This function is for backward compatibility, as the user using the old # Debian-based image may not have the cloud-init enabled, and we need to # add the ssh key to the account. - _maybe_add_ssh_key_to_gcp_project_if_debian(compute, project, config, oslogin_enabled) + _maybe_add_ssh_key_to_gcp_project_if_debian(compute, project, config, + oslogin_enabled) return config diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index a149380f72..525ff3f149 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -70,9 +70,9 @@ # TODO(zhwu): Move the default AMI size to the catalog instead. DEFAULT_GCP_IMAGE_GB = 50 _DEFAULT_CPU_IMAGE = 'skypilot:cpu-ubuntu-2004' -# Other GPUs: CUDA driver version 510.47.03, CUDA Library 11.6 -# K80: CUDA driver version 470.103.01, CUDA Library 11.4 (we manually install the older -# CUDA driver in the gcp-ray.yaml to support K80) +# Other GPUs: CUDA driver version 510.47.03, CUDA Library 11.6. +# K80: CUDA driver version 470.103.01, CUDA Library 11.4 (we manually install +# the older CUDA driver in the gcp-ray.yaml to support K80). _DEFAULT_GPU_IMAGE = 'skypilot:gpu-ubuntu-2004' @@ -256,7 +256,7 @@ def get_image_info(cls, image_id) -> Dict[str, Any]: project = image_attrs[1] image_name = image_attrs[-1] image_info = compute.images().get(project=project, - image=image_name).execute() + image=image_name).execute() return image_info except gcp.http_error_exception() as e: if e.resp.status == 403: diff --git a/sky/skylet/providers/gcp/node_provider.py b/sky/skylet/providers/gcp/node_provider.py index e2e390b25e..14d75dd5d6 100644 --- a/sky/skylet/providers/gcp/node_provider.py +++ b/sky/skylet/providers/gcp/node_provider.py @@ -391,7 +391,7 @@ def _run_helper( https://github.com/ray-project/ray/issues/16539#issuecomment-1073138982. The root cause can be that the GCP's async nvidia-driver installation will reboot the machine when finished. - + We added retry for the ssh commands executed by ray up, which is ok since our setup commands are idempotent. """ From 2e871e7cd51e15bb6a0c084ef07ae4cafc3ceddc Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 30 Jun 2023 23:59:58 -0700 Subject: [PATCH 13/14] adopt gcp official way to set ssh key --- sky/clouds/service_catalog/constants.py | 2 +- sky/templates/gcp-ray.yml.j2 | 18 ++++++------------ 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/sky/clouds/service_catalog/constants.py b/sky/clouds/service_catalog/constants.py index 0fa8c7e9c2..8148fef468 100644 --- a/sky/clouds/service_catalog/constants.py +++ b/sky/clouds/service_catalog/constants.py @@ -2,5 +2,5 @@ import os HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs' # pylint: disable=line-too-long -CATALOG_SCHEMA_VERSION = 'v6' +CATALOG_SCHEMA_VERSION = 'v5' LOCAL_CATALOG_DIR = os.path.expanduser('~/.sky/catalogs/') diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 2e7f8ce5cd..6c69310afd 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -57,15 +57,12 @@ available_node_types: diskType: zones/{{zones}}/diskTypes/{{disk_tier}} metadata: items: + - key: ssh-keys + value: | + skypilot:ssh_user:skypilot:ssh_public_key_content - key: user-data value: | #cloud-config - users: - - name: skypilot:ssh_user - shell: /bin/bash - sudo: ALL=(ALL) NOPASSWD:ALL - ssh_authorized_keys: - - skypilot:ssh_public_key_content write_files: - path: /etc/apt/apt.conf.d/20auto-upgrades content: | @@ -125,15 +122,12 @@ available_node_types: diskType: zones/{{zones}}/diskTypes/{{disk_tier}} metadata: items: + - key: ssh-keys + value: | + skypilot:ssh_user:skypilot:ssh_public_key_content - key: user-data value: | #cloud-config - users: - - name: skypilot:ssh_user - shell: /bin/bash - sudo: ALL=(ALL) NOPASSWD:ALL - ssh_authorized_keys: - - skypilot:ssh_public_key_content write_files: - path: /etc/apt/apt.conf.d/20auto-upgrades content: | From f85b4f3c1bc46c20da23cde6f98a8ef88a37ca54 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sat, 1 Jul 2023 03:00:55 -0700 Subject: [PATCH 14/14] autoupdate images.csv --- sky/authentication.py | 2 +- sky/clouds/service_catalog/common.py | 2 +- sky/clouds/service_catalog/gcp_catalog.py | 4 ++++ sky/templates/gcp-ray.yml.j2 | 7 +++++-- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/sky/authentication.py b/sky/authentication.py index 7670fd7166..d77d4a098f 100644 --- a/sky/authentication.py +++ b/sky/authentication.py @@ -103,7 +103,7 @@ def _replace_cloud_init_ssh_info_in_config(config: Dict[str, Any], def setup_aws_authentication(config: Dict[str, Any]) -> Dict[str, Any]: _, public_key_path = get_or_generate_keys() with open(public_key_path, 'r') as f: - public_key = f.read() + public_key = f.read().strip() config = _replace_cloud_init_ssh_info_in_config(config, public_key) return config diff --git a/sky/clouds/service_catalog/common.py b/sky/clouds/service_catalog/common.py index e55f5b3350..aa323456a9 100644 --- a/sky/clouds/service_catalog/common.py +++ b/sky/clouds/service_catalog/common.py @@ -67,7 +67,7 @@ def read_catalog(filename: str, """ assert filename.endswith('.csv'), 'The catalog file must be a CSV file.' assert (pull_frequency_hours is None or - pull_frequency_hours > 0), pull_frequency_hours + pull_frequency_hours >= 0), pull_frequency_hours catalog_path = get_catalog_path(filename) cloud = cloud_lib.CLOUD_REGISTRY.from_str(os.path.dirname(filename)) diff --git a/sky/clouds/service_catalog/gcp_catalog.py b/sky/clouds/service_catalog/gcp_catalog.py index 3558e8b7f0..ccbfeadc1d 100644 --- a/sky/clouds/service_catalog/gcp_catalog.py +++ b/sky/clouds/service_catalog/gcp_catalog.py @@ -26,6 +26,10 @@ pull_frequency_hours=_PULL_FREQUENCY_HOURS) _image_df = common.read_catalog('gcp/images.csv', pull_frequency_hours=_PULL_FREQUENCY_HOURS) +if _image_df[_image_df['Tag'] == 'skypilot:cpu-ubuntu-2004'].empty: + # Update the image catalog if it does not include the updated images + # https://github.com/skypilot-org/skypilot-catalog/pull/25. + _image_df = common.read_catalog('gcp/images.csv', pull_frequency_hours=0) _TPU_REGIONS = [ 'us-central1', diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 6c69310afd..80283170b8 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -189,8 +189,11 @@ setup_commands: # Line 'which conda ..': some images (TPU VM) do not install conda by # default. 'source ~/.bashrc' is needed so conda takes effect for the next # commands. - # Line 'pip3 list | ..': Install the correct version of ray[default] if it is not. The 'python3 -c "import ray"' is to check the integrity of the - # installed ray package. The integrity check is needed because the reboot of the machine during the ray installation may cause the corruption. + # Line 'pip3 list | ..': Ensure only one Ray version (which is our ray_version) is installed, + # regardless of if the image comes pre-installed with another Ray version. The + # 'python3 -c "import ray"' is to check the integrity of the installed ray package. The integrity + # check is needed because the reboot of the machine during the ray installation may cause the + # corruption. # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase. # Line 'mkdir -p ..': disable host key check