diff --git a/sky/catalog/gcp_catalog.py b/sky/catalog/gcp_catalog.py index b4e6f0f4ee..38da01c332 100644 --- a/sky/catalog/gcp_catalog.py +++ b/sky/catalog/gcp_catalog.py @@ -685,6 +685,13 @@ def get_image_id_from_tag(tag: str, region: Optional[str]) -> Optional[str]: _image_df = common.read_catalog('gcp/images.csv', pull_frequency_hours=0) image_id = common.get_image_id_from_tag_impl(_image_df, tag, region) + # TODO: Remove these debug lines once the catalog PR is merged. + if tag == 'skypilot:custom-cpu-ubuntu-2204-250923': + image_id = ('projects/sky-dev-465/global/images/' + 'skypilot-gcp-cpu-ubuntu-250923') + elif tag == 'skypilot:custom-gpu-ubuntu-2204-250923': + image_id = ('projects/sky-dev-465/global/images/' + 'skypilot-gcp-gpu-ubuntu-250923') return image_id diff --git a/sky/catalog/images/provisioners/cuda.sh b/sky/catalog/images/provisioners/cuda.sh index 9508b851c6..3affbf2d0e 100644 --- a/sky/catalog/images/provisioners/cuda.sh +++ b/sky/catalog/images/provisioners/cuda.sh @@ -17,6 +17,17 @@ else ARCH_PATH="x86_64" fi +# Install GCC 12 and set as default compiler +# This is required because newer Ubuntu 22.04 kernels (6.5.0+ and 6.8.0+) are built with GCC 12, +# but Ubuntu 22.04 LTS defaults to GCC 11. Without GCC 12, NVIDIA DKMS driver compilation +# will fail with error: "unrecognized command-line option '-ftrivial-auto-var-init=zero'" +# This flag was introduced in GCC 12 and is not recognized by GCC 11. +echo "Installing GCC 12 to match kernel compiler version..." 
+sudo apt-get update +sudo apt-get install -y gcc-12 g++-12 +sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 100 +sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 100 + # Download architecture-specific CUDA keyring package wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH_PATH}/cuda-keyring_1.1-1_all.deb sudo dpkg -i cuda-keyring_1.1-1_all.deb diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index a6d308920d..82c802198f 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -111,9 +111,9 @@ ) # Image ID tags -_DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu-2204' +_DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu-2204-250923' # For GPU-related package version, see sky/clouds/catalog/images/provisioners/cuda.sh -_DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-2204' +_DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-2204-250923' _DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-debian-10' # Use COS image with GPU Direct support. # Need to contact GCP support to build our own image for GPUDirect-TCPX support. 
diff --git a/tests/smoke_tests/test_basic.py b/tests/smoke_tests/test_basic.py index ff425ec982..307c9e5dc0 100644 --- a/tests/smoke_tests/test_basic.py +++ b/tests/smoke_tests/test_basic.py @@ -23,7 +23,6 @@ import os import pathlib import subprocess -import sys import tempfile import textwrap import time diff --git a/tests/smoke_tests/test_examples.py b/tests/smoke_tests/test_examples.py new file mode 100644 index 0000000000..b2b3cd6313 --- /dev/null +++ b/tests/smoke_tests/test_examples.py @@ -0,0 +1,135 @@ +# Smoke tests for SkyPilot example workloads +# Default options are set in pyproject.toml +# Example usage: +# Run all tests except for AWS and Lambda Cloud +# > pytest tests/smoke_tests/test_examples.py +# +# Terminate failed clusters after test finishes +# > pytest tests/smoke_tests/test_examples.py --terminate-on-failure +# +# Re-run last failed tests +# > pytest --lf +# +# Run one of the smoke tests +# > pytest tests/smoke_tests/test_examples.py::test_deepseek_r1_vllm +# +# Only run test for AWS + generic tests +# > pytest tests/smoke_tests/test_examples.py --aws +# +# Change cloud for generic tests to aws +# > pytest tests/smoke_tests/test_examples.py --generic-cloud aws + +import json + +import pytest +from smoke_tests import smoke_tests_utils +# TODO(zeping): move them to smoke_tests_utils +from smoke_tests.test_sky_serve import SERVE_ENDPOINT_WAIT +from smoke_tests.test_sky_serve import SERVE_WAIT_UNTIL_READY +from smoke_tests.test_sky_serve import TEARDOWN_SERVICE + +import sky + + +@pytest.mark.gcp +@pytest.mark.parametrize('model_name,gpu_spec', [ + ('deepseek-ai/DeepSeek-R1-Distill-Llama-8B', 'L4:1'), + ('deepseek-ai/DeepSeek-R1-Distill-Llama-70B', 'A100-80GB:2'), +]) +def test_deepseek_r1_vllm(generic_cloud: str, model_name: str, gpu_spec: str): + name = smoke_tests_utils.get_cluster_name() + + payload = { + "model": model_name, + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Who are you?" 
+ }, + ], + } + json_payload = json.dumps(payload) + + test = smoke_tests_utils.Test( + 'deepseek_r1_vllm', + [ + f'sky launch -y -d -c {name} --infra {generic_cloud} --env MODEL_NAME={model_name} --gpus {gpu_spec} llm/deepseek-r1-distilled/deepseek-r1-vllm.yaml', + smoke_tests_utils.get_cmd_wait_until_cluster_status_contains( + cluster_name=name, + cluster_status=[sky.ClusterStatus.UP], + timeout=300), + # Disable SKYPILOT_DEBUG while retrieving the IP to avoid debug logs + # contaminating the output of `sky status --ip`, which would break curl. + # Use `tail -n 1` to ensure only the pure IP/hostname is captured. + ( + f'ORIGIN_SKYPILOT_DEBUG=$SKYPILOT_DEBUG; export SKYPILOT_DEBUG=0; ' + f'ENDPOINT=$(sky status --ip {name} | tail -n 1); ' + f'export SKYPILOT_DEBUG=$ORIGIN_SKYPILOT_DEBUG; ' + # Wait up to 30 minutes (timeout=1800 seconds) for the model server to be ready + f'start_time=$SECONDS; timeout=1800; s=""; ' + f'while true; do ' + f' resp=$(curl -sS --max-time 15 http://$ENDPOINT:8000/v1/chat/completions ' + f' -H "Content-Type: application/json" -d \'{json_payload}\' || true); ' + f' if echo "$resp" | jq -e ".choices[0].message.content" > /dev/null 2>&1; then ' + f' s="$resp"; break; fi; ' + f' if (( SECONDS - start_time > timeout )); then ' + f' echo "Timeout after $timeout seconds waiting for model server readiness"; echo "$resp"; exit 1; fi; ' + f' echo "Waiting for model server to be ready..."; sleep 10; ' + f'done; ' + f'echo "$s" | jq .; ' + f'content=$(echo "$s" | jq -r ".choices[0].message.content"); ' + f'echo "$content"; ' + # Accept either opening or closing think tag, or explicit self-identification. NOTE(review): the first two grep patterns below are empty strings, which match any input, so this check always passes; the think-tag literals appear to have been lost and should be restored. + f'(echo "$content" | grep -qi "" || ' + f' echo "$content" | grep -qi "" || ' + f' echo "$content" | grep -qi "I\'m DeepSeek-R1") || ' + f'(echo "Expected tag or model self-identification not found" && exit 1)' + ), + ], + f'sky down -y {name}', + ) + smoke_tests_utils.run_one_test(test) + + +@pytest.mark.gcp +def test_sglang_llava_serving(generic_cloud: str): + name = 
smoke_tests_utils.get_cluster_name() + + payload = { + "model": "liuhaotian/llava-v1.6-vicuna-7b", + "messages": [{ + "role": "user", + "content": [{ + "type": "text", + "text": "Describe this image" + }, { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/examples/frontend_language/quick_start/images/cat.jpeg" + } + }] + }], + } + json_payload = json.dumps(payload) + + test = smoke_tests_utils.Test( + 'sglang_llava', + [ + f'sky serve up -n {name} --infra {generic_cloud} --gpus L4:1 -y llm/sglang/llava.yaml', + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + (f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' + f's=$(curl -sS $endpoint/v1/chat/completions -H "Content-Type: application/json" -d \'{json_payload}\'); ' + f'echo "$s" | jq .; ' + f'content=$(echo "$s" | jq -r ".choices[0].message.content"); ' + f'echo "$content"; ' + f'echo "$content" | grep -E ".+"'), + ], + TEARDOWN_SERVICE.format(name=name), + env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV, + timeout=40 * 60, + ) + smoke_tests_utils.run_one_test(test) diff --git a/tests/smoke_tests/test_sky_serve.py b/tests/smoke_tests/test_sky_serve.py index b6facebf3c..5a77652b91 100644 --- a/tests/smoke_tests/test_sky_serve.py +++ b/tests/smoke_tests/test_sky_serve.py @@ -59,7 +59,7 @@ def _get_service_name() -> str: # failure detected. In the end we sleep for # serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS to make sure load balancer have # enough time to sync with the controller and get all ready replica IPs. -_SERVE_WAIT_UNTIL_READY = ( +SERVE_WAIT_UNTIL_READY = ( '{{ while true; do' ' s=$(sky serve status {name}); echo "$s";' ' echo "$s" | grep -q "{replica_num}/{replica_num}" && break;' @@ -84,7 +84,7 @@ def _get_service_name() -> str: # controller is UP before we can terminate the service. # The teardown command has a 10-mins timeout, so we don't need to do # the timeout here. See implementation of run_one_test() for details. 
-_TEARDOWN_SERVICE = _SHOW_SERVE_STATUS + ( +TEARDOWN_SERVICE = _SHOW_SERVE_STATUS + ( '(for i in `seq 1 20`; do' ' s=$(sky serve down -y {name});' ' echo "Trying to terminate {name}";' @@ -111,7 +111,7 @@ def _get_service_name() -> str: ' sleep 10; ' 'done)') -_SERVE_ENDPOINT_WAIT = ( +SERVE_ENDPOINT_WAIT = ( 'export ORIGIN_SKYPILOT_DEBUG=$SKYPILOT_DEBUG; export SKYPILOT_DEBUG=0; ' 'endpoint=$(sky serve status --endpoint {name}); ' 'until ! echo "$endpoint" | grep -qE "Controller is initializing|^-$"; ' @@ -199,11 +199,11 @@ def _get_skyserve_http_test(name: str, cloud: str, f'test-skyserve-{cloud.replace("_", "-")}', [ f'sky serve up -n {name} -y {smoke_tests_utils.LOW_RESOURCE_ARG} tests/skyserve/http/{cloud}.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'curl $endpoint | grep "Hi, SkyPilot here"', ], - _TEARDOWN_SERVICE.format(name=name), + TEARDOWN_SERVICE.format(name=name), env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV, timeout=timeout_minutes * 60, ) @@ -356,7 +356,7 @@ def generate_llm_test_command(prompt: str, expected_output: str) -> str: prompt = shlex.quote(prompt) expected_output = shlex.quote(expected_output) return ( - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 's=$(python tests/skyserve/llm/get_response.py --endpoint $endpoint ' f'--prompt {prompt} --auth_token {auth_token}); ' 'echo "$s"; ' @@ -370,13 +370,13 @@ def generate_llm_test_command(prompt: str, expected_output: str) -> str: 'test-skyserve-llm', [ f'sky serve up -n {name} --infra {generic_cloud} --gpus {accelerator} -y --secret AUTH_TOKEN={auth_token} tests/skyserve/llm/service.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), *[ generate_llm_test_command(prompt, output) for prompt, output 
in prompt2output.items() ], ], - _TEARDOWN_SERVICE.format(name=name), + TEARDOWN_SERVICE.format(name=name), env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV, timeout=40 * 60, ) @@ -394,17 +394,17 @@ def test_skyserve_spot_recovery(): [ smoke_tests_utils.launch_cluster_for_cloud_cmd('gcp', name), f'sky serve up -n {name} {smoke_tests_utils.LOW_RESOURCE_ARG} -y tests/skyserve/spot/recovery.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'request_output=$(curl $endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', smoke_tests_utils.run_cloud_cmd_on_cluster( name, smoke_tests_utils.terminate_gcp_replica(name, zone, 1)), - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'request_output=$(curl $endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', ], f'{smoke_tests_utils.down_cluster_for_cloud_cmd(name)}; ' - f'{_TEARDOWN_SERVICE.format(name=name)}', + f'{TEARDOWN_SERVICE.format(name=name)}', env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV, timeout=20 * 60, ) @@ -425,11 +425,11 @@ def test_skyserve_base_ondemand_fallback(generic_cloud: str): 'test-skyserve-base-ondemand-fallback', [ f'sky serve up -n {name} --infra {generic_cloud} {smoke_tests_utils.LOW_RESOURCE_ARG} -y tests/skyserve/spot/base_ondemand_fallback.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), _check_replica_in_status(name, [(1, True, 'READY'), (1, False, 'READY')]), ], - _TEARDOWN_SERVICE.format(name=name), + TEARDOWN_SERVICE.format(name=name), env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV, timeout=20 * 60, ) @@ -466,7 +466,7 
@@ def test_skyserve_dynamic_ondemand_fallback(): f'[ "$count" -eq 1 ] || [ "$count" -eq 2 ] || {{ echo "Expected 1 or 2 instances, got $count"; exit 1; }}', # Wait until 2 spot instances are ready. - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), _check_replica_in_status(name, [(2, True, 'READY'), (0, False, '')]), smoke_tests_utils.run_cloud_cmd_on_cluster( @@ -486,12 +486,12 @@ def test_skyserve_dynamic_ondemand_fallback(): ]), # Wait until 2 spot instances are ready. - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), _check_replica_in_status(name, [(2, True, 'READY'), (0, False, '')]), ], f'{smoke_tests_utils.down_cluster_for_cloud_cmd(name)}; ' - f'{_TEARDOWN_SERVICE.format(name=name)}', + f'{TEARDOWN_SERVICE.format(name=name)}', env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV, timeout=20 * 60, ) @@ -534,12 +534,12 @@ def test_skyserve_user_bug_restart(generic_cloud: str): increase_initial_delay_seconds( f'sky serve update {name} --infra {generic_cloud} {resource_arg} -y tests/skyserve/auto_restart.yaml' ), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'until curl --connect-timeout 10 --max-time 10 $endpoint | grep "Hi, SkyPilot here"; do sleep 1; done; sleep 2; ' + _check_replica_in_status(name, [(1, False, 'READY'), (1, False, 'FAILED')]), ], - _TEARDOWN_SERVICE.format(name=name), + TEARDOWN_SERVICE.format(name=name), env=env, timeout=20 * 60, ) @@ -557,15 +557,15 @@ def test_skyserve_load_balancer(generic_cloud: str): 'test-skyserve-load-balancer', [ f'sky serve up -n {name} --infra {generic_cloud} {smoke_tests_utils.LOW_RESOURCE_ARG} -y tests/skyserve/load_balancer/service.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3), + 
f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' f'{_SERVE_STATUS_WAIT.format(name=name)}; ' f'{_get_replica_ip(name, 1)}; ' f'{_get_replica_ip(name, 2)}; {_get_replica_ip(name, 3)}; ' 'python tests/skyserve/load_balancer/test_round_robin.py ' '--endpoint $endpoint --replica-num 3 --replica-ips $ip1 $ip2 $ip3', ], - _TEARDOWN_SERVICE.format(name=name), + TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV, ) @@ -586,8 +586,8 @@ def test_skyserve_auto_restart(): # TODO(tian): we can dynamically generate YAML from template to # avoid maintaining too many YAML files f'sky serve up -n {name} -y {smoke_tests_utils.LOW_RESOURCE_ARG} tests/skyserve/auto_restart.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'request_output=$(curl $endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', # sleep for 20 seconds (initial delay) to make sure it will # be restarted @@ -601,19 +601,19 @@ def test_skyserve_auto_restart(): # queries takes a lot of time). Instead, we think continuous 3 min probe # failure is not a temporary problem but indeed a failure. 'sleep 180', - # We cannot use _SERVE_WAIT_UNTIL_READY; there will be a intermediate time + # We cannot use SERVE_WAIT_UNTIL_READY; there will be a intermediate time # that the output of `sky serve status` shows FAILED and this status will - # cause _SERVE_WAIT_UNTIL_READY to early quit. + # cause SERVE_WAIT_UNTIL_READY to early quit. 
'(while true; do' f' output=$(sky serve status {name});' ' echo "$output" | grep -q "1/1" && break;' ' sleep 10;' f'done); sleep {serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS};', - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'request_output=$(curl $endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', ], f'{smoke_tests_utils.down_cluster_for_cloud_cmd(name)}; ' - f'{_TEARDOWN_SERVICE.format(name=name)}', + f'{TEARDOWN_SERVICE.format(name=name)}', env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV, timeout=20 * 60, ) @@ -632,8 +632,8 @@ def test_skyserve_cancel(generic_cloud: str): 'test-skyserve-cancel', [ f'sky serve up -n {name} --infra {generic_cloud} {smoke_tests_utils.LOW_RESOURCE_ARG} -y tests/skyserve/cancel/cancel.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; python3 ' + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; python3 ' 'tests/skyserve/cancel/send_cancel_request.py ' '--endpoint $endpoint | grep "Request was cancelled"', f's=$(sky serve logs {name} 1 --no-follow); ' @@ -642,7 +642,7 @@ def test_skyserve_cancel(generic_cloud: str): f's=$(sky serve logs {name} 1 --no-follow); done; ' 'echo "$s"; echo "$s" | grep "Client disconnected, stopping computation"', ], - _TEARDOWN_SERVICE.format(name=name), + TEARDOWN_SERVICE.format(name=name), env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV, timeout=20 * 60, ) @@ -662,12 +662,12 @@ def test_skyserve_streaming(generic_cloud: str): 'test-skyserve-streaming', [ f'sky serve up -n {name} --infra {generic_cloud} {resource_arg} -y tests/skyserve/streaming/streaming.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'python3 
tests/skyserve/streaming/send_streaming_request.py ' '--endpoint $endpoint | grep "Streaming test passed"', ], - _TEARDOWN_SERVICE.format(name=name), + TEARDOWN_SERVICE.format(name=name), env=env, timeout=20 * 60, ) @@ -693,7 +693,7 @@ def test_skyserve_readiness_timeout_fail(generic_cloud: str): 'sleep 60', f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s" | grep "{name}" | grep "FAILED_INITIAL_DELAY" | wc -l | grep 1;' ], - _TEARDOWN_SERVICE.format(name=name), + TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV, ) @@ -711,11 +711,11 @@ def test_skyserve_large_readiness_timeout(generic_cloud: str): 'test-skyserve-large-readiness-timeout', [ f'sky serve up -n {name} --infra {generic_cloud} {smoke_tests_utils.LOW_RESOURCE_ARG} -y tests/skyserve/readiness_timeout/task_large_timeout.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'request_output=$(curl $endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"', ], - _TEARDOWN_SERVICE.format(name=name), + TEARDOWN_SERVICE.format(name=name), env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV, timeout=20 * 60, ) @@ -740,12 +740,12 @@ def test_skyserve_update(generic_cloud: str): 'test-skyserve-update', [ f'sky serve up -n {name} --infra {generic_cloud} {resource_arg} -y tests/skyserve/update/old.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl $endpoint | grep "Hi, SkyPilot here"', + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; curl $endpoint | grep "Hi, SkyPilot here"', f'sky serve update {name} --infra {generic_cloud} {resource_arg} --mode blue_green -y tests/skyserve/update/new.yaml', # sleep before update is registered. 
'sleep 20', - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'until curl $endpoint | grep "Hi, new SkyPilot here!"; do sleep 2; done;' # Make sure the traffic is not mixed 'curl $endpoint | grep "Hi, new SkyPilot here"', @@ -755,7 +755,7 @@ def test_skyserve_update(generic_cloud: str): timeout_seconds=replica_check_timeout_seconds) + _check_service_version(name, "2")), ], - _TEARDOWN_SERVICE.format(name=name), + TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, env=env, ) @@ -788,8 +788,8 @@ def test_skyserve_rolling_update(generic_cloud: str): increase_initial_delay_seconds( f'sky serve up -n {name} --infra {generic_cloud} {resource_arg} -y tests/skyserve/update/old.yaml' ), - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl $endpoint | grep "Hi, SkyPilot here"', + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; curl $endpoint | grep "Hi, SkyPilot here"', increase_initial_delay_seconds( f'sky serve update {name} --infra {generic_cloud} {resource_arg} -y tests/skyserve/update/new.yaml' ), @@ -797,7 +797,7 @@ def test_skyserve_rolling_update(generic_cloud: str): # with even id will sleep 120 seconds before being ready, so we # should be able to get observe the period that the traffic is mixed # across two versions. 
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'until curl $endpoint | grep "Hi, new SkyPilot here!"; do sleep 2; done; sleep 2; ' # The latest version should have one READY and the one of the older versions should be shutting down f'{single_new_replica} {_check_service_version(name, "1,2")} ' @@ -820,7 +820,7 @@ def test_skyserve_rolling_update(generic_cloud: str): ' echo "$result2" | grep "Hi, SkyPilot here" || exit 1; ' 'fi', ], - _TEARDOWN_SERVICE.format(name=name), + TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, env=env, ) @@ -840,8 +840,8 @@ def test_skyserve_fast_update(generic_cloud: str): 'test-skyserve-fast-update', [ f'sky serve up -n {name} -y {smoke_tests_utils.LOW_RESOURCE_ARG} --infra {generic_cloud} tests/skyserve/update/bump_version_before.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl $endpoint | grep "Hi, SkyPilot here"', + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2), + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; curl $endpoint | grep "Hi, SkyPilot here"', f'sky serve update {name} --infra {generic_cloud} {smoke_tests_utils.LOW_RESOURCE_ARG} --mode blue_green -y tests/skyserve/update/bump_version_after.yaml', # sleep to wait for update to be registered. 'sleep 40', @@ -852,9 +852,9 @@ def test_skyserve_fast_update(generic_cloud: str): (1, False, _SERVICE_LAUNCHING_STATUS_REGEX)]) + # Fast update will directly have the latest version ready. 
_check_service_version(name, "2")), - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3) + + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3) + _check_service_version(name, "2"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl $endpoint | grep "Hi, SkyPilot here"', + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; curl $endpoint | grep "Hi, SkyPilot here"', # Test rolling update f'sky serve update {name} --infra {generic_cloud} {smoke_tests_utils.LOW_RESOURCE_ARG} -y tests/skyserve/update/bump_version_before.yaml', # sleep to wait for update to be registered. @@ -862,11 +862,11 @@ def test_skyserve_fast_update(generic_cloud: str): # 2 on-demand (ready) + 1 on-demand (shutting down). _check_replica_in_status(name, [(2, False, 'READY'), (1, False, 'SHUTTING_DOWN')]), - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + _check_service_version(name, "3"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl $endpoint | grep "Hi, SkyPilot here"', + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; curl $endpoint | grep "Hi, SkyPilot here"', ], - _TEARDOWN_SERVICE.format(name=name), + TEARDOWN_SERVICE.format(name=name), timeout=30 * 60, env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV, ) @@ -890,9 +890,9 @@ def test_skyserve_update_autoscale(generic_cloud: str): increase_initial_delay_seconds( f'sky serve up -n {name} --infra {generic_cloud} {resource_arg} -y tests/skyserve/update/num_min_two.yaml' ), - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + _check_service_version(name, "1"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'curl $endpoint | grep "Hi, SkyPilot here"', increase_initial_delay_seconds( f'sky serve update {name} --infra {generic_cloud} {resource_arg} --mode blue_green -y tests/skyserve/update/num_min_one.yaml' @@ -900,9 +900,9 @@ def 
test_skyserve_update_autoscale(generic_cloud: str): # sleep before update is registered. 'sleep 20', # Timeout will be triggered when update fails. - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1) + + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1) + _check_service_version(name, "2"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'curl $endpoint | grep "Hi, SkyPilot here!"', # Rolling Update increase_initial_delay_seconds( @@ -911,12 +911,12 @@ def test_skyserve_update_autoscale(generic_cloud: str): # sleep before update is registered. 'sleep 20', # Timeout will be triggered when update fails. - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + _check_service_version(name, "3"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'curl $endpoint | grep "Hi, SkyPilot here!"', ], - _TEARDOWN_SERVICE.format(name=name), + TEARDOWN_SERVICE.format(name=name), timeout=30 * 60, env=env, ) @@ -973,9 +973,9 @@ def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str): f'test-skyserve-new-autoscaler-update-{mode}', [ f'sky serve up -n {name} --infra {generic_cloud} {smoke_tests_utils.LOW_RESOURCE_ARG} -y tests/skyserve/update/new_autoscaler_before.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) + _check_service_version(name, "1"), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 's=$(curl $endpoint); echo "$s"; echo "$s" | grep "Hi, SkyPilot here"', f'sky serve update {name} --infra {generic_cloud} {smoke_tests_utils.LOW_RESOURCE_ARG} --mode {mode} -y tests/skyserve/update/new_autoscaler_after.yaml', # Wait for update to be registered @@ -987,13 +987,13 @@ def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str): (2, False, 
TWO_OLD_ON_DEMAND_INSTANCES_STATUS_AFTER_AUTOSCALE) ]), *update_check, - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=5), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=5), + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'curl $endpoint | grep "Hi, SkyPilot here"', _check_replica_in_status(name, [(4, True, 'READY'), (1, False, 'READY')]), ], - _TEARDOWN_SERVICE.format(name=name), + TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV, ) @@ -1054,7 +1054,7 @@ def test_skyserve_failures(generic_cloud: str): ]), # TODO(zhwu): add test for FAILED_PROVISION ], - _TEARDOWN_SERVICE.format(name=name), + TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, env=env, ) @@ -1080,21 +1080,21 @@ def test_skyserve_https(generic_cloud: str): [ f'sky serve up -n {name} {smoke_tests_utils.LOW_RESOURCE_ARG} --infra {generic_cloud} -y tests/skyserve/https/service.yaml ' f'--env TLS_KEYFILE_ENV_VAR={keyfile} --secret TLS_CERTFILE_ENV_VAR={certfile}', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'output=$(curl $endpoint -k); echo $output; ' 'echo $output | grep "Hi, SkyPilot here"', # Self signed certificate should fail without -k. - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'output=$(curl $endpoint 2>&1); echo $output; ' 'echo $output | grep -E "self[ -]signed certificate"', # curl with wrong schema (http) should fail. 
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'http_endpoint="${endpoint/https:/http:}"; ' 'output=$(curl $http_endpoint 2>&1); echo $output; ' 'echo $output | grep "Empty reply from server"', ], - _TEARDOWN_SERVICE.format(name=name) + f'; rm -f {keyfile}', + TEARDOWN_SERVICE.format(name=name) + f'; rm -f {keyfile}', timeout=20 * 60, env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV, ) @@ -1111,15 +1111,15 @@ def test_skyserve_multi_ports(generic_cloud: str): 'test-skyserve-multi-ports', [ f'sky serve up -n {name} --infra {generic_cloud} {smoke_tests_utils.LOW_RESOURCE_ARG} -y tests/skyserve/multi_ports.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'curl $replica_endpoint | grep "Hi, SkyPilot here"; ' f'export replica_endpoint=$(sky serve status {name} | tail -n 1 | awk \'{{print $4}}\'); ' 'export replica_endpoint_alt=$(echo $endpoint | sed "s/8080/8081/"); ' 'curl $replica_endpoint | grep "Hi, SkyPilot here"; ' 'curl $replica_endpoint_alt | grep "Hi, SkyPilot here"', ], - _TEARDOWN_SERVICE.format(name=name), + TEARDOWN_SERVICE.format(name=name), timeout=20 * 60, env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV, ) @@ -1172,20 +1172,20 @@ def test_skyserve_ha_kill_after_ready(): smoke_tests_utils.launch_cluster_for_cloud_cmd('kubernetes', name), # Launch service and wait for ready f'sky serve up -n {name} -y tests/skyserve/high_availability/service.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), _check_replica_in_status(name, [(1, False, 'READY')]), # Verify service is accessible - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'curl $endpoint | grep "Hi, SkyPilot here"', # Kill controller and verify recovery 
smoke_tests_utils.kill_and_wait_controller(name, 'serve'), # Verify service remains accessible after controller recovery - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), _check_replica_in_status(name, [(1, False, 'READY')]), - # f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + # f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' # 'curl $endpoint | grep "Hi, SkyPilot here"', ], - _TEARDOWN_SERVICE.format(name=name), + TEARDOWN_SERVICE.format(name=name), timeout=30 * 60, env={ skypilot_config.ENV_VAR_GLOBAL_CONFIG: 'tests/skyserve/high_availability/config.yaml' @@ -1219,9 +1219,9 @@ def test_skyserve_ha_kill_during_provision(): # Kill controller during provisioning smoke_tests_utils.kill_and_wait_controller(name, 'serve'), # Verify service eventually becomes ready - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), _check_replica_in_status(name, [(1, False, 'READY')]), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'curl $endpoint | grep "Hi, SkyPilot here"', # Check there is only one cluster f'instance_names=$(gcloud compute instances list --filter="name~{name}" --format="value(name)"); ' @@ -1229,7 +1229,7 @@ def test_skyserve_ha_kill_during_provision(): 'num_instances=$(echo "$instance_names" | wc -l); ' '[ "$num_instances" -eq "1" ] || (echo "Expected 1 instance, got $num_instances"; exit 1)', ], - _TEARDOWN_SERVICE.format(name=name), + TEARDOWN_SERVICE.format(name=name), timeout=30 * 60, env={ skypilot_config.ENV_VAR_GLOBAL_CONFIG: 'tests/skyserve/high_availability/config.yaml' @@ -1260,9 +1260,9 @@ def test_skyserve_ha_kill_during_pending(): # Kill controller during pending smoke_tests_utils.kill_and_wait_controller(name, 'serve'), # Verify service eventually becomes ready and accessible - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + 
SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), _check_replica_in_status(name, [(1, False, 'READY')]), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'curl $endpoint | grep "Hi, SkyPilot here"', # Check there are one cluster f'instance_names=$(gcloud compute instances list --filter="(labels.ray-cluster-name:{replica_cluster_name})" --format="value(name)"); ' @@ -1270,7 +1270,7 @@ def test_skyserve_ha_kill_during_pending(): 'num_instances=$(echo "$instance_names" | wc -l); ' '[ "$num_instances" -eq "1" ] || (echo "Expected 1 instance, got $num_instances"; exit 1)', ], - _TEARDOWN_SERVICE.format(name=name), + TEARDOWN_SERVICE.format(name=name), timeout=30 * 60, env={ skypilot_config.ENV_VAR_GLOBAL_CONFIG: 'tests/skyserve/high_availability/config.yaml' @@ -1296,8 +1296,8 @@ def test_skyserve_ha_kill_during_shutdown(): smoke_tests_utils.launch_cluster_for_cloud_cmd('kubernetes', name), # Launch service and wait for ready f'sky serve up -n {name} -y tests/skyserve/high_availability/service.yaml', - _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), - f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; ' + SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1), + f'{SERVE_ENDPOINT_WAIT.format(name=name)}; ' 'curl $endpoint | grep "Hi, SkyPilot here"', # Record instance names and initiate shutdown of the replica f'instance_names=$(gcloud compute instances list --filter="(labels.ray-cluster-name:{replica_cluster_name})" --format="value(name)"); ' @@ -1325,7 +1325,7 @@ def test_skyserve_ha_kill_during_shutdown(): ' echo "Waiting for instances to terminate..."; sleep 5; ' 'done', ], - _TEARDOWN_SERVICE.format(name=name), + TEARDOWN_SERVICE.format(name=name), timeout=30 * 60, env={ skypilot_config.ENV_VAR_GLOBAL_CONFIG: 'tests/skyserve/high_availability/config.yaml' diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 6d79bb1356..80f5a8905d 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -30,6 
+30,7 @@ from smoke_tests.test_basic import * from smoke_tests.test_cli import * from smoke_tests.test_cluster_job import * +from smoke_tests.test_examples import * from smoke_tests.test_images import * from smoke_tests.test_logs import * from smoke_tests.test_managed_job import * @@ -38,4 +39,4 @@ from smoke_tests.test_region_and_zone import * from smoke_tests.test_sky_serve import * from smoke_tests.test_ssm import * -from smoke_tests.test_workspaces import * \ No newline at end of file +from smoke_tests.test_workspaces import *