diff --git a/sky/catalog/gcp_catalog.py b/sky/catalog/gcp_catalog.py
index b4e6f0f4ee..38da01c332 100644
--- a/sky/catalog/gcp_catalog.py
+++ b/sky/catalog/gcp_catalog.py
@@ -685,6 +685,13 @@ def get_image_id_from_tag(tag: str, region: Optional[str]) -> Optional[str]:
_image_df = common.read_catalog('gcp/images.csv',
pull_frequency_hours=0)
image_id = common.get_image_id_from_tag_impl(_image_df, tag, region)
+ # TODO: Remove these debug lines after the catalog PR is merged.
+ if tag == 'skypilot:custom-cpu-ubuntu-2204-250923':
+ image_id = ('projects/sky-dev-465/global/images/'
+ 'skypilot-gcp-cpu-ubuntu-250923')
+ elif tag == 'skypilot:custom-gpu-ubuntu-2204-250923':
+ image_id = ('projects/sky-dev-465/global/images/'
+ 'skypilot-gcp-gpu-ubuntu-250923')
return image_id
diff --git a/sky/catalog/images/provisioners/cuda.sh b/sky/catalog/images/provisioners/cuda.sh
index 9508b851c6..3affbf2d0e 100644
--- a/sky/catalog/images/provisioners/cuda.sh
+++ b/sky/catalog/images/provisioners/cuda.sh
@@ -17,6 +17,17 @@ else
ARCH_PATH="x86_64"
fi
+# Install GCC 12 and set as default compiler
+# This is required because newer Ubuntu 22.04 kernels (6.5.0+ and 6.8.0+) are built with GCC 12,
+# but Ubuntu 22.04 LTS defaults to GCC 11. Without GCC 12, NVIDIA DKMS driver compilation
+# will fail with error: "unrecognized command-line option '-ftrivial-auto-var-init=zero'"
+# This flag was introduced in GCC 12 and is not recognized by GCC 11.
+echo "Installing GCC 12 to match kernel compiler version..."
+sudo apt-get update
+sudo apt-get install -y gcc-12 g++-12
+sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 100
+sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 100
+
# Download architecture-specific CUDA keyring package
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH_PATH}/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py
index a6d308920d..82c802198f 100644
--- a/sky/clouds/gcp.py
+++ b/sky/clouds/gcp.py
@@ -111,9 +111,9 @@
)
# Image ID tags
-_DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu-2204'
+_DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu-2204-250923'
# For GPU-related package version, see sky/clouds/catalog/images/provisioners/cuda.sh
-_DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-2204'
+_DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-2204-250923'
_DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-debian-10'
# Use COS image with GPU Direct support.
# Need to contact GCP support to build our own image for GPUDirect-TCPX support.
diff --git a/tests/smoke_tests/test_basic.py b/tests/smoke_tests/test_basic.py
index ff425ec982..307c9e5dc0 100644
--- a/tests/smoke_tests/test_basic.py
+++ b/tests/smoke_tests/test_basic.py
@@ -23,7 +23,6 @@
import os
import pathlib
import subprocess
-import sys
import tempfile
import textwrap
import time
diff --git a/tests/smoke_tests/test_examples.py b/tests/smoke_tests/test_examples.py
new file mode 100644
index 0000000000..b2b3cd6313
--- /dev/null
+++ b/tests/smoke_tests/test_examples.py
@@ -0,0 +1,135 @@
+# Smoke tests for SkyPilot examples
+# Default options are set in pyproject.toml
+# Example usage:
+# Run all tests except for AWS and Lambda Cloud
+# > pytest tests/smoke_tests/test_examples.py
+#
+# Terminate failed clusters after test finishes
+# > pytest tests/smoke_tests/test_examples.py --terminate-on-failure
+#
+# Re-run last failed tests
+# > pytest --lf
+#
+# Run one of the smoke tests
+# > pytest tests/smoke_tests/test_examples.py::test_deepseek_r1_vllm
+#
+# Only run test for AWS + generic tests
+# > pytest tests/smoke_tests/test_examples.py --aws
+#
+# Change cloud for generic tests to aws
+# > pytest tests/smoke_tests/test_examples.py --generic-cloud aws
+
+import json
+
+import pytest
+from smoke_tests import smoke_tests_utils
+# TODO(zeping): move them to smoke_tests_utils
+from smoke_tests.test_sky_serve import SERVE_ENDPOINT_WAIT
+from smoke_tests.test_sky_serve import SERVE_WAIT_UNTIL_READY
+from smoke_tests.test_sky_serve import TEARDOWN_SERVICE
+
+import sky
+
+
+@pytest.mark.gcp
+@pytest.mark.parametrize('model_name,gpu_spec', [
+    ('deepseek-ai/DeepSeek-R1-Distill-Llama-8B', 'L4:1'),
+    ('deepseek-ai/DeepSeek-R1-Distill-Llama-70B', 'A100-80GB:2'),
+])
+def test_deepseek_r1_vllm(generic_cloud: str, model_name: str, gpu_spec: str):
+    """Launch the DeepSeek-R1 distilled vLLM example and query it once.
+
+    Polls the chat completions endpoint on port 8000 until a completion is
+    returned, then checks the reply for a <think>/</think> tag or the model's
+    self-identification. `sky down` is passed as the teardown command.
+    """
+    name = smoke_tests_utils.get_cluster_name()
+
+    payload = {
+        "model": model_name,
+        "messages": [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Who are you?"},
+        ],
+    }
+    json_payload = json.dumps(payload)
+
+    test = smoke_tests_utils.Test(
+        'deepseek_r1_vllm',
+        [
+            f'sky launch -y -d -c {name} --infra {generic_cloud} --env MODEL_NAME={model_name} --gpus {gpu_spec} llm/deepseek-r1-distilled/deepseek-r1-vllm.yaml',
+            smoke_tests_utils.get_cmd_wait_until_cluster_status_contains(
+                cluster_name=name,
+                cluster_status=[sky.ClusterStatus.UP],
+                timeout=300),
+            # Disable SKYPILOT_DEBUG while retrieving the IP to avoid debug logs
+            # contaminating the output of `sky status --ip`, which would break curl.
+            # Use `tail -n 1` to ensure only the pure IP/hostname is captured.
+            (
+                f'ORIGIN_SKYPILOT_DEBUG=$SKYPILOT_DEBUG; export SKYPILOT_DEBUG=0; '
+                f'ENDPOINT=$(sky status --ip {name} | tail -n 1); '
+                f'export SKYPILOT_DEBUG=$ORIGIN_SKYPILOT_DEBUG; '
+                # Wait up to 30 minutes (timeout=1800s) for the model server to be ready
+                f'start_time=$SECONDS; timeout=1800; s=""; '
+                f'while true; do '
+                f' resp=$(curl -sS --max-time 15 http://$ENDPOINT:8000/v1/chat/completions '
+                f' -H "Content-Type: application/json" -d \'{json_payload}\' || true); '
+                f' if echo "$resp" | jq -e ".choices[0].message.content" > /dev/null 2>&1; then '
+                f' s="$resp"; break; fi; '
+                f' if (( SECONDS - start_time > timeout )); then '
+                f' echo "Timeout after $timeout seconds waiting for model server readiness"; echo "$resp"; exit 1; fi; '
+                f' echo "Waiting for model server to be ready..."; sleep 10; '
+                f'done; '
+                f'echo "$s" | jq .; '
+                f'content=$(echo "$s" | jq -r ".choices[0].message.content"); '
+                f'echo "$content"; '
+                # Accept either opening or closing think tag, or explicit self-identification
+                f'(echo "$content" | grep -qi "<think>" || '
+                f' echo "$content" | grep -qi "</think>" || '
+                f' echo "$content" | grep -qi "I\'m DeepSeek-R1") || '
+                f'(echo "Expected <think> tag or model self-identification not found" && exit 1)'
+            ),
+        ],
+        f'sky down -y {name}',
+    )
+    smoke_tests_utils.run_one_test(test)
+
+
+@pytest.mark.gcp
+def test_sglang_llava_serving(generic_cloud: str):
+    """Serve the SGLang LLaVA example and send one image-description request.
+
+    Passes only if the chat completion contains non-empty, non-null content.
+    """
+    name = smoke_tests_utils.get_cluster_name()
+
+    payload = {
+        "model": "liuhaotian/llava-v1.6-vicuna-7b",
+        "messages": [{
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Describe this image"},
+                {"type": "image_url", "image_url": {"url": "https://raw.githubusercontent.com/sgl-project/sglang/main/examples/frontend_language/quick_start/images/cat.jpeg"}},
+            ],
+        }],
+    }
+    json_payload = json.dumps(payload)
+
+    test = smoke_tests_utils.Test(
+        'sglang_llava',
+        [
+            f'sky serve up -n {name} --infra {generic_cloud} --gpus L4:1 -y llm/sglang/llava.yaml',
+            SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2),
+            (f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
+             f's=$(curl -sS $endpoint/v1/chat/completions -H "Content-Type: application/json" -d \'{json_payload}\'); '
+             f'echo "$s" | jq .; '
+             f'content=$(echo "$s" | jq -r ".choices[0].message.content"); '
+             f'echo "$content"; '
+             # jq -r prints the literal "null" when the field is missing; reject it.
+             f'[ -n "$content" ] && [ "$content" != "null" ]'),
+        ],
+        TEARDOWN_SERVICE.format(name=name),
+        env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV,
+        timeout=40 * 60,
+    )
+    smoke_tests_utils.run_one_test(test)
diff --git a/tests/smoke_tests/test_sky_serve.py b/tests/smoke_tests/test_sky_serve.py
index b6facebf3c..5a77652b91 100644
--- a/tests/smoke_tests/test_sky_serve.py
+++ b/tests/smoke_tests/test_sky_serve.py
@@ -59,7 +59,7 @@ def _get_service_name() -> str:
# failure detected. In the end we sleep for
# serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS to make sure load balancer have
# enough time to sync with the controller and get all ready replica IPs.
-_SERVE_WAIT_UNTIL_READY = (
+SERVE_WAIT_UNTIL_READY = (
'{{ while true; do'
' s=$(sky serve status {name}); echo "$s";'
' echo "$s" | grep -q "{replica_num}/{replica_num}" && break;'
@@ -84,7 +84,7 @@ def _get_service_name() -> str:
# controller is UP before we can terminate the service.
# The teardown command has a 10-mins timeout, so we don't need to do
# the timeout here. See implementation of run_one_test() for details.
-_TEARDOWN_SERVICE = _SHOW_SERVE_STATUS + (
+TEARDOWN_SERVICE = _SHOW_SERVE_STATUS + (
'(for i in `seq 1 20`; do'
' s=$(sky serve down -y {name});'
' echo "Trying to terminate {name}";'
@@ -111,7 +111,7 @@ def _get_service_name() -> str:
' sleep 10; '
'done)')
-_SERVE_ENDPOINT_WAIT = (
+SERVE_ENDPOINT_WAIT = (
'export ORIGIN_SKYPILOT_DEBUG=$SKYPILOT_DEBUG; export SKYPILOT_DEBUG=0; '
'endpoint=$(sky serve status --endpoint {name}); '
'until ! echo "$endpoint" | grep -qE "Controller is initializing|^-$"; '
@@ -199,11 +199,11 @@ def _get_skyserve_http_test(name: str, cloud: str,
f'test-skyserve-{cloud.replace("_", "-")}',
[
f'sky serve up -n {name} -y {smoke_tests_utils.LOW_RESOURCE_ARG} tests/skyserve/http/{cloud}.yaml',
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2),
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
'curl $endpoint | grep "Hi, SkyPilot here"',
],
- _TEARDOWN_SERVICE.format(name=name),
+ TEARDOWN_SERVICE.format(name=name),
env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV,
timeout=timeout_minutes * 60,
)
@@ -356,7 +356,7 @@ def generate_llm_test_command(prompt: str, expected_output: str) -> str:
prompt = shlex.quote(prompt)
expected_output = shlex.quote(expected_output)
return (
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
's=$(python tests/skyserve/llm/get_response.py --endpoint $endpoint '
f'--prompt {prompt} --auth_token {auth_token}); '
'echo "$s"; '
@@ -370,13 +370,13 @@ def generate_llm_test_command(prompt: str, expected_output: str) -> str:
'test-skyserve-llm',
[
f'sky serve up -n {name} --infra {generic_cloud} --gpus {accelerator} -y --secret AUTH_TOKEN={auth_token} tests/skyserve/llm/service.yaml',
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
*[
generate_llm_test_command(prompt, output)
for prompt, output in prompt2output.items()
],
],
- _TEARDOWN_SERVICE.format(name=name),
+ TEARDOWN_SERVICE.format(name=name),
env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV,
timeout=40 * 60,
)
@@ -394,17 +394,17 @@ def test_skyserve_spot_recovery():
[
smoke_tests_utils.launch_cluster_for_cloud_cmd('gcp', name),
f'sky serve up -n {name} {smoke_tests_utils.LOW_RESOURCE_ARG} -y tests/skyserve/spot/recovery.yaml',
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
'request_output=$(curl $endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"',
smoke_tests_utils.run_cloud_cmd_on_cluster(
name, smoke_tests_utils.terminate_gcp_replica(name, zone, 1)),
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
'request_output=$(curl $endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"',
],
f'{smoke_tests_utils.down_cluster_for_cloud_cmd(name)}; '
- f'{_TEARDOWN_SERVICE.format(name=name)}',
+ f'{TEARDOWN_SERVICE.format(name=name)}',
env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV,
timeout=20 * 60,
)
@@ -425,11 +425,11 @@ def test_skyserve_base_ondemand_fallback(generic_cloud: str):
'test-skyserve-base-ondemand-fallback',
[
f'sky serve up -n {name} --infra {generic_cloud} {smoke_tests_utils.LOW_RESOURCE_ARG} -y tests/skyserve/spot/base_ondemand_fallback.yaml',
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2),
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2),
_check_replica_in_status(name, [(1, True, 'READY'),
(1, False, 'READY')]),
],
- _TEARDOWN_SERVICE.format(name=name),
+ TEARDOWN_SERVICE.format(name=name),
env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV,
timeout=20 * 60,
)
@@ -466,7 +466,7 @@ def test_skyserve_dynamic_ondemand_fallback():
f'[ "$count" -eq 1 ] || [ "$count" -eq 2 ] || {{ echo "Expected 1 or 2 instances, got $count"; exit 1; }}',
# Wait until 2 spot instances are ready.
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2),
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2),
_check_replica_in_status(name, [(2, True, 'READY'),
(0, False, '')]),
smoke_tests_utils.run_cloud_cmd_on_cluster(
@@ -486,12 +486,12 @@ def test_skyserve_dynamic_ondemand_fallback():
]),
# Wait until 2 spot instances are ready.
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2),
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2),
_check_replica_in_status(name, [(2, True, 'READY'),
(0, False, '')]),
],
f'{smoke_tests_utils.down_cluster_for_cloud_cmd(name)}; '
- f'{_TEARDOWN_SERVICE.format(name=name)}',
+ f'{TEARDOWN_SERVICE.format(name=name)}',
env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV,
timeout=20 * 60,
)
@@ -534,12 +534,12 @@ def test_skyserve_user_bug_restart(generic_cloud: str):
increase_initial_delay_seconds(
f'sky serve update {name} --infra {generic_cloud} {resource_arg} -y tests/skyserve/auto_restart.yaml'
),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
'until curl --connect-timeout 10 --max-time 10 $endpoint | grep "Hi, SkyPilot here"; do sleep 1; done; sleep 2; '
+ _check_replica_in_status(name, [(1, False, 'READY'),
(1, False, 'FAILED')]),
],
- _TEARDOWN_SERVICE.format(name=name),
+ TEARDOWN_SERVICE.format(name=name),
env=env,
timeout=20 * 60,
)
@@ -557,15 +557,15 @@ def test_skyserve_load_balancer(generic_cloud: str):
'test-skyserve-load-balancer',
[
f'sky serve up -n {name} --infra {generic_cloud} {smoke_tests_utils.LOW_RESOURCE_ARG} -y tests/skyserve/load_balancer/service.yaml',
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3),
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
f'{_SERVE_STATUS_WAIT.format(name=name)}; '
f'{_get_replica_ip(name, 1)}; '
f'{_get_replica_ip(name, 2)}; {_get_replica_ip(name, 3)}; '
'python tests/skyserve/load_balancer/test_round_robin.py '
'--endpoint $endpoint --replica-num 3 --replica-ips $ip1 $ip2 $ip3',
],
- _TEARDOWN_SERVICE.format(name=name),
+ TEARDOWN_SERVICE.format(name=name),
timeout=20 * 60,
env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV,
)
@@ -586,8 +586,8 @@ def test_skyserve_auto_restart():
# TODO(tian): we can dynamically generate YAML from template to
# avoid maintaining too many YAML files
f'sky serve up -n {name} -y {smoke_tests_utils.LOW_RESOURCE_ARG} tests/skyserve/auto_restart.yaml',
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
'request_output=$(curl $endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"',
# sleep for 20 seconds (initial delay) to make sure it will
# be restarted
@@ -601,19 +601,19 @@ def test_skyserve_auto_restart():
# queries takes a lot of time). Instead, we think continuous 3 min probe
# failure is not a temporary problem but indeed a failure.
'sleep 180',
- # We cannot use _SERVE_WAIT_UNTIL_READY; there will be a intermediate time
+ # We cannot use SERVE_WAIT_UNTIL_READY; there will be an intermediate time
# that the output of `sky serve status` shows FAILED and this status will
- # cause _SERVE_WAIT_UNTIL_READY to early quit.
+ # cause SERVE_WAIT_UNTIL_READY to quit early.
'(while true; do'
f' output=$(sky serve status {name});'
' echo "$output" | grep -q "1/1" && break;'
' sleep 10;'
f'done); sleep {serve.LB_CONTROLLER_SYNC_INTERVAL_SECONDS};',
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
'request_output=$(curl $endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"',
],
f'{smoke_tests_utils.down_cluster_for_cloud_cmd(name)}; '
- f'{_TEARDOWN_SERVICE.format(name=name)}',
+ f'{TEARDOWN_SERVICE.format(name=name)}',
env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV,
timeout=20 * 60,
)
@@ -632,8 +632,8 @@ def test_skyserve_cancel(generic_cloud: str):
'test-skyserve-cancel',
[
f'sky serve up -n {name} --infra {generic_cloud} {smoke_tests_utils.LOW_RESOURCE_ARG} -y tests/skyserve/cancel/cancel.yaml',
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; python3 '
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; python3 '
'tests/skyserve/cancel/send_cancel_request.py '
'--endpoint $endpoint | grep "Request was cancelled"',
f's=$(sky serve logs {name} 1 --no-follow); '
@@ -642,7 +642,7 @@ def test_skyserve_cancel(generic_cloud: str):
f's=$(sky serve logs {name} 1 --no-follow); done; '
'echo "$s"; echo "$s" | grep "Client disconnected, stopping computation"',
],
- _TEARDOWN_SERVICE.format(name=name),
+ TEARDOWN_SERVICE.format(name=name),
env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV,
timeout=20 * 60,
)
@@ -662,12 +662,12 @@ def test_skyserve_streaming(generic_cloud: str):
'test-skyserve-streaming',
[
f'sky serve up -n {name} --infra {generic_cloud} {resource_arg} -y tests/skyserve/streaming/streaming.yaml',
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
'python3 tests/skyserve/streaming/send_streaming_request.py '
'--endpoint $endpoint | grep "Streaming test passed"',
],
- _TEARDOWN_SERVICE.format(name=name),
+ TEARDOWN_SERVICE.format(name=name),
env=env,
timeout=20 * 60,
)
@@ -693,7 +693,7 @@ def test_skyserve_readiness_timeout_fail(generic_cloud: str):
'sleep 60',
f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s" | grep "{name}" | grep "FAILED_INITIAL_DELAY" | wc -l | grep 1;'
],
- _TEARDOWN_SERVICE.format(name=name),
+ TEARDOWN_SERVICE.format(name=name),
timeout=20 * 60,
env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV,
)
@@ -711,11 +711,11 @@ def test_skyserve_large_readiness_timeout(generic_cloud: str):
'test-skyserve-large-readiness-timeout',
[
f'sky serve up -n {name} --infra {generic_cloud} {smoke_tests_utils.LOW_RESOURCE_ARG} -y tests/skyserve/readiness_timeout/task_large_timeout.yaml',
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
'request_output=$(curl $endpoint); echo "$request_output"; echo "$request_output" | grep "Hi, SkyPilot here"',
],
- _TEARDOWN_SERVICE.format(name=name),
+ TEARDOWN_SERVICE.format(name=name),
env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV,
timeout=20 * 60,
)
@@ -740,12 +740,12 @@ def test_skyserve_update(generic_cloud: str):
'test-skyserve-update',
[
f'sky serve up -n {name} --infra {generic_cloud} {resource_arg} -y tests/skyserve/update/old.yaml',
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl $endpoint | grep "Hi, SkyPilot here"',
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2),
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; curl $endpoint | grep "Hi, SkyPilot here"',
f'sky serve update {name} --infra {generic_cloud} {resource_arg} --mode blue_green -y tests/skyserve/update/new.yaml',
# sleep before update is registered.
'sleep 20',
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
'until curl $endpoint | grep "Hi, new SkyPilot here!"; do sleep 2; done;'
# Make sure the traffic is not mixed
'curl $endpoint | grep "Hi, new SkyPilot here"',
@@ -755,7 +755,7 @@ def test_skyserve_update(generic_cloud: str):
timeout_seconds=replica_check_timeout_seconds) +
_check_service_version(name, "2")),
],
- _TEARDOWN_SERVICE.format(name=name),
+ TEARDOWN_SERVICE.format(name=name),
timeout=20 * 60,
env=env,
)
@@ -788,8 +788,8 @@ def test_skyserve_rolling_update(generic_cloud: str):
increase_initial_delay_seconds(
f'sky serve up -n {name} --infra {generic_cloud} {resource_arg} -y tests/skyserve/update/old.yaml'
),
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl $endpoint | grep "Hi, SkyPilot here"',
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2),
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; curl $endpoint | grep "Hi, SkyPilot here"',
increase_initial_delay_seconds(
f'sky serve update {name} --infra {generic_cloud} {resource_arg} -y tests/skyserve/update/new.yaml'
),
@@ -797,7 +797,7 @@ def test_skyserve_rolling_update(generic_cloud: str):
# with even id will sleep 120 seconds before being ready, so we
# should be able to get observe the period that the traffic is mixed
# across two versions.
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
'until curl $endpoint | grep "Hi, new SkyPilot here!"; do sleep 2; done; sleep 2; '
# The latest version should have one READY and the one of the older versions should be shutting down
f'{single_new_replica} {_check_service_version(name, "1,2")} '
@@ -820,7 +820,7 @@ def test_skyserve_rolling_update(generic_cloud: str):
' echo "$result2" | grep "Hi, SkyPilot here" || exit 1; '
'fi',
],
- _TEARDOWN_SERVICE.format(name=name),
+ TEARDOWN_SERVICE.format(name=name),
timeout=20 * 60,
env=env,
)
@@ -840,8 +840,8 @@ def test_skyserve_fast_update(generic_cloud: str):
'test-skyserve-fast-update',
[
f'sky serve up -n {name} -y {smoke_tests_utils.LOW_RESOURCE_ARG} --infra {generic_cloud} tests/skyserve/update/bump_version_before.yaml',
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl $endpoint | grep "Hi, SkyPilot here"',
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2),
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; curl $endpoint | grep "Hi, SkyPilot here"',
f'sky serve update {name} --infra {generic_cloud} {smoke_tests_utils.LOW_RESOURCE_ARG} --mode blue_green -y tests/skyserve/update/bump_version_after.yaml',
# sleep to wait for update to be registered.
'sleep 40',
@@ -852,9 +852,9 @@ def test_skyserve_fast_update(generic_cloud: str):
(1, False, _SERVICE_LAUNCHING_STATUS_REGEX)]) +
# Fast update will directly have the latest version ready.
_check_service_version(name, "2")),
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3) +
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=3) +
_check_service_version(name, "2"),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl $endpoint | grep "Hi, SkyPilot here"',
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; curl $endpoint | grep "Hi, SkyPilot here"',
# Test rolling update
f'sky serve update {name} --infra {generic_cloud} {smoke_tests_utils.LOW_RESOURCE_ARG} -y tests/skyserve/update/bump_version_before.yaml',
# sleep to wait for update to be registered.
@@ -862,11 +862,11 @@ def test_skyserve_fast_update(generic_cloud: str):
# 2 on-demand (ready) + 1 on-demand (shutting down).
_check_replica_in_status(name, [(2, False, 'READY'),
(1, False, 'SHUTTING_DOWN')]),
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) +
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) +
_check_service_version(name, "3"),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; curl $endpoint | grep "Hi, SkyPilot here"',
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; curl $endpoint | grep "Hi, SkyPilot here"',
],
- _TEARDOWN_SERVICE.format(name=name),
+ TEARDOWN_SERVICE.format(name=name),
timeout=30 * 60,
env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV,
)
@@ -890,9 +890,9 @@ def test_skyserve_update_autoscale(generic_cloud: str):
increase_initial_delay_seconds(
f'sky serve up -n {name} --infra {generic_cloud} {resource_arg} -y tests/skyserve/update/num_min_two.yaml'
),
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) +
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) +
_check_service_version(name, "1"),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
'curl $endpoint | grep "Hi, SkyPilot here"',
increase_initial_delay_seconds(
f'sky serve update {name} --infra {generic_cloud} {resource_arg} --mode blue_green -y tests/skyserve/update/num_min_one.yaml'
@@ -900,9 +900,9 @@ def test_skyserve_update_autoscale(generic_cloud: str):
# sleep before update is registered.
'sleep 20',
# Timeout will be triggered when update fails.
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1) +
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1) +
_check_service_version(name, "2"),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
'curl $endpoint | grep "Hi, SkyPilot here!"',
# Rolling Update
increase_initial_delay_seconds(
@@ -911,12 +911,12 @@ def test_skyserve_update_autoscale(generic_cloud: str):
# sleep before update is registered.
'sleep 20',
# Timeout will be triggered when update fails.
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) +
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) +
_check_service_version(name, "3"),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
'curl $endpoint | grep "Hi, SkyPilot here!"',
],
- _TEARDOWN_SERVICE.format(name=name),
+ TEARDOWN_SERVICE.format(name=name),
timeout=30 * 60,
env=env,
)
@@ -973,9 +973,9 @@ def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str):
f'test-skyserve-new-autoscaler-update-{mode}',
[
f'sky serve up -n {name} --infra {generic_cloud} {smoke_tests_utils.LOW_RESOURCE_ARG} -y tests/skyserve/update/new_autoscaler_before.yaml',
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) +
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=2) +
_check_service_version(name, "1"),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
's=$(curl $endpoint); echo "$s"; echo "$s" | grep "Hi, SkyPilot here"',
f'sky serve update {name} --infra {generic_cloud} {smoke_tests_utils.LOW_RESOURCE_ARG} --mode {mode} -y tests/skyserve/update/new_autoscaler_after.yaml',
# Wait for update to be registered
@@ -987,13 +987,13 @@ def test_skyserve_new_autoscaler_update(mode: str, generic_cloud: str):
(2, False, TWO_OLD_ON_DEMAND_INSTANCES_STATUS_AFTER_AUTOSCALE)
]),
*update_check,
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=5),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=5),
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
'curl $endpoint | grep "Hi, SkyPilot here"',
_check_replica_in_status(name, [(4, True, 'READY'),
(1, False, 'READY')]),
],
- _TEARDOWN_SERVICE.format(name=name),
+ TEARDOWN_SERVICE.format(name=name),
timeout=20 * 60,
env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV,
)
@@ -1054,7 +1054,7 @@ def test_skyserve_failures(generic_cloud: str):
]),
# TODO(zhwu): add test for FAILED_PROVISION
],
- _TEARDOWN_SERVICE.format(name=name),
+ TEARDOWN_SERVICE.format(name=name),
timeout=20 * 60,
env=env,
)
@@ -1080,21 +1080,21 @@ def test_skyserve_https(generic_cloud: str):
[
f'sky serve up -n {name} {smoke_tests_utils.LOW_RESOURCE_ARG} --infra {generic_cloud} -y tests/skyserve/https/service.yaml '
f'--env TLS_KEYFILE_ENV_VAR={keyfile} --secret TLS_CERTFILE_ENV_VAR={certfile}',
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
'output=$(curl $endpoint -k); echo $output; '
'echo $output | grep "Hi, SkyPilot here"',
# Self signed certificate should fail without -k.
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
'output=$(curl $endpoint 2>&1); echo $output; '
'echo $output | grep -E "self[ -]signed certificate"',
# curl with wrong schema (http) should fail.
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
'http_endpoint="${endpoint/https:/http:}"; '
'output=$(curl $http_endpoint 2>&1); echo $output; '
'echo $output | grep "Empty reply from server"',
],
- _TEARDOWN_SERVICE.format(name=name) + f'; rm -f {keyfile}',
+ TEARDOWN_SERVICE.format(name=name) + f'; rm -f {keyfile}',
timeout=20 * 60,
env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV,
)
@@ -1111,15 +1111,15 @@ def test_skyserve_multi_ports(generic_cloud: str):
'test-skyserve-multi-ports',
[
f'sky serve up -n {name} --infra {generic_cloud} {smoke_tests_utils.LOW_RESOURCE_ARG} -y tests/skyserve/multi_ports.yaml',
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
'curl $replica_endpoint | grep "Hi, SkyPilot here"; '
f'export replica_endpoint=$(sky serve status {name} | tail -n 1 | awk \'{{print $4}}\'); '
'export replica_endpoint_alt=$(echo $endpoint | sed "s/8080/8081/"); '
'curl $replica_endpoint | grep "Hi, SkyPilot here"; '
'curl $replica_endpoint_alt | grep "Hi, SkyPilot here"',
],
- _TEARDOWN_SERVICE.format(name=name),
+ TEARDOWN_SERVICE.format(name=name),
timeout=20 * 60,
env=smoke_tests_utils.LOW_CONTROLLER_RESOURCE_ENV,
)
@@ -1172,20 +1172,20 @@ def test_skyserve_ha_kill_after_ready():
smoke_tests_utils.launch_cluster_for_cloud_cmd('kubernetes', name),
# Launch service and wait for ready
f'sky serve up -n {name} -y tests/skyserve/high_availability/service.yaml',
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
_check_replica_in_status(name, [(1, False, 'READY')]),
# Verify service is accessible
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
'curl $endpoint | grep "Hi, SkyPilot here"',
# Kill controller and verify recovery
smoke_tests_utils.kill_and_wait_controller(name, 'serve'),
# Verify service remains accessible after controller recovery
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
_check_replica_in_status(name, [(1, False, 'READY')]),
- # f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ # f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
# 'curl $endpoint | grep "Hi, SkyPilot here"',
],
- _TEARDOWN_SERVICE.format(name=name),
+ TEARDOWN_SERVICE.format(name=name),
timeout=30 * 60,
env={
skypilot_config.ENV_VAR_GLOBAL_CONFIG: 'tests/skyserve/high_availability/config.yaml'
@@ -1219,9 +1219,9 @@ def test_skyserve_ha_kill_during_provision():
# Kill controller during provisioning
smoke_tests_utils.kill_and_wait_controller(name, 'serve'),
# Verify service eventually becomes ready
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
_check_replica_in_status(name, [(1, False, 'READY')]),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
'curl $endpoint | grep "Hi, SkyPilot here"',
# Check there is only one cluster
f'instance_names=$(gcloud compute instances list --filter="name~{name}" --format="value(name)"); '
@@ -1229,7 +1229,7 @@ def test_skyserve_ha_kill_during_provision():
'num_instances=$(echo "$instance_names" | wc -l); '
'[ "$num_instances" -eq "1" ] || (echo "Expected 1 instance, got $num_instances"; exit 1)',
],
- _TEARDOWN_SERVICE.format(name=name),
+ TEARDOWN_SERVICE.format(name=name),
timeout=30 * 60,
env={
skypilot_config.ENV_VAR_GLOBAL_CONFIG: 'tests/skyserve/high_availability/config.yaml'
@@ -1260,9 +1260,9 @@ def test_skyserve_ha_kill_during_pending():
# Kill controller during pending
smoke_tests_utils.kill_and_wait_controller(name, 'serve'),
# Verify service eventually becomes ready and accessible
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
_check_replica_in_status(name, [(1, False, 'READY')]),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
'curl $endpoint | grep "Hi, SkyPilot here"',
# Check there are one cluster
f'instance_names=$(gcloud compute instances list --filter="(labels.ray-cluster-name:{replica_cluster_name})" --format="value(name)"); '
@@ -1270,7 +1270,7 @@ def test_skyserve_ha_kill_during_pending():
'num_instances=$(echo "$instance_names" | wc -l); '
'[ "$num_instances" -eq "1" ] || (echo "Expected 1 instance, got $num_instances"; exit 1)',
],
- _TEARDOWN_SERVICE.format(name=name),
+ TEARDOWN_SERVICE.format(name=name),
timeout=30 * 60,
env={
skypilot_config.ENV_VAR_GLOBAL_CONFIG: 'tests/skyserve/high_availability/config.yaml'
@@ -1296,8 +1296,8 @@ def test_skyserve_ha_kill_during_shutdown():
smoke_tests_utils.launch_cluster_for_cloud_cmd('kubernetes', name),
# Launch service and wait for ready
f'sky serve up -n {name} -y tests/skyserve/high_availability/service.yaml',
- _SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
- f'{_SERVE_ENDPOINT_WAIT.format(name=name)}; '
+ SERVE_WAIT_UNTIL_READY.format(name=name, replica_num=1),
+ f'{SERVE_ENDPOINT_WAIT.format(name=name)}; '
'curl $endpoint | grep "Hi, SkyPilot here"',
# Record instance names and initiate shutdown of the replica
f'instance_names=$(gcloud compute instances list --filter="(labels.ray-cluster-name:{replica_cluster_name})" --format="value(name)"); '
@@ -1325,7 +1325,7 @@ def test_skyserve_ha_kill_during_shutdown():
' echo "Waiting for instances to terminate..."; sleep 5; '
'done',
],
- _TEARDOWN_SERVICE.format(name=name),
+ TEARDOWN_SERVICE.format(name=name),
timeout=30 * 60,
env={
skypilot_config.ENV_VAR_GLOBAL_CONFIG: 'tests/skyserve/high_availability/config.yaml'
diff --git a/tests/test_smoke.py b/tests/test_smoke.py
index 6d79bb1356..80f5a8905d 100644
--- a/tests/test_smoke.py
+++ b/tests/test_smoke.py
@@ -30,6 +30,7 @@
from smoke_tests.test_basic import *
from smoke_tests.test_cli import *
from smoke_tests.test_cluster_job import *
+from smoke_tests.test_examples import *
from smoke_tests.test_images import *
from smoke_tests.test_logs import *
from smoke_tests.test_managed_job import *
@@ -38,4 +39,4 @@
from smoke_tests.test_region_and_zone import *
from smoke_tests.test_sky_serve import *
from smoke_tests.test_ssm import *
-from smoke_tests.test_workspaces import *
\ No newline at end of file
+from smoke_tests.test_workspaces import *