
Commit e9a699e

Make azure-start-stop use 1 node to speed up tests. (skypilot-org#619)
* Make azure-start-stop use 1 node to speed up tests.
* Add --num_nodes to both launch and exec. Tested:
  - sky launch --num_nodes=2 'echo $(hostname)' -c test
  - sky exec test 'echo $(hostname)' --num_nodes=2
  - sky exec test 'echo $(hostname)'
* Support test_azure_start_stop_two_nodes() under --runslow.
* Support passing a test name to tests/run_smoke_tests.sh.
* Make Task.num_nodes a property and validate it.
1 parent a966420 commit e9a699e

7 files changed, +163 −42 lines

examples/azure_start_stop.yaml

+4 −1

@@ -4,7 +4,10 @@ name: azure-start-stop
 resources:
   cloud: azure
 
-num_nodes: 2
+# Optimizing for smoke tests
+# 2 nodes: smoke tests ~37 mins
+# 1 node: smoke tests ~19 mins
+# num_nodes: 2
 
 # The setup command. Will be run under the working directory.
 setup: 'echo "azure-start-stop [setup]"'
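
Note: with num_nodes commented out here, the default smoke test provisions a single node. Two-node coverage is not lost; the slow test added in tests/test_smoke.py below passes the node count on the command line instead (command quoted from that test):

    sky launch --num_nodes=2 -y -c test-azure-start-stop examples/azure_start_stop.yaml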

sky/cli.py

+52 −15

@@ -500,6 +500,12 @@ def cli():
                     'resources and is used for scheduling the task. '
                     'Overrides the "accelerators" '
                     'config in the YAML if both are supplied.'))
+@click.option('--num_nodes',
+              required=False,
+              type=int,
+              help=('Number of nodes to launch and to execute the task on. '
+                    'Overrides the "num_nodes" config in the YAML if both are '
+                    'supplied.'))
 @click.option(
     '--use-spot/--no-use-spot',
     required=False,
@@ -523,10 +529,21 @@ def cli():
               default=False,
               required=False,
               help='Skip confirmation prompt.')
-def launch(entrypoint: str, cluster: Optional[str], dryrun: bool,
-           detach_run: bool, backend_name: str, workdir: Optional[str],
-           cloud: Optional[str], gpus: Optional[str], use_spot: Optional[bool],
-           name: Optional[str], disk_size: Optional[int], yes: bool):
+def launch(
+    entrypoint: str,
+    cluster: Optional[str],
+    dryrun: bool,
+    detach_run: bool,
+    backend_name: str,
+    workdir: Optional[str],
+    cloud: Optional[str],
+    gpus: Optional[str],
+    num_nodes: Optional[int],
+    use_spot: Optional[bool],
+    name: Optional[str],
+    disk_size: Optional[int],
+    yes: bool,
+):
     """Launch a task from a YAML or a command (rerun setup if cluster exists).
 
@@ -570,6 +587,8 @@ def launch(entrypoint: str, cluster: Optional[str], dryrun: bool,
             if disk_size is not None:
                 new_resources.disk_size = disk_size
             task.set_resources({new_resources})
+        if num_nodes is not None:
+            task.num_nodes = num_nodes
         if name is not None:
             task.name = name
 
@@ -623,21 +642,34 @@
     '--gpus',
     required=False,
     type=str,
-    help=('Task demands: Type and number of GPUs to use. Example values: '
+    help=('Task demand: Type and number of GPUs to use. Example values: '
          '"V100:8", "V100" (short for a count of 1), or "V100:0.5" '
          '(fractional counts are supported by the scheduling framework). '
          'This is used for scheduling the task, so it must fit the '
          'cluster\'s total resources. Overrides the "accelerators" '
          'config in the YAML if both are supplied.'))
+@click.option('--num_nodes',
+              required=False,
+              type=int,
+              help=('Task demand: Number of nodes to execute the task on. '
+                    'Overrides the "num_nodes" config in the YAML if both are '
+                    'supplied.'))
 @click.option('--name',
               '-n',
               required=False,
               type=str,
               help=('Task name. Overrides the "name" '
                     'config in the YAML if both are supplied.'))
 # pylint: disable=redefined-builtin
-def exec(cluster: str, entrypoint: str, detach_run: bool,
-         workdir: Optional[str], gpus: Optional[str], name: Optional[str]):
+def exec(
+    cluster: str,
+    entrypoint: str,
+    detach_run: bool,
+    workdir: Optional[str],
+    gpus: Optional[str],
+    num_nodes: Optional[int],
+    name: Optional[str],
+):
     """Execute a task or a command on a cluster (skip setup).
 
@@ -646,13 +678,15 @@ def exec(cluster: str, entrypoint: str, detach_run: bool,
     \b
     Execution and scheduling behavior:
     \b
-    - If ENTRYPOINT is a YAML, or if it is a command with `--gpus` specified:
-      it is treated as a proper task that will undergo job queue scheduling,
-      respecting its resource requirement. It can be executed on any node of
-      the cluster with enough resources.
-    - Otherwise (if ENTRYPOINT is a command and no `--gpus` specified), it is
-      treated as an inline command, to be executed only on the head node of the
-      cluster.
+    - If ENTRYPOINT is a YAML, or if it is a command with a resource demand
+      flag specified (`--gpus` or `--num_nodes`): it is treated as a proper
+      task that will undergo job queue scheduling, respecting its resource
+      requirement. It can be executed on any node of the cluster with enough
+      resources.
+    - Otherwise (if ENTRYPOINT is a command and no resource demand flag
+      specified), it is treated as an inline command, to be executed only on
+      the head node of the cluster. This is useful for monitoring commands
+      (e.g., gpustat, htop).
 
     In both cases, the commands are run under the task's workdir (if specified).
@@ -704,6 +738,7 @@ def exec(cluster: str, entrypoint: str, detach_run: bool,
         raise click.BadParameter(f'Cluster \'{cluster}\' not found. '
                                  'Use `sky launch` to provision first.')
     backend = backend_utils.get_backend_from_handle(handle)
+    resource_demand_specified = gpus is not None or num_nodes is not None
 
     with sky.Dag() as dag:
         if _check_yaml(entrypoint):
@@ -722,7 +757,7 @@ def exec(cluster: str, entrypoint: str, detach_run: bool,
         # Run inline commands directly on head node if the resources are
         # not set. User should take the responsibility to not overload
         # the cluster.
-        if gpus is None:
+        if not resource_demand_specified:
            if workdir is not None:
                backend.sync_workdir(handle, workdir)
            backend.run_on_head(
@@ -743,6 +778,8 @@ def exec(cluster: str, entrypoint: str, detach_run: bool,
             copied = copy.deepcopy(list(task.resources)[0])
             copied.accelerators = _parse_accelerator_options(gpus)
             task.set_resources({copied})
+        if num_nodes is not None:
+            task.num_nodes = num_nodes
         if name is not None:
             task.name = name
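
The commands listed under "Tested:" in the commit message illustrate the two exec paths described in the docstring above: with a resource demand flag (`--gpus` or `--num_nodes`) the command goes through job queue scheduling, while a bare command runs inline on the head node only:

    sky exec test 'echo $(hostname)' --num_nodes=2   # scheduled through the job queue with a 2-node demand
    sky exec test 'echo $(hostname)'                 # inline command, head node only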

sky/task.py

+15 −7

@@ -110,10 +110,9 @@ def __init__(
         self.storage_plans = {}
         self.setup = setup
         self.workdir = workdir
-        self.docker_image = docker_image if docker_image \
-            else 'gpuci/miniconda-cuda:11.4-runtime-ubuntu18.04'
-        self._explicit_num_nodes = num_nodes  # Used as a scheduling constraint.
-        self.num_nodes = 1 if num_nodes is None else num_nodes
+        self.docker_image = (docker_image if docker_image else
+                             'gpuci/miniconda-cuda:11.4-runtime-ubuntu18.04')
+        self.num_nodes = num_nodes
         self.inputs = None
         self.outputs = None
         self.estimated_inputs_size_gigabytes = None
@@ -323,9 +322,18 @@ def from_yaml(yaml_path):
         task.set_resources({resources})
         return task
 
-    def validate_config(self):
-        if self.num_nodes <= 0:
-            raise ValueError('Must set Task.num_nodes to >0.')
+    @property
+    def num_nodes(self) -> int:
+        return self._num_nodes
+
+    @num_nodes.setter
+    def num_nodes(self, num_nodes: Optional[int]) -> None:
+        if num_nodes is None:
+            num_nodes = 1
+        if not isinstance(num_nodes, int) or num_nodes <= 0:
+            raise ValueError(
+                f'num_nodes should be a positive int. Got: {num_nodes}')
+        self._num_nodes = num_nodes
 
     # E.g., 's3://bucket', 'gs://bucket', or None.
     def set_inputs(self, inputs, estimated_size_gigabytes):
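
A minimal sketch of the new property in use, following the pattern in tests/test_optimizer_dryruns.py; the sky.Dag() context and the error message come from this commit, and the snippet itself is illustrative rather than part of the diff:

import sky

with sky.Dag():
    task = sky.Task()
    task.num_nodes = 2        # a positive int is accepted and stored
    assert task.num_nodes == 2
    try:
        task.num_nodes = 2.2  # floats (and non-positive ints) are rejected by the setter
    except ValueError as e:
        print(e)              # num_nodes should be a positive int. Got: 2.2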

tests/conftest.py

+27

@@ -0,0 +1,27 @@
+import pytest
+
+# Usage: use
+# @pytest.mark.slow
+# to mark a test as slow and to skip by default.
+# https://docs.pytest.org/en/latest/example/simple.html#control-skipping-of-tests-according-to-command-line-option
+
+
+def pytest_addoption(parser):
+    parser.addoption('--runslow',
+                     action='store_true',
+                     default=False,
+                     help='run slow tests')
+
+
+def pytest_configure(config):
+    config.addinivalue_line('markers', 'slow: mark test as slow to run')
+
+
+def pytest_collection_modifyitems(config, items):
+    if config.getoption('--runslow'):
+        # --runslow given in cli: do not skip slow tests
+        return
+    skip_slow = pytest.mark.skip(reason='need --runslow option to run')
+    for item in items:
+        if 'slow' in item.keywords:
+            item.add_marker(skip_slow)
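
With this conftest, tests marked @pytest.mark.slow (such as test_azure_start_stop_two_nodes below) are skipped by default and only run when --runslow is passed, e.g. the invocation noted at the bottom of tests/run_smoke_tests.sh:

    pytest --runslow -s -n 16 -q --tb=short --disable-warnings tests/test_smoke.py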

tests/run_smoke_tests.sh

+21 −1

@@ -1,2 +1,22 @@
 #!/bin/bash
-time pytest -s -n 16 -q --tb=short --disable-warnings tests/test_smoke.py
+# Usage:
+#
+#   # Run everything
+#   bash tests/run_smoke_tests.sh
+#
+#   # Re-run a failed test
+#   bash tests/run_smoke_tests.sh test_azure_start_stop
+#
+
+test=${1:-""}
+if [ -z "$test" ]
+then
+    test_spec=tests/test_smoke.py
+else
+    test_spec=tests/test_smoke.py::"$test"
+fi
+
+pytest -s -n 16 -q --tb=short --disable-warnings "$test_spec"
+
+# To run all tests including the slow ones, add the --runslow flag:
+#   pytest --runslow -s -n 16 -q --tb=short --disable-warnings tests/test_smoke.py

tests/test_optimizer_dryruns.py

+9

@@ -174,3 +174,12 @@ def test_parse_accelerators_from_yaml():
     with pytest.raises(ValueError) as e:
         _test_parse_accelerators(spec, None)
     assert 'The "accelerators" field as a str ' in str(e.value)
+
+
+def test_invalid_num_nodes():
+    for invalid_value in (-1, 2.2, 1.0):
+        with pytest.raises(ValueError) as e:
+            with sky.Dag():
+                task = sky.Task()
+                task.num_nodes = invalid_value
+        assert 'num_nodes should be a positive int' in str(e.value)

tests/test_smoke.py

+35 −18

@@ -4,6 +4,7 @@
 from typing import List, Optional, Tuple, NamedTuple
 
 import colorama
+import pytest
 
 from sky.backends import backend_utils
 
@@ -33,16 +34,12 @@ def run_one_test(test: Test) -> Tuple[int, str, str]:
                                            delete=False)
     test.echo('Test started.')
     for command in test.commands:
-        # test.echo(f' {command}')
         proc = subprocess.Popen(
             command,
             stdout=log_file,
             stderr=subprocess.STDOUT,
             shell=True,
         )
-        message = (f'Test command exited with non-zero status: '
-                   f'{command!r}')
-        error_occurred = (f'{message}')
         try:
             proc.wait(timeout=test.timeout)
         except subprocess.TimeoutExpired as e:
@@ -56,7 +53,8 @@
 
     style = colorama.Style
     fore = colorama.Fore
-    outcome = f'{fore.RED}Failed{style.RESET_ALL}' if proc.returncode else f'{fore.GREEN}Passed{style.RESET_ALL}'
+    outcome = (f'{fore.RED}Failed{style.RESET_ALL}'
+               if proc.returncode else f'{fore.GREEN}Passed{style.RESET_ALL}')
     reason = f'\nReason: {command!r}' if proc.returncode else ''
     test.echo(f'{outcome}.'
               f'{reason}'
@@ -231,6 +229,25 @@ def test_distributed_tf():
     run_one_test(test)
 
 
+# ---------- Testing GCP start and stop instances ----------
+def test_gcp_start_stop():
+    test = Test(
+        'gcp-start-stop',
+        [
+            'sky launch -y -c test-gcp-start-stop examples/gcp_start_stop.yaml',
+            'sky logs test-gcp-start-stop 1 --status',  # Ensure the job succeeded.
+            'sky exec test-gcp-start-stop examples/gcp_start_stop.yaml',
+            'sky logs test-gcp-start-stop 2 --status',  # Ensure the job succeeded.
+            'sky stop -y test-gcp-start-stop',
+            'sky start -y test-gcp-start-stop',
+            'sky exec test-gcp-start-stop examples/gcp_start_stop.yaml',
+            'sky logs test-gcp-start-stop 3 --status',  # Ensure the job succeeded.
+        ],
+        'sky down -y test-gcp-start-stop',
+    )
+    run_one_test(test)
+
+
 # ---------- Testing Azure start and stop instances ----------
 def test_azure_start_stop():
     test = Test(
@@ -245,25 +262,25 @@ def test_azure_start_stop():
             'sky logs test-azure-start-stop 2 --status',  # Ensure the job succeeded.
         ],
         'sky down -y test-azure-start-stop',
-        timeout=30 * 60,  # 30 mins  (it takes around ~23 mins)
+        timeout=30 * 60,  # 30 mins
     )
     run_one_test(test)
 
 
-# ---------- Testing GCP start and stop instances ----------
-def test_gcp_start_stop():
+@pytest.mark.slow
+def test_azure_start_stop_two_nodes():
     test = Test(
-        'gcp-start-stop',
+        'azure-start-stop-two-nodes',
         [
-            'sky launch -y -c test-gcp-start-stop examples/gcp_start_stop.yaml',
-            'sky logs test-gcp-start-stop 1 --status',  # Ensure the job succeeded.
-            'sky exec test-gcp-start-stop examples/gcp_start_stop.yaml',
-            'sky logs test-gcp-start-stop 2 --status',  # Ensure the job succeeded.
-            'sky stop -y test-gcp-start-stop',
-            'sky start -y test-gcp-start-stop',
-            'sky exec test-gcp-start-stop examples/gcp_start_stop.yaml',
-            'sky logs test-gcp-start-stop 3 --status',  # Ensure the job succeeded.
+            'sky launch --num_nodes=2 -y -c test-azure-start-stop examples/azure_start_stop.yaml',
+            'sky exec --num_nodes=2 test-azure-start-stop examples/azure_start_stop.yaml',
+            'sky logs test-azure-start-stop 1 --status',  # Ensure the job succeeded.
+            'sky stop -y test-azure-start-stop',
+            'sky start -y test-azure-start-stop',
+            'sky exec --num_nodes=2 test-azure-start-stop examples/azure_start_stop.yaml',
+            'sky logs test-azure-start-stop 2 --status',  # Ensure the job succeeded.
         ],
-        'sky down -y test-gcp-start-stop',
+        'sky down -y test-azure-start-stop',
+        timeout=30 * 60,  # 30 mins  (it takes around ~23 mins)
     )
     run_one_test(test)
