Skip to content

Commit 76eed62

Browse files
authored
[Resources] Add cpus in resource specification (skypilot-org#1622)
1 parent a9c194a commit 76eed62

19 files changed

+510
-144
lines changed

docs/source/reference/yaml-spec.rst

+6
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,12 @@ Available fields:
4747
# Format: <name>:<count> (or simply <name>, short for a count of 1).
4848
accelerators: V100:4
4949
50+
# Number of vCPUs per node (optional).
51+
#
52+
# Format: <count> (exactly <count> vCPUs) or <count>+
53+
# (at least <count> vCPUs).
54+
cpus: 32
55+
5056
# Instance type to use (optional). If 'accelerators' is specified,
5157
# the corresponding instance type is automatically inferred.
5258
instance_type: p3.8xlarge

examples/example_app.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def make_application():
4343
sky.Resources(sky.AWS(), 'p3.2xlarge'), # 1 V100, EC2.
4444
sky.Resources(sky.AWS(), 'p3.8xlarge'), # 4 V100s, EC2.
4545
# Tuples mean all resources are required.
46-
sky.Resources(sky.GCP(), 'n1-standard-8', 'tpu-v3-8'),
46+
sky.Resources(sky.GCP(), 'n1-standard-8', accelerators='tpu-v3-8'),
4747
})
4848

4949
train_op.set_time_estimator(time_estimators.resnet50_estimate_runtime)
@@ -60,8 +60,8 @@ def make_application():
6060
infer_op.set_resources({
6161
sky.Resources(sky.AWS(), 'inf1.2xlarge'),
6262
sky.Resources(sky.AWS(), 'p3.2xlarge'),
63-
sky.Resources(sky.GCP(), 'n1-standard-4', 'T4'),
64-
sky.Resources(sky.GCP(), 'n1-standard-8', 'T4'),
63+
sky.Resources(sky.GCP(), 'n1-standard-4', accelerators='T4'),
64+
sky.Resources(sky.GCP(), 'n1-standard-8', accelerators='T4'),
6565
})
6666

6767
infer_op.set_time_estimator(

sky/cli.py

+59-18
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,13 @@ def _interactive_node_cli_command(cli_func):
174174
default=None,
175175
type=str,
176176
help='Instance type to use.')
177+
cpus = click.option(
178+
'--cpus',
179+
default=None,
180+
type=str,
181+
help=('Number of vCPUs each instance must have '
182+
'(e.g., ``--cpus=4`` (exactly 4) or ``--cpus=4+`` (at least 4)). '
183+
'This is used to automatically select the instance type.'))
177184
gpus = click.option('--gpus',
178185
default=None,
179186
type=str,
@@ -268,6 +275,7 @@ def _interactive_node_cli_command(cli_func):
268275
region_option,
269276
zone_option,
270277
instance_type_option,
278+
cpus,
271279
*([gpus] if cli_func.__name__ == 'gpunode' else []),
272280
*([tpus] if cli_func.__name__ == 'tpunode' else []),
273281
spot_option,
@@ -556,6 +564,7 @@ def _parse_override_params(cloud: Optional[str] = None,
556564
region: Optional[str] = None,
557565
zone: Optional[str] = None,
558566
gpus: Optional[str] = None,
567+
cpus: Optional[str] = None,
559568
instance_type: Optional[str] = None,
560569
use_spot: Optional[bool] = None,
561570
image_id: Optional[str] = None,
@@ -582,6 +591,11 @@ def _parse_override_params(cloud: Optional[str] = None,
582591
override_params['accelerators'] = None
583592
else:
584593
override_params['accelerators'] = gpus
594+
if cpus is not None:
595+
if cpus.lower() == 'none':
596+
override_params['cpus'] = None
597+
else:
598+
override_params['cpus'] = cpus
585599
if instance_type is not None:
586600
if instance_type.lower() == 'none':
587601
override_params['instance_type'] = None
@@ -908,6 +922,7 @@ def _make_task_from_entrypoint_with_overrides(
908922
region: Optional[str] = None,
909923
zone: Optional[str] = None,
910924
gpus: Optional[str] = None,
925+
cpus: Optional[str] = None,
911926
instance_type: Optional[str] = None,
912927
num_nodes: Optional[int] = None,
913928
use_spot: Optional[bool] = None,
@@ -949,6 +964,7 @@ def _make_task_from_entrypoint_with_overrides(
949964
region=region,
950965
zone=zone,
951966
gpus=gpus,
967+
cpus=cpus,
952968
instance_type=instance_type,
953969
use_spot=use_spot,
954970
image_id=image_id,
@@ -1090,6 +1106,13 @@ def cli():
10901106
default=False,
10911107
help='If used, runs locally inside a docker container.')
10921108
@_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS)
1109+
@click.option('--cpus',
1110+
default=None,
1111+
type=str,
1112+
required=False,
1113+
help=('Number of vCPUs each instance must have (e.g., '
1114+
'``--cpus=4`` (exactly 4) or ``--cpus=4+`` (at least 4)). '
1115+
'This is used to automatically select the instance type.'))
10931116
@click.option('--disk-size',
10941117
default=None,
10951118
type=int,
@@ -1154,6 +1177,7 @@ def launch(
11541177
region: Optional[str],
11551178
zone: Optional[str],
11561179
gpus: Optional[str],
1180+
cpus: Optional[str],
11571181
instance_type: Optional[str],
11581182
num_nodes: Optional[int],
11591183
use_spot: Optional[bool],
@@ -1198,6 +1222,7 @@ def launch(
11981222
region=region,
11991223
zone=zone,
12001224
gpus=gpus,
1225+
cpus=cpus,
12011226
instance_type=instance_type,
12021227
num_nodes=num_nodes,
12031228
use_spot=use_spot,
@@ -1343,6 +1368,7 @@ def exec(
13431368
region=region,
13441369
zone=zone,
13451370
gpus=gpus,
1371+
cpus=None,
13461372
instance_type=instance_type,
13471373
use_spot=use_spot,
13481374
image_id=image_id,
@@ -2414,11 +2440,11 @@ def _down_or_stop(name: str):
24142440
# pylint: disable=redefined-outer-name
24152441
def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
24162442
cloud: Optional[str], region: Optional[str], zone: Optional[str],
2417-
instance_type: Optional[str], gpus: Optional[str],
2418-
use_spot: Optional[bool], screen: Optional[bool],
2419-
tmux: Optional[bool], disk_size: Optional[int],
2420-
idle_minutes_to_autostop: Optional[int], down: bool,
2421-
retry_until_up: bool):
2443+
instance_type: Optional[str], cpus: Optional[str],
2444+
gpus: Optional[str], use_spot: Optional[bool],
2445+
screen: Optional[bool], tmux: Optional[bool],
2446+
disk_size: Optional[int], idle_minutes_to_autostop: Optional[int],
2447+
down: bool, retry_until_up: bool):
24222448
"""Launch or attach to an interactive GPU node.
24232449
24242450
Examples:
@@ -2457,7 +2483,8 @@ def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
24572483

24582484
user_requested_resources = not (cloud is None and region is None and
24592485
zone is None and instance_type is None and
2460-
gpus is None and use_spot is None)
2486+
cpus is None and gpus is None and
2487+
use_spot is None)
24612488
default_resources = _INTERACTIVE_NODE_DEFAULT_RESOURCES['gpunode']
24622489
cloud_provider = clouds.CLOUD_REGISTRY.from_str(cloud)
24632490
if gpus is None and instance_type is None:
@@ -2470,6 +2497,7 @@ def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
24702497
region=region,
24712498
zone=zone,
24722499
instance_type=instance_type,
2500+
cpus=cpus,
24732501
accelerators=gpus,
24742502
use_spot=use_spot,
24752503
disk_size=disk_size)
@@ -2493,10 +2521,11 @@ def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
24932521
# pylint: disable=redefined-outer-name
24942522
def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
24952523
cloud: Optional[str], region: Optional[str], zone: Optional[str],
2496-
instance_type: Optional[str], use_spot: Optional[bool],
2497-
screen: Optional[bool], tmux: Optional[bool],
2498-
disk_size: Optional[int], idle_minutes_to_autostop: Optional[int],
2499-
down: bool, retry_until_up: bool):
2524+
instance_type: Optional[str], cpus: Optional[str],
2525+
use_spot: Optional[bool], screen: Optional[bool],
2526+
tmux: Optional[bool], disk_size: Optional[int],
2527+
idle_minutes_to_autostop: Optional[int], down: bool,
2528+
retry_until_up: bool):
25002529
"""Launch or attach to an interactive CPU node.
25012530
25022531
Examples:
@@ -2534,7 +2563,7 @@ def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
25342563

25352564
user_requested_resources = not (cloud is None and region is None and
25362565
zone is None and instance_type is None and
2537-
use_spot is None)
2566+
cpus is None and use_spot is None)
25382567
default_resources = _INTERACTIVE_NODE_DEFAULT_RESOURCES['cpunode']
25392568
cloud_provider = clouds.CLOUD_REGISTRY.from_str(cloud)
25402569
if instance_type is None:
@@ -2545,6 +2574,7 @@ def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
25452574
region=region,
25462575
zone=zone,
25472576
instance_type=instance_type,
2577+
cpus=cpus,
25482578
use_spot=use_spot,
25492579
disk_size=disk_size)
25502580

@@ -2567,11 +2597,12 @@ def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
25672597
# pylint: disable=redefined-outer-name
25682598
def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
25692599
region: Optional[str], zone: Optional[str],
2570-
instance_type: Optional[str], tpus: Optional[str],
2571-
use_spot: Optional[bool], tpu_vm: Optional[bool],
2572-
screen: Optional[bool], tmux: Optional[bool],
2573-
disk_size: Optional[int], idle_minutes_to_autostop: Optional[int],
2574-
down: bool, retry_until_up: bool):
2600+
instance_type: Optional[str], cpus: Optional[str],
2601+
tpus: Optional[str], use_spot: Optional[bool],
2602+
tpu_vm: Optional[bool], screen: Optional[bool],
2603+
tmux: Optional[bool], disk_size: Optional[int],
2604+
idle_minutes_to_autostop: Optional[int], down: bool,
2605+
retry_until_up: bool):
25752606
"""Launch or attach to an interactive TPU node.
25762607
25772608
Examples:
@@ -2608,8 +2639,8 @@ def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
26082639
name = _default_interactive_node_name('tpunode')
26092640

26102641
user_requested_resources = not (region is None and zone is None and
2611-
instance_type is None and tpus is None and
2612-
use_spot is None)
2642+
instance_type is None and cpus is None and
2643+
tpus is None and use_spot is None)
26132644
default_resources = _INTERACTIVE_NODE_DEFAULT_RESOURCES['tpunode']
26142645
accelerator_args = default_resources.accelerator_args
26152646
if tpu_vm:
@@ -2625,6 +2656,7 @@ def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
26252656
region=region,
26262657
zone=zone,
26272658
instance_type=instance_type,
2659+
cpus=cpus,
26282660
accelerators=tpus,
26292661
accelerator_args=accelerator_args,
26302662
use_spot=use_spot,
@@ -2967,6 +2999,13 @@ def spot():
29672999
**_get_shell_complete_args(_complete_file_name))
29683000
# TODO(zhwu): Add --dryrun option to test the launch command.
29693001
@_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS)
3002+
@click.option('--cpus',
3003+
default=None,
3004+
type=str,
3005+
required=False,
3006+
help=('Number of vCPUs each instance must have (e.g., '
3007+
'``--cpus=4`` (exactly 4) or ``--cpus=4+`` (at least 4)). '
3008+
'This is used to automatically select the instance type.'))
29703009
@click.option('--spot-recovery',
29713010
default=None,
29723011
type=str,
@@ -3009,6 +3048,7 @@ def spot_launch(
30093048
region: Optional[str],
30103049
zone: Optional[str],
30113050
gpus: Optional[str],
3051+
cpus: Optional[str],
30123052
instance_type: Optional[str],
30133053
num_nodes: Optional[int],
30143054
use_spot: Optional[bool],
@@ -3047,6 +3087,7 @@ def spot_launch(
30473087
region=region,
30483088
zone=zone,
30493089
gpus=gpus,
3090+
cpus=cpus,
30503091
instance_type=instance_type,
30513092
num_nodes=num_nodes,
30523093
use_spot=use_spot,

sky/clouds/aws.py

+14-9
Original file line numberDiff line numberDiff line change
@@ -274,10 +274,10 @@ def is_same_cloud(self, other: clouds.Cloud):
274274
return isinstance(other, AWS)
275275

276276
@classmethod
277-
def get_default_instance_type(cls) -> str:
278-
# General-purpose instance with 8 vCPUs and 32 GB RAM.
279-
# Intel Ice Lake 8375C
280-
return 'm6i.2xlarge'
277+
def get_default_instance_type(cls,
278+
cpus: Optional[str] = None) -> Optional[str]:
279+
return service_catalog.get_default_instance_type(cpus=cpus,
280+
clouds='aws')
281281

282282
# TODO: factor the following three methods, as they are the same logic
283283
# between Azure and AWS.
@@ -334,12 +334,11 @@ def make_deploy_resources_variables(
334334

335335
def get_feasible_launchable_resources(self,
336336
resources: 'resources_lib.Resources'):
337-
fuzzy_candidate_list: List[str] = []
338337
if resources.instance_type is not None:
339338
assert resources.is_launchable(), resources
340339
# Treat Resources(AWS, p3.2x, V100) as Resources(AWS, p3.2x).
341340
resources = resources.copy(accelerators=None)
342-
return ([resources], fuzzy_candidate_list)
341+
return ([resources], [])
343342

344343
def _make(instance_list):
345344
resource_list = []
@@ -350,16 +349,21 @@ def _make(instance_list):
350349
# Setting this to None as AWS doesn't separately bill /
351350
# attach the accelerators. Billed as part of the VM type.
352351
accelerators=None,
352+
cpus=None,
353353
)
354354
resource_list.append(r)
355355
return resource_list
356356

357357
# Filter on accelerators and, when specified, the number of vCPUs.
358358
accelerators = resources.accelerators
359359
if accelerators is None:
360-
# No requirements to filter, so just return a default VM type.
361-
return (_make([AWS.get_default_instance_type()]),
362-
fuzzy_candidate_list)
360+
# Return a default instance type with the given number of vCPUs.
361+
default_instance_type = AWS.get_default_instance_type(
362+
cpus=resources.cpus)
363+
if default_instance_type is None:
364+
return ([], [])
365+
else:
366+
return (_make([default_instance_type]), [])
363367

364368
assert len(accelerators) == 1, resources
365369
acc, acc_count = list(accelerators.items())[0]
@@ -368,6 +372,7 @@ def _make(instance_list):
368372
acc,
369373
acc_count,
370374
use_spot=resources.use_spot,
375+
cpus=resources.cpus,
371376
region=resources.region,
372377
zone=resources.zone,
373378
clouds='aws')

sky/clouds/azure.py

+16-10
Original file line numberDiff line numberDiff line change
@@ -94,10 +94,10 @@ def is_same_cloud(self, other):
9494
return isinstance(other, Azure)
9595

9696
@classmethod
97-
def get_default_instance_type(cls) -> str:
98-
# General-purpose instance with 8 vCPUs and 32 GB RAM.
99-
# Intel Ice Lake 8370C
100-
return 'Standard_D8_v5'
97+
def get_default_instance_type(cls,
98+
cpus: Optional[str] = None) -> Optional[str]:
99+
return service_catalog.get_default_instance_type(cpus=cpus,
100+
clouds='azure')
101101

102102
def _get_image_config(self, gen_version, instance_type):
103103
# az vm image list \
@@ -250,12 +250,11 @@ def get_feasible_launchable_resources(self, resources):
250250
# TODO(zhwu): our azure subscription offer ID does not support spot.
251251
# Need to support it.
252252
return ([], [])
253-
fuzzy_candidate_list = []
254253
if resources.instance_type is not None:
255254
assert resources.is_launchable(), resources
256255
# Treat Resources(Azure, <type>, <acc>) as Resources(Azure, <type>).
257256
resources = resources.copy(accelerators=None)
258-
return ([resources], fuzzy_candidate_list)
257+
return ([resources], [])
259258

260259
def _make(instance_list):
261260
resource_list = []
@@ -265,23 +264,30 @@ def _make(instance_list):
265264
instance_type=instance_type,
266265
# Setting this to None as Azure doesn't separately bill /
267266
# attach the accelerators. Billed as part of the VM type.
268-
accelerators=None)
267+
accelerators=None,
268+
cpus=None,
269+
)
269270
resource_list.append(r)
270271
return resource_list
271272

272273
# Filter on accelerators and, when specified, the number of vCPUs.
273274
accelerators = resources.accelerators
274275
if accelerators is None:
275-
# No requirements to filter, so just return a default VM type.
276-
return (_make([Azure.get_default_instance_type()]),
277-
fuzzy_candidate_list)
276+
# Return a default instance type with the given number of vCPUs.
277+
default_instance_type = Azure.get_default_instance_type(
278+
cpus=resources.cpus)
279+
if default_instance_type is None:
280+
return ([], [])
281+
else:
282+
return (_make([default_instance_type]), [])
278283

279284
assert len(accelerators) == 1, resources
280285
acc, acc_count = list(accelerators.items())[0]
281286
(instance_list, fuzzy_candidate_list
282287
) = service_catalog.get_instance_type_for_accelerator(
283288
acc,
284289
acc_count,
290+
cpus=resources.cpus,
285291
use_spot=resources.use_spot,
286292
region=resources.region,
287293
zone=resources.zone,

0 commit comments

Comments
 (0)