@@ -500,6 +500,12 @@ def cli():
500
500
'resources and is used for scheduling the task. '
501
501
'Overrides the "accelerators" '
502
502
'config in the YAML if both are supplied.' ))
503
+ @click .option ('--num_nodes' ,
504
+ required = False ,
505
+ type = int ,
506
+ help = ('Number of nodes to launch and to execute the task on. '
507
+ 'Overrides the "num_nodes" config in the YAML if both are '
508
+ 'supplied.' ))
503
509
@click .option (
504
510
'--use-spot/--no-use-spot' ,
505
511
required = False ,
@@ -523,10 +529,21 @@ def cli():
523
529
default = False ,
524
530
required = False ,
525
531
help = 'Skip confirmation prompt.' )
526
- def launch (entrypoint : str , cluster : Optional [str ], dryrun : bool ,
527
- detach_run : bool , backend_name : str , workdir : Optional [str ],
528
- cloud : Optional [str ], gpus : Optional [str ], use_spot : Optional [bool ],
529
- name : Optional [str ], disk_size : Optional [int ], yes : bool ):
532
+ def launch (
533
+ entrypoint : str ,
534
+ cluster : Optional [str ],
535
+ dryrun : bool ,
536
+ detach_run : bool ,
537
+ backend_name : str ,
538
+ workdir : Optional [str ],
539
+ cloud : Optional [str ],
540
+ gpus : Optional [str ],
541
+ num_nodes : Optional [int ],
542
+ use_spot : Optional [bool ],
543
+ name : Optional [str ],
544
+ disk_size : Optional [int ],
545
+ yes : bool ,
546
+ ):
530
547
"""Launch a task from a YAML or a command (rerun setup if cluster exists).
531
548
532
549
If ENTRYPOINT points to a valid YAML file, it is read in as the task
@@ -570,6 +587,8 @@ def launch(entrypoint: str, cluster: Optional[str], dryrun: bool,
570
587
if disk_size is not None :
571
588
new_resources .disk_size = disk_size
572
589
task .set_resources ({new_resources })
590
+ if num_nodes is not None :
591
+ task .num_nodes = num_nodes
573
592
if name is not None :
574
593
task .name = name
575
594
@@ -623,21 +642,34 @@ def launch(entrypoint: str, cluster: Optional[str], dryrun: bool,
623
642
'--gpus' ,
624
643
required = False ,
625
644
type = str ,
626
- help = ('Task demands : Type and number of GPUs to use. Example values: '
645
+ help = ('Task demand : Type and number of GPUs to use. Example values: '
627
646
'"V100:8", "V100" (short for a count of 1), or "V100:0.5" '
628
647
'(fractional counts are supported by the scheduling framework). '
629
648
'This is used for scheduling the task, so it must fit the '
630
649
'cluster\' s total resources. Overrides the "accelerators" '
631
650
'config in the YAML if both are supplied.' ))
651
+ @click .option ('--num_nodes' ,
652
+ required = False ,
653
+ type = int ,
654
+ help = ('Task demand: Number of nodes to execute the task on. '
655
+ 'Overrides the "num_nodes" config in the YAML if both are '
656
+ 'supplied.' ))
632
657
@click .option ('--name' ,
633
658
'-n' ,
634
659
required = False ,
635
660
type = str ,
636
661
help = ('Task name. Overrides the "name" '
637
662
'config in the YAML if both are supplied.' ))
638
663
# pylint: disable=redefined-builtin
639
- def exec (cluster : str , entrypoint : str , detach_run : bool ,
640
- workdir : Optional [str ], gpus : Optional [str ], name : Optional [str ]):
664
+ def exec (
665
+ cluster : str ,
666
+ entrypoint : str ,
667
+ detach_run : bool ,
668
+ workdir : Optional [str ],
669
+ gpus : Optional [str ],
670
+ num_nodes : Optional [int ],
671
+ name : Optional [str ],
672
+ ):
641
673
"""Execute a task or a command on a cluster (skip setup).
642
674
643
675
If ENTRYPOINT points to a valid YAML file, it is read in as the task
@@ -646,13 +678,15 @@ def exec(cluster: str, entrypoint: str, detach_run: bool,
646
678
\b
647
679
Execution and scheduling behavior:
648
680
\b
649
- - If ENTRYPOINT is a YAML, or if it is a command with `--gpus` specified:
650
- it is treated as a proper task that will undergo job queue scheduling,
651
- respecting its resource requirement. It can be executed on any node of th
652
- cluster with enough resources.
653
- - Otherwise (if ENTRYPOINT is a command and no `--gpus` specified), it is
654
- treated as an inline command, to be executed only on the head node of the
655
- cluster.
681
+ - If ENTRYPOINT is a YAML, or if it is a command with a resource demand
682
+ flag specified (`--gpus` or `--num_nodes`): it is treated as a proper
683
+ task that will undergo job queue scheduling, respecting its resource
684
+ requirement. It can be executed on any node of the cluster with enough
685
+ resources.
686
+ - Otherwise (if ENTRYPOINT is a command and no resource demand flag
687
+ specified), it is treated as an inline command, to be executed only on
688
+ the head node of the cluster. This is useful for monitoring commands
689
+ (e.g., gpustat, htop).
656
690
657
691
In both cases, the commands are run under the task's workdir (if specified).
658
692
@@ -704,6 +738,7 @@ def exec(cluster: str, entrypoint: str, detach_run: bool,
704
738
raise click .BadParameter (f'Cluster \' { cluster } \' not found. '
705
739
'Use `sky launch` to provision first.' )
706
740
backend = backend_utils .get_backend_from_handle (handle )
741
+ resource_demand_specified = gpus is not None or num_nodes is not None
707
742
708
743
with sky .Dag () as dag :
709
744
if _check_yaml (entrypoint ):
@@ -722,7 +757,7 @@ def exec(cluster: str, entrypoint: str, detach_run: bool,
722
757
# Run inline commands directly on head node if the resources are
723
758
# not set. User should take the responsibility to not overload
724
759
# the cluster.
725
- if gpus is None :
760
+ if not resource_demand_specified :
726
761
if workdir is not None :
727
762
backend .sync_workdir (handle , workdir )
728
763
backend .run_on_head (
@@ -743,6 +778,8 @@ def exec(cluster: str, entrypoint: str, detach_run: bool,
743
778
copied = copy .deepcopy (list (task .resources )[0 ])
744
779
copied .accelerators = _parse_accelerator_options (gpus )
745
780
task .set_resources ({copied })
781
+ if num_nodes is not None :
782
+ task .num_nodes = num_nodes
746
783
if name is not None :
747
784
task .name = name
748
785
0 commit comments