@@ -686,25 +686,32 @@ def _yield_region_zones(self, to_provision: resources_lib.Resources,
686
686
prev_resources = handle .launched_resources
687
687
if prev_resources is not None and cloud .is_same_cloud (
688
688
prev_resources .cloud ):
689
- if cloud .is_same_cloud (sky .GCP ()) or cloud .is_same_cloud (
690
- sky .AWS ()):
689
+ if cloud .is_same_cloud (clouds .GCP ()) or cloud .is_same_cloud (
690
+ clouds .AWS ()):
691
691
region = config ['provider' ]['region' ]
692
692
zones = config ['provider' ]['availability_zone' ]
693
- elif cloud .is_same_cloud (sky .Azure ()):
693
+ elif cloud .is_same_cloud (clouds .Azure ()):
694
694
region = config ['provider' ]['location' ]
695
695
zones = None
696
- elif cloud .is_same_cloud (sky .Local ()):
696
+ elif cloud .is_same_cloud (clouds .Local ()):
697
697
local_regions = clouds .Local .regions ()
698
698
region = local_regions [0 ].name
699
699
zones = None
700
700
else :
701
701
assert False , cloud
702
- if region != prev_resources .region :
703
- raise ValueError (
704
- f'Region mismatch. The region in '
705
- f'{ handle .cluster_yaml } '
706
- 'has been changed from '
707
- f'{ prev_resources .region } to { region } .' )
702
+ assert region == prev_resources .region , (
703
+ f'Region mismatch. The region in '
704
+ f'{ handle .cluster_yaml } '
705
+ 'has been changed from '
706
+ f'{ prev_resources .region } to { region } .' )
707
+ assert (zones is None or prev_resources .zone is None or
708
+ prev_resources .zone
709
+ in zones ), (f'{ prev_resources .zone } not found in '
710
+ f'zones of { handle .cluster_yaml } .' )
711
+ # Note that we don't overwrite the zone field in Ray YAML
712
+ # even if prev_resources.zone != zones.
713
+ # This is because Ray will consider the YAML hash changed
714
+ # and not reuse the existing cluster.
708
715
except FileNotFoundError :
709
716
# Happens if no previous cluster.yaml exists.
710
717
pass
@@ -798,10 +805,15 @@ def _yield_region_zones(self, to_provision: resources_lib.Resources,
798
805
accelerators = to_provision .accelerators ,
799
806
use_spot = to_provision .use_spot ,
800
807
):
801
- # Do not retry on region if it's not in the requested region .
808
+ # Only retry the requested region/zone, or all of them if none is specified.
802
809
if (to_provision .region is not None and
803
810
region .name != to_provision .region ):
804
811
continue
812
+ if to_provision .zone is not None :
813
+ zones_name = [zone .name for zone in zones ]
814
+ if to_provision .zone not in zones_name :
815
+ continue
816
+ zones = [clouds .Zone (name = to_provision .zone )]
805
817
yield (region , zones )
806
818
807
819
def _try_provision_tpu (self , to_provision : resources_lib .Resources ,
@@ -1410,12 +1422,12 @@ def _update_cluster_region(self):
1410
1422
config = common_utils .read_yaml (self .cluster_yaml )
1411
1423
provider = config ['provider' ]
1412
1424
cloud = self .launched_resources .cloud
1413
- if cloud .is_same_cloud (sky .Azure ()):
1425
+ if cloud .is_same_cloud (clouds .Azure ()):
1414
1426
region = provider ['location' ]
1415
- elif cloud .is_same_cloud (sky .GCP ()) or cloud .is_same_cloud (
1416
- sky .AWS ()):
1427
+ elif cloud .is_same_cloud (clouds .GCP ()) or cloud .is_same_cloud (
1428
+ clouds .AWS ()):
1417
1429
region = provider ['region' ]
1418
- elif cloud .is_same_cloud (sky .Local ()):
1430
+ elif cloud .is_same_cloud (clouds .Local ()):
1419
1431
# There is only 1 region for Local cluster, 'Local'.
1420
1432
local_regions = clouds .Local .regions ()
1421
1433
region = local_regions [0 ].name
@@ -1495,6 +1507,16 @@ def check_resources_fit_cluster(self, handle: ResourceHandle,
1495
1507
'Task requested resources in region '
1496
1508
f'{ task_resources .region !r} , but the existing cluster '
1497
1509
f'is in region { launched_resources .region !r} .' )
1510
+ if (task_resources .zone is not None and
1511
+ task_resources .zone != launched_resources .zone ):
1512
+ zone_str = (f'is in zone { launched_resources .zone !r} .'
1513
+ if launched_resources .zone is not None else
1514
+ 'does not have zone specified.' )
1515
+ with ux_utils .print_exception_no_traceback ():
1516
+ raise exceptions .ResourcesMismatchError (
1517
+ 'Task requested resources in zone '
1518
+ f'{ task_resources .zone !r} , but the existing cluster '
1519
+ f'{ zone_str } ' )
1498
1520
with ux_utils .print_exception_no_traceback ():
1499
1521
raise exceptions .ResourcesMismatchError (
1500
1522
'Requested resources do not match the existing cluster.\n '
@@ -1620,6 +1642,21 @@ def _provision(self,
1620
1642
# TPU.
1621
1643
tpu_create_script = config_dict .get ('tpu-create-script' ),
1622
1644
tpu_delete_script = config_dict .get ('tpu-delete-script' ))
1645
+
1646
+ # Get actual zone info and save it into handle
1647
+ get_zone_cmd = handle .launched_resources .cloud .get_zone_shell_cmd ()
1648
+ if get_zone_cmd is not None :
1649
+ # We leave the zone field as None for multi-node cases
1650
+ # if zone is not specified because head and worker nodes
1651
+ # can be launched in different zones.
1652
+ if (task .num_nodes == 1 or
1653
+ handle .launched_resources .zone is not None ):
1654
+ returncode , stdout , _ = self .run_on_head (
1655
+ handle , get_zone_cmd , require_outputs = True )
1656
+ # zone will be checked during Resources cls initialization.
1657
+ handle .launched_resources = handle .launched_resources .copy (
1658
+ zone = stdout .strip ())
1659
+
1623
1660
usage_lib .messages .usage .update_cluster_resources (
1624
1661
handle .launched_nodes , handle .launched_resources )
1625
1662
usage_lib .messages .usage .update_final_cluster_status (
0 commit comments