Skip to content

Commit 4b67bca

Browse files
authored
Fix GCP VM leak issue (skypilot-org#1102)
* tmp * const * sky start safeguard * address comments * style
1 parent c224819 commit 4b67bca

File tree

4 files changed

+50
-5
lines changed

4 files changed

+50
-5
lines changed

sky/backends/cloud_vm_ray_backend.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,9 @@ def _update_blocklist_on_gcp_error(self, region, zones, stdout, stderr):
510510
# when VM is preempted during creation. This seems to be
511511
# not documented by GCP.
512512
self._blocked_zones.add(zone.name)
513+
elif code in ['RESOURCE_NOT_READY']:
514+
# This code is returned when the VM is still STOPPING.
515+
self._blocked_zones.add(zone.name)
513516
elif code == 8:
514517
# Error code 8 means TPU resources is out of
515518
# capacity. Example:

sky/skylet/providers/gcp/node.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ class inheriting from ``GCPNode``. Those classes are essentially dicts
4848
# considerably - this probably could be smaller
4949
# TPU deletion uses MAX_POLLS
5050
MAX_POLLS_TPU = MAX_POLLS * 8
51+
# Stopping instances can take several minutes, so we increase the timeout
52+
MAX_POLLS_STOP = MAX_POLLS * 8
5153
POLL_INTERVAL = 5
5254

5355

@@ -134,6 +136,8 @@ class GCPNode(UserDict, metaclass=abc.ABCMeta):
134136

135137
NON_TERMINATED_STATUSES = None
136138
RUNNING_STATUSES = None
139+
STOPPED_STATUSES = None
140+
STOPPING_STATUSES = None
137141
STATUS_FIELD = None
138142

139143
def __init__(self, base_dict: dict, resource: "GCPResource", **kwargs) -> None:
@@ -147,6 +151,12 @@ def is_running(self) -> bool:
147151
def is_terminated(self) -> bool:
148152
return self.get(self.STATUS_FIELD) not in self.NON_TERMINATED_STATUSES
149153

154+
def is_stopped(self) -> bool:
155+
return self.get(self.STATUS_FIELD) in self.STOPPED_STATUSES
156+
157+
def is_stopping(self) -> bool:
158+
return self.get(self.STATUS_FIELD) in self.STOPPING_STATUSES
159+
150160
@property
151161
def id(self) -> str:
152162
return self["name"]
@@ -173,6 +183,8 @@ class GCPComputeNode(GCPNode):
173183
# https://cloud.google.com/compute/docs/instances/instance-life-cycle
174184
NON_TERMINATED_STATUSES = {"PROVISIONING", "STAGING", "RUNNING"}
175185
RUNNING_STATUSES = {"RUNNING"}
186+
STOPPED_STATUSES = {"TERMINATED"}
187+
STOPPING_STATUSES = {"STOPPING"}
176188
STATUS_FIELD = "status"
177189

178190
def get_labels(self) -> dict:
@@ -196,13 +208,14 @@ class GCPTPUNode(GCPNode):
196208

197209
NON_TERMINATED_STATUSES = {"CREATING", "STARTING", "RESTARTING", "READY"}
198210
RUNNING_STATUSES = {"READY"}
211+
STOPPED_STATUSES = {"STOPPED"}
212+
STOPPING_STATUSES = {"STOPPING"}
199213
STATUS_FIELD = "state"
200214

201215
# SKY: get status of TPU VM for status filtering
202216
def get_status(self) -> str:
203217
return self.get(self.STATUS_FIELD)
204218

205-
206219
def get_labels(self) -> dict:
207220
return self.get("labels", {})
208221

sky/skylet/providers/gcp/node_provider.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
GCPNodeType,
3030
INSTANCE_NAME_MAX_LEN,
3131
INSTANCE_NAME_UUID_LEN,
32+
MAX_POLLS_STOP,
33+
POLL_INTERVAL,
3234
)
3335

3436
logger = logging.getLogger(__name__)
@@ -190,10 +192,12 @@ def create_node(self, base_config: dict, tags: dict, count: int) -> None:
190192
if TAG_RAY_USER_NODE_TYPE in labels:
191193
filters[TAG_RAY_USER_NODE_TYPE] = labels[TAG_RAY_USER_NODE_TYPE]
192194
# SKY: "TERMINATED" for compute VM, "STOPPED" for TPU VM
195+
# "STOPPING" means the VM is being stopped, which needs
196+
# to be included to avoid creating a new VM.
193197
if isinstance(resource, GCPCompute):
194-
STOPPED_STATUS = ["TERMINATED"]
198+
STOPPED_STATUS = ["TERMINATED", "STOPPING"]
195199
else:
196-
STOPPED_STATUS = ["STOPPED"]
200+
STOPPED_STATUS = ["STOPPED", "STOPPING"]
197201
reuse_nodes = resource._list_instances(
198202
filters, STOPPED_STATUS)[:count]
199203
reuse_node_ids = [n.id for n in reuse_nodes]
@@ -229,6 +233,26 @@ def terminate_node(self, node_id: str):
229233
),
230234
)
231235
result = resource.stop_instance(node_id=node_id)
236+
237+
# Check if the instance is actually stopped.
238+
# GCP does not fully stop an instance even after
239+
# the stop operation is finished.
240+
for _ in range(MAX_POLLS_STOP):
241+
instance = resource.get_instance(node_id=node_id)
242+
if instance.is_stopped():
243+
logger.info(f"Instance {node_id} is stopped.")
244+
break
245+
elif instance.is_stopping():
246+
time.sleep(POLL_INTERVAL)
247+
else:
248+
raise RuntimeError(f"Unexpected instance status."
249+
" Details: {instance}")
250+
251+
if instance.is_stopping():
252+
raise RuntimeError(f"Maximum number of polls: "
253+
f"{MAX_POLLS_STOP} reached. "
254+
f"Instance {node_id} is still in "
255+
"STOPPING status.")
232256
else:
233257
result = resource.delete_instance(
234258
node_id=node_id,

sky/utils/cli_utils/status_utils.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,6 @@ def show_local_status_table(local_clusters: List[str]):
181181
cluster_status['launched_at']))
182182
_get_region = (
183183
lambda clusters_status: clusters_status['handle'].launched_resources.region)
184-
_get_zone = (
185-
lambda clusters_status: clusters_status['handle'].launched_resources.zone)
186184
_get_status = (lambda cluster_status: cluster_status['status'].value)
187185
_get_duration = (lambda cluster_status: log_utils.readable_time_duration(
188186
cluster_status['launched_at']))
@@ -205,6 +203,13 @@ def _get_resources(cluster_status):
205203
return resources_str
206204

207205

206+
def _get_zone(cluster_status):
207+
zone_str = cluster_status['handle'].launched_resources.zone
208+
if zone_str is None:
209+
zone_str = '-'
210+
return zone_str
211+
212+
208213
def _get_autostop(cluster_status):
209214
autostop_str = '-'
210215
if cluster_status['autostop'] >= 0:

0 commit comments

Comments
 (0)