Skip to content

Commit c224819

Browse files
authored
Fixes ray dashboard hanging problem (skypilot-org#1088) (skypilot-org#1109)
* patch job_manager
* fix cloud API
1 parent fcfe289 commit c224819

File tree

4 files changed

+14
-7
lines changed

4 files changed

+14
-7
lines changed

sky/clouds/azure.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,10 +77,7 @@ def is_same_cloud(self, other):
7777
return isinstance(other, Azure)
7878

7979
@classmethod
80-
def get_default_instance_type(cls,
81-
accelerators: Optional[Dict[str, int]] = None
82-
) -> str:
83-
del accelerators
80+
def get_default_instance_type(cls) -> str:
8481
# 8 vCpus, 32 GB RAM. Prev-gen (as of 2021) general purpose.
8582
return 'Standard_D8_v4'
8683

sky/clouds/cloud.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -149,9 +149,7 @@ def get_accelerators_from_instance_type(
149149
raise NotImplementedError
150150

151151
@classmethod
152-
def get_default_instance_type(cls,
153-
accelerators: Optional[Dict[str, int]] = None
154-
) -> str:
152+
def get_default_instance_type(cls) -> str:
155153
raise NotImplementedError
156154

157155
@classmethod

sky/skylet/ray_patches/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,9 @@ def patch() -> None:
6666
from ray.dashboard.modules.job import cli
6767
_run_patch(cli.__file__, _to_absolute('cli.py.patch'))
6868

69+
from ray.dashboard.modules.job import job_manager
70+
_run_patch(job_manager.__file__, _to_absolute('job_manager.py.patch'))
71+
6972
from ray.autoscaler._private import autoscaler
7073
_run_patch(autoscaler.__file__, _to_absolute('autoscaler.py.patch'))
7174

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
0a1,4
2+
> # Adapted from https://github.com/ray-project/ray/blob/ray-1.13.0/dashboard/modules/job/job_manager.py
3+
> # Fixes a leak of the _monitor_job thread: `await job_supervisor.ping.remote()`
4+
> # does not raise an exception after the job_supervisor has exited, which causes the dashboard to hang.
5+
>
6+
334c338
7+
< await job_supervisor.ping.remote()
8+
---
9+
> ray.get(job_supervisor.ping.remote())

0 commit comments

Comments
 (0)