From 79991061e9d2a18b24ab24c94dc1c51f39d45552 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Wed, 26 Jul 2023 16:01:25 -0400 Subject: [PATCH] cluster status safeguards --- src/codeflare_sdk/cluster/cluster.py | 23 ++++++++++++++++++----- src/codeflare_sdk/cluster/model.py | 6 ++++-- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index c45c50e08..d698331e6 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -176,9 +176,15 @@ def status( ready = False status = CodeFlareClusterStatus.FAILED # should deleted be separate return status, ready # exit early, no need to check ray status - elif appwrapper.status in [AppWrapperStatus.PENDING]: + elif appwrapper.status in [ + AppWrapperStatus.PENDING, + AppWrapperStatus.QUEUEING, + ]: ready = False - status = CodeFlareClusterStatus.QUEUED + if appwrapper.status == AppWrapperStatus.PENDING: + status = CodeFlareClusterStatus.QUEUED + else: + status = CodeFlareClusterStatus.QUEUEING if print_to_console: pretty_print.print_app_wrappers_status([appwrapper]) return ( @@ -561,11 +567,18 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]: def _map_to_app_wrapper(aw) -> AppWrapper: + if "status" in aw and "canrun" in aw["status"]: + return AppWrapper( + name=aw["metadata"]["name"], + status=AppWrapperStatus(aw["status"]["state"].lower()), + can_run=aw["status"]["canrun"], + job_state=aw["status"]["queuejobstate"], + ) return AppWrapper( name=aw["metadata"]["name"], - status=AppWrapperStatus(aw["status"]["state"].lower()), - can_run=aw["status"]["canrun"], - job_state=aw["status"]["queuejobstate"], + status=AppWrapperStatus("queueing"), + can_run=False, + job_state="Still adding to queue", ) diff --git a/src/codeflare_sdk/cluster/model.py b/src/codeflare_sdk/cluster/model.py index 0f031995a..639cc7340 100644 --- a/src/codeflare_sdk/cluster/model.py +++ b/src/codeflare_sdk/cluster/model.py @@ -39,6 +39,7 @@ class AppWrapperStatus(Enum): Defines the possible reportable states of an AppWrapper. """ + QUEUEING = "queueing" PENDING = "pending" RUNNING = "running" FAILED = "failed" @@ -55,8 +56,9 @@ class CodeFlareClusterStatus(Enum): READY = 1 STARTING = 2 QUEUED = 3 - FAILED = 4 - UNKNOWN = 5 + QUEUEING = 4 + FAILED = 5 + UNKNOWN = 6 @dataclass